nijisakai committed on
Commit aaf6873
1 Parent(s): f06136b

Update app.py

Files changed (1)
  1. app.py +205 -182
app.py CHANGED
@@ -14,37 +14,15 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
 
-###################################################################
-# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
-###################################################################
-# The Hugging Face Hub repo IDs - change repo_id here; any already-trained model can be plugged in!
 repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay"]
-
-# If None, uses the latest ckpt in the repo
 ckpt_name = None
-
-# If None, uses "kmeans.pt" if it exists in the repo
 cluster_model_name = None
-
-# Set the default f0 type to use - use the one the model was trained on.
-# The default for so-vits-svc-fork is "dio".
-# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
 default_f0_method = "crepe"
-
-# The default ratio of cluster inference to SVC inference.
-# If cluster_model_name is not found in the repo, this is set to 0.
 default_cluster_infer_ratio = 0.5
-
-# Limit on the duration of audio at inference time. Increase it if you can.
-# In this parent app, we set the limit to 30 seconds with an env var.
-# If you didn't set the env var and you go OOM, try changing 9e9 to <=300 or so.
 duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
-###################################################################
 
-interfaces = []
+models = []
 for repo_id in repo_ids:
-    # Figure out the latest generator by taking the highest-numbered one,
-    # e.g. if the repo has G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth.
     if ckpt_name is None:
         latest_id = sorted(
             [
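
The sorted( expression above is cut off at the hunk boundary; the removed comment gives the intent, which is to pick the generator checkpoint with the highest step number. A minimal sketch of that idea, assuming the G_<step>.pth naming from the comment (latest_generator_name is a hypothetical helper, not code from this commit):

import re
from huggingface_hub import list_repo_files

def latest_generator_name(repo_id: str) -> str:
    # Collect the step number of every G_<step>.pth file and keep the largest,
    # e.g. G_0.pth, G_100.pth, G_200.pth -> G_200.pth.
    steps = sorted(
        int(m.group(1))
        for f in list_repo_files(repo_id)
        if (m := re.fullmatch(r"G_(\d+)\.pth", f))
    )
    return f"G_{steps[-1]}.pth"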
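The duration cap is also worth a note: duration_limit reads MAX_DURATION_SECONDS from the environment and falls back to 9e9 seconds (effectively unlimited); the removed comment says the parent app sets it to 30. A quick check of the parsing, with an assumed environment value:

import os
os.environ["MAX_DURATION_SECONDS"] = "30"  # assumed value, per the removed comment
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
assert duration_limit == 30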
@@ -70,172 +48,217 @@ for repo_id in repo_ids:
     speakers = list(hparams.spk.keys())
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
-    demucs_model = get_model(DEFAULT_MODEL)
-
-    def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
-        wav, sr = librosa.load(filename, mono=False, sr=sr)
-        wav = torch.tensor(wav)
-        ref = wav.mean(0)
-        wav = (wav - ref.mean()) / ref.std()
-        sources = apply_model(
-            model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
-        )[0]
-        sources = sources * ref.std() + ref.mean()
-        # We take just the vocals stem. I know the vocals for this model are at index -1.
-        # If using a different model, check model.sources.index('vocals').
-        vocal_wav = sources[-1]
-        # I did this because it's the same normalization the so-vits model required.
-        vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
-        vocal_wav = vocal_wav.numpy()
-        vocal_wav = librosa.to_mono(vocal_wav)
-        vocal_wav = vocal_wav.T
-        instrumental_wav = sources[:-1].sum(0).numpy().T
-        return vocal_wav, instrumental_wav
-
-    def download_youtube_clip(
-        video_identifier,
-        start_time,
-        end_time,
-        output_filename,
-        num_attempts=5,
-        url_base="https://www.youtube.com/watch?v=",
-        quiet=False,
-        force=False,
-    ):
-        output_path = Path(output_filename)
-        if output_path.exists():
-            if not force:
-                return output_path
-            else:
-                output_path.unlink()
-
-        quiet = "--quiet --no-warnings" if quiet else ""
-        command = f"""
-            yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}" # noqa: E501
-        """.strip()
-
-        attempts = 0
-        while True:
-            try:
-                _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
-            except subprocess.CalledProcessError:
-                attempts += 1
-                if attempts == num_attempts:
-                    return None
-            else:
-                break
-
-        if output_path.exists():
+    models.append(model)
+
+demucs_model = get_model(DEFAULT_MODEL)
+
+def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
+    wav, sr = librosa.load(filename, mono=False, sr=sr)
+    wav = torch.tensor(wav)
+    ref = wav.mean(0)
+    wav = (wav - ref.mean()) / ref.std()
+    sources = apply_model(
+        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
+    )[0]
+    sources = sources * ref.std() + ref.mean()
+    vocal_wav = sources[-1]
+    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
+    vocal_wav = vocal_wav.numpy()
+    vocal_wav = librosa.to_mono(vocal_wav)
+    vocal_wav = vocal_wav.T
+    instrumental_wav = sources[:-1].sum(0).numpy().T
+    return vocal_wav, instrumental_wav
+
+def download_youtube_clip(
+    video_identifier,
+    start_time,
+    end_time,
+    output_filename,
+    num_attempts=5,
+    url_base="https://www.youtube.com/watch?v=",
+    quiet=False,
+    force=False,
+):
+    output_path = Path(output_filename)
+    if output_path.exists():
+        if not force:
             return output_path
         else:
-            return None
-
-    def predict(
-        speaker,
-        audio,
-        transpose: int = 0,
-        auto_predict_f0: bool = False,
-        cluster_infer_ratio: float = 0,
-        noise_scale: float = 0.4,
-        f0_method: str = "crepe",
-        db_thresh: int = -40,
-        pad_seconds: float = 0.5,
-        chunk_seconds: float = 0.5,
-        absolute_thresh: bool = False,
-    ):
-        audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
-        audio = model.infer_silence(
-            audio.astype(np.float32),
-            speaker=speaker,
-            transpose=transpose,
-            auto_predict_f0=auto_predict_f0,
-            cluster_infer_ratio=cluster_infer_ratio,
-            noise_scale=noise_scale,
-            f0_method=f0_method,
-            db_thresh=db_thresh,
-            pad_seconds=pad_seconds,
-            chunk_seconds=chunk_seconds,
-            absolute_thresh=absolute_thresh,
-        )
-        return model.target_sample, audio
-
-    def predict_song_from_yt(
+            output_path.unlink()
+
+    quiet = "--quiet --no-warnings" if quiet else ""
+    command = f"""
+        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}" # noqa: E501
+    """.strip()
+
+    attempts = 0
+    while True:
+        try:
+            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError:
+            attempts += 1
+            if attempts == num_attempts:
+                return None
+        else:
+            break
+
+    if output_path.exists():
+        return output_path
+    else:
+        return None
+
+def predict(
+    speaker,
+    audio,
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "crepe",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+    audio = model.infer_silence(
+        audio.astype(np.float32),
+        speaker=speaker,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        absolute_thresh=absolute_thresh,
+    )
+    return model.target_sample, audio
+
+def predict_song_from_yt(
+    ytid_or_url,
+    start,
+    end,
+    speaker=speakers[0],
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "dio",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    end = min(start + duration_limit, end)
+    original_track_filepath = download_youtube_clip(
         ytid_or_url,
         start,
         end,
-        speaker=speakers[0],
-        transpose: int = 0,
-        auto_predict_f0: bool = False,
-        cluster_infer_ratio: float = 0,
-        noise_scale: float = 0.4,
-        f0_method: str = "dio",
-        db_thresh: int = -40,
-        pad_seconds: float = 0.5,
-        chunk_seconds: float = 0.5,
-        absolute_thresh: bool = False,
-    ):
-        end = min(start + duration_limit, end)
-        original_track_filepath = download_youtube_clip(
-            ytid_or_url,
-            start,
-            end,
-            "track.wav",
-            force=True,
-            url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
-        )
-        vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
-        if transpose != 0:
-            inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
-        cloned_vox = model.infer_silence(
-            vox_wav.astype(np.float32),
-            speaker=speaker,
-            transpose=transpose,
-            auto_predict_f0=auto_predict_f0,
-            cluster_infer_ratio=cluster_infer_ratio,
-            noise_scale=noise_scale,
-            f0_method=f0_method,
-            db_thresh=db_thresh,
-            pad_seconds=pad_seconds,
-            chunk_seconds=chunk_seconds,
-            absolute_thresh=absolute_thresh,
-        )
-        full_song = inst_wav + np.expand_dims(cloned_vox, 1)
-        return (model.target_sample, full_song), (model.target_sample, cloned_vox)
-
-    description = f"""
-    <center>💡 - 如何使用此程序:在页面上方选择“从B站视频上传”模块,填写视频网址和视频起止时间后,点击“submit”按键即可!您还可以点击页面最下方的示例快速预览效果</center>
-    """.strip()
+        "track.wav",
+        force=True,
+        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
+    )
+    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
+    if transpose != 0:
+        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
+    cloned_vox = model.infer_silence(
+        vox_wav.astype(np.float32),
+        speaker=speaker,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        absolute_thresh=absolute_thresh,
+    )
+    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
+    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
 
-    article = """
-    <p style='text-align: center'> 注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。
-    </p>
-    """.strip()
+description = f"""
+<center>💡 - 如何使用此程序:在页面上方选择“从B站视频上传”模块,填写视频网址和视频起止时间后,点击“submit”按键即可!您还可以点击页面最下方的示例快速预览效果</center>
+""".strip()
 
-    interface = gr.Interface(
-        predict,
-        inputs=[
-            gr.Dropdown(speakers, label="🎤AI歌手选择🎶"),
-            gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲"),
-            gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)"),
-            gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False),
-            gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)"),
-            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
-            gr.Dropdown(
-                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-                value=default_f0_method,
-                label="模型推理方法 (crepe推理效果最好)", visible=False
-            ),
-        ],
-        outputs="audio",
-        cache_examples=False,
-        title=f"🌊💕🎶 - 滔滔AI+音乐:可从B站直接上传素材,无需分离背景音 ({repo_id})",
-        description=description,
-        article=article,
-    )
-    interfaces.append(interface)
+article = """
+<p style='text-align: center'> 注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。
+</p>
+""".strip()
 
-# Combine the interfaces using a TabbedInterface
-interface = gr.TabbedInterface(interfaces, [f"Model {i+1}" for i in range(len(interfaces))])
+interface_mic = gr.Interface(
+    predict,
+    inputs=[
+        gr.Dropdown(speakers, label="🎤AI歌手选择🎶"),
+        gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs="audio",
+    cache_examples=False,
+    title="🌊💕🎶 - 滔滔AI+音乐:可从B站直接上传素材,无需分离背景音",
+    description=description,
+    article=article,
+)
+interface_file = gr.Interface(
+    predict,
+    inputs=[
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟孙燕姿🌟"),
+        gr.Audio(type="filepath", source="upload", label="请上传您想转换的歌曲 (仅人声部分)"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs="audio",
+    cache_examples=False,
+    title="🌊💕🎶 可从B站直接上传素材,无需分离背景音",
+    description=description,
+    article=article,
+)
+interface_yt = gr.Interface(
+    predict_song_from_yt,
+    inputs=[
+        gr.Textbox(
+            label="Bilibili网址", info="请填写含有您喜欢歌曲的Bilibili网址,可直接填写相应的BV号", value="https://www.bilibili.com/video/BV..."
+        ),
+        gr.Number(value=0, label="起始时间 (秒)"),
+        gr.Number(value=15, label="结束时间 (秒)"),
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟孙燕姿🌟"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs=[gr.Audio(label="AI歌手+伴奏🎵"), gr.Audio(label="AI歌手人声部分🎤")],
+    title="🌊💕🎶 - 可从B站直接上传素材,无需分离背景音",
+    description=description,
+    article=article,
+    cache_examples=False,
+)
+interface = gr.TabbedInterface(
+    [interface_yt, interface_mic, interface_file],
+    ["📺 - 从B站视频上传 ⭐推荐⭐", "🎙️ - 从麦克风上传", "🎵 - 从文件上传"],
+)
 
 if __name__ == "__main__":
-    interface.launch(show_error=True)
+    interface.launch(show_error=True)
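
The removed comments in extract_vocal_demucs note that the vocals stem of the default demucs model sits at index -1, and that other models should be handled via model.sources.index('vocals'). A hedged sketch of that more defensive lookup, assuming the usual demucs import paths (the snippet is illustrative, not code from this commit):

from demucs.pretrained import DEFAULT_MODEL, get_model

demucs_model = get_model(DEFAULT_MODEL)
# sources lists the stem names in output order, e.g. ["drums", "bass", "other", "vocals"].
vocal_idx = demucs_model.sources.index("vocals")
# With sources shaped (stems, channels, samples) as returned by apply_model(...)[0]:
# vocal_wav = sources[vocal_idx]
# instrumental_wav = sum(s for i, s in enumerate(sources) if i != vocal_idx)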
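download_youtube_clip doubles as the Bilibili fetcher: predict_song_from_yt passes url_base="" whenever the input already starts with "http", so full Bilibili URLs reach yt-dlp unchanged while bare IDs get the YouTube prefix. A hypothetical call mirroring the app's Bilibili path (the URL is the textbox placeholder, not a real video):

clip_path = download_youtube_clip(
    "https://www.bilibili.com/video/BV...",  # full URL, so no prefix is prepended
    0,             # start_time in seconds
    15,            # end_time in seconds
    "track.wav",
    force=True,    # re-download even if track.wav already exists
    url_base="",
)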
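The final mix in predict_song_from_yt leans on NumPy broadcasting: extract_vocal_demucs returns the instrumental as (samples, channels) while the converted vocal comes back mono as (samples,), so the vocal needs a trailing axis before the addition. A toy shape check with made-up sizes:

import numpy as np

inst_wav = np.zeros((441000, 2))   # stereo instrumental, (samples, channels)
cloned_vox = np.zeros(441000)      # mono converted vocal, (samples,)
full_song = inst_wav + np.expand_dims(cloned_vox, 1)  # (441000, 1) broadcasts across both channels
assert full_song.shape == (441000, 2)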