nijisakai committed on
Commit
f06136b
1 Parent(s): 1d3811a

Update app.py

Files changed (1)
  1. app.py +164 -423
app.py CHANGED
@@ -14,16 +14,11 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
 
-
 ###################################################################
 # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
 ###################################################################
-# The Hugging Face Hub repo IDs - Modify this list to include any pre-trained models you want!
-repo_ids = [
-    "nijisakai/sunyanzi",
-    "kevinwang676/jay",
-    # Add more repo IDs here...
-]
+# The Hugging Face Hub repo IDs - modify the repo_id here; you can swap in any already-trained model!
+repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay"]
 
 # If None, Uses latest ckpt in the repo
 ckpt_name = None
@@ -42,14 +37,14 @@ default_cluster_infer_ratio = 0.5
 
 # Limit on duration of audio at inference time. increase if you can
 # In this parent app, we set the limit with an env var to 30 seconds
 # If you didn't set env var + you go OOM try changing 9e9 to <=300ish
 duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
 ###################################################################
 
-# Helper function to download model and cluster model
-def download_models(repo_id):
-    global ckpt_name, cluster_model_name
-
+interfaces = []
+for repo_id in repo_ids:
+    # Figure out the latest generator by taking the highest-numbered one.
+    # Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
     if ckpt_name is None:
         latest_id = sorted(
             [
@@ -74,427 +69,173 @@ def download_models(repo_id):
     hparams = HParams(**json.loads(Path(config_path).read_text()))
     speakers = list(hparams.spk.keys())
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = Svc(
-        net_g_path=generator_path,
-        config_path=config_path,
-        device=device,
-        cluster_model_path=cluster_model_path,
-    )
+    model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
     demucs_model = get_model(DEFAULT_MODEL)
-    return model, demucs_model, speakers
-
-
-# Helper function to extract vocals using the demucs model
-def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
-    wav, sr = librosa.load(filename, mono=False, sr=sr)
-    wav = torch.tensor(wav)
-    ref = wav.mean(0)
-    wav = (wav - ref.mean()) / ref.std()
-    sources = apply_model(
-        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
-    )[0]
-    sources = sources * ref.std() + ref.mean()
-    # We take just the vocals stem. I know the vocals for this model are at index -1
-    # If using a different model, check model.sources.index('vocals')
-    vocal_wav = sources[-1]
-    # I did this because it's the same normalization the so-vits model required
-    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
-    vocal_wav = vocal_wav.numpy()
-    vocal_wav = librosa.to_mono(vocal_wav)
-    vocal_wav = vocal_wav.T
-    instrumental_wav = sources[:-1].sum(0).numpy().T
-    return vocal_wav, instrumental_wav
-
-
-def download_youtube_clip(
-    video_identifier,
-    start_time,
-    end_time,
-    output_filename,
-    num_attempts=5,
-    url_base="https://www.youtube.com/watch?v=",
-    quiet=False,
-    force=False,
-):
-    output_path = Path(output_filename)
-    if output_path.exists():
-        if not force:
-            return output_path
-        else:
-            output_path.unlink()
-
-    quiet = "--quiet --no-warnings" if quiet else ""
-    command = f"""
-        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
-    """.strip()
-
-    attempts = 0
-    while True:
-        try:
-            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError:
-            attempts += 1
-            if attempts == num_attempts:
-                return None
-        else:
-            break
-
-    if output_path.exists():
-        return output_path
-    else:
-        return None
-
-
-def predict(
-    speaker,
-    audio,
-    transpose: int = 0,
-    auto_predict_f0: bool = False,
-    cluster_infer_ratio: float = 0,
-    noise_scale: float = 0.4,
-    f0_method: str = "crepe",
-    db_thresh: int = -40,
-    pad_seconds: float = 0.5,
-    chunk_seconds: float = 0.5,
-    absolute_thresh: bool = False,
-):
-    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
-    audio = model.infer_silence(
-        audio.astype(np.float32),
-        speaker=speaker,
-        transpose=transpose,
-        auto_predict_f0=auto_predict_f0,
-        cluster_infer_ratio=cluster_infer_ratio,
-        noise_scale=noise_scale,
-        f0_method=f0_method,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
-        chunk_seconds=chunk_seconds,
-        absolute_thresh=absolute_thresh,
-    )
-    return model.target_sample, audio
-
-
-def predict_song_from_yt(
-    ytid_or_url,
-    start,
-    end,
-    speaker=speakers[0],
-    transpose: int = 0,
-    auto_predict_f0: bool = False,
-    cluster_infer_ratio: float = 0,
-    noise_scale: float = 0.4,
-    f0_method: str = "dio",
-    db_thresh: int = -40,
-    pad_seconds: float = 0.5,
-    chunk_seconds: float = 0.5,
-    absolute_thresh: bool = False,
-):
-    end = min(start + duration_limit, end)
-    original_track_filepath = download_youtube_clip(
-        ytid_or_url,
-        start,
-        end,
-        "track.wav",
-        force=True,
-        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
-    )
-    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
-    if transpose != 0:
-        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
-    cloned_vox = model.infer_silence(
-        vox_wav.astype(np.float32),
-        speaker=speaker,
-        transpose=transpose,
-        auto_predict_f0=auto_predict_f0,
-        cluster_infer_ratio=cluster_infer_ratio,
-        noise_scale=noise_scale,
-        f0_method=f0_method,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
-        chunk_seconds=chunk_seconds,
-        absolute_thresh=absolute_thresh,
-    )
-    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
-    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
-
-
-# Create a dictionary to store all models, demucs models, and speakers
-all_models = {}
-for repo_id in repo_ids:
-    model, demucs_model, speakers = download_models(repo_id)
-    all_models[repo_id] = {
-        "model": model,
-        "demucs_model": demucs_model,
-        "speakers": speakers,
-    }
-
-# Interface definition
-description = """
-# ... (existing code)
-# No changes made to this part of the code, so skipping it
-
-interface_yt = gr.Interface(
-    predict_song_from_yt,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Textbox(
-            label="Bilibili URL",
-            info="Please enter the Bilibili URL containing the song you want to convert. You can also use the BV number directly.",
-            value="https://www.bilibili.com/video/BV...",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=[
-        gr.Audio(label="AI Singer + Accompaniment"),
-        gr.Audio(label="AI Singer Vocals Only"),
-    ],
-    title="🌊💕🎶 - Upload Audio from Bilibili, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface_mic = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="microphone",
-            label="Use Microphone to Upload Your Song",
-            description="Upload the song you want to convert using your microphone.",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio from Microphone, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface_file = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface_yt = gr.Interface(
-    predict_song_from_yt,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Textbox(
-            label="Bilibili URL",
-            info="Please enter the Bilibili URL containing the song you want to convert. You can also use the BV number directly.",
-            value="https://www.bilibili.com/video/BV...",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=[
-        gr.Audio(label="AI Singer + Accompaniment"),
-        gr.Audio(label="AI Singer Vocals Only"),
-    ],
-    title="🌊💕🎶 - Upload Audio from Bilibili, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface_mic = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="microphone",
-            label="Use Microphone to Upload Your Song",
-            description="Upload the song you want to convert using your microphone.",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio from Microphone, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface_file = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-
-interface = gr.TabbedInterface(
-    [interface_yt, interface_mic, interface_file],
-    ["📺 - Upload Audio from Bilibili ⭐Recommended⭐", "🎙️ - Upload Audio from Microphone", "🎵 - Upload Audio File"],
-)
-
+
+    def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
+        wav, sr = librosa.load(filename, mono=False, sr=sr)
+        wav = torch.tensor(wav)
+        ref = wav.mean(0)
+        wav = (wav - ref.mean()) / ref.std()
+        sources = apply_model(
+            model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
+        )[0]
+        sources = sources * ref.std() + ref.mean()
+        # We take just the vocals stem. I know the vocals for this model are at index -1
+        # If using a different model, check model.sources.index('vocals')
+        vocal_wav = sources[-1]
+        # I did this because it's the same normalization the so-vits model required
+        vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
+        vocal_wav = vocal_wav.numpy()
+        vocal_wav = librosa.to_mono(vocal_wav)
+        vocal_wav = vocal_wav.T
+        instrumental_wav = sources[:-1].sum(0).numpy().T
+        return vocal_wav, instrumental_wav
+
+    def download_youtube_clip(
+        video_identifier,
+        start_time,
+        end_time,
+        output_filename,
+        num_attempts=5,
+        url_base="https://www.youtube.com/watch?v=",
+        quiet=False,
+        force=False,
+    ):
+        output_path = Path(output_filename)
+        if output_path.exists():
+            if not force:
+                return output_path
+            else:
+                output_path.unlink()
+
+        quiet = "--quiet --no-warnings" if quiet else ""
+        command = f"""
+            yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
+        """.strip()
+
+        attempts = 0
+        while True:
+            try:
+                _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError:
+                attempts += 1
+                if attempts == num_attempts:
+                    return None
+            else:
+                break
+
+        if output_path.exists():
+            return output_path
+        else:
+            return None
+
+    def predict(
+        speaker,
+        audio,
+        transpose: int = 0,
+        auto_predict_f0: bool = False,
+        cluster_infer_ratio: float = 0,
+        noise_scale: float = 0.4,
+        f0_method: str = "crepe",
+        db_thresh: int = -40,
+        pad_seconds: float = 0.5,
+        chunk_seconds: float = 0.5,
+        absolute_thresh: bool = False,
+    ):
+        audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+        audio = model.infer_silence(
+            audio.astype(np.float32),
+            speaker=speaker,
+            transpose=transpose,
+            auto_predict_f0=auto_predict_f0,
+            cluster_infer_ratio=cluster_infer_ratio,
+            noise_scale=noise_scale,
+            f0_method=f0_method,
+            db_thresh=db_thresh,
+            pad_seconds=pad_seconds,
+            chunk_seconds=chunk_seconds,
+            absolute_thresh=absolute_thresh,
+        )
+        return model.target_sample, audio
+
+    def predict_song_from_yt(
+        ytid_or_url,
+        start,
+        end,
+        speaker=speakers[0],
+        transpose: int = 0,
+        auto_predict_f0: bool = False,
+        cluster_infer_ratio: float = 0,
+        noise_scale: float = 0.4,
+        f0_method: str = "dio",
+        db_thresh: int = -40,
+        pad_seconds: float = 0.5,
+        chunk_seconds: float = 0.5,
+        absolute_thresh: bool = False,
+    ):
+        end = min(start + duration_limit, end)
+        original_track_filepath = download_youtube_clip(
+            ytid_or_url,
+            start,
+            end,
+            "track.wav",
+            force=True,
+            url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
+        )
+        vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
+        if transpose != 0:
+            inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
+        cloned_vox = model.infer_silence(
+            vox_wav.astype(np.float32),
+            speaker=speaker,
+            transpose=transpose,
+            auto_predict_f0=auto_predict_f0,
+            cluster_infer_ratio=cluster_infer_ratio,
+            noise_scale=noise_scale,
+            f0_method=f0_method,
+            db_thresh=db_thresh,
+            pad_seconds=pad_seconds,
+            chunk_seconds=chunk_seconds,
+            absolute_thresh=absolute_thresh,
+        )
+        full_song = inst_wav + np.expand_dims(cloned_vox, 1)
+        return (model.target_sample, full_song), (model.target_sample, cloned_vox)
+
+    description = f"""
+    <center>💡 - How to use this app: select the "Upload from a Bilibili video" module at the top of the page, fill in the video URL and the start/end times, then click the "submit" button! You can also click the examples at the very bottom of the page for a quick preview.</center>
+    """.strip()
+
+    article = """
+    <p style='text-align: center'>Note❗: Please do not generate content that could harm individuals or organizations. This app is intended for research, study, and personal entertainment only.
+    </p>
+    """.strip()
+
+    interface = gr.Interface(
+        predict,
+        inputs=[
+            gr.Dropdown(speakers, label="🎤 AI Singer Selection 🎶"),
+            gr.Audio(type="filepath", source="microphone", label="Use the microphone to upload the song you want to convert"),
+            gr.Slider(-12, 12, value=0, step=1, label="Transpose (default 0; positive or negative, +2 raises the pitch by two keys)"),
+            gr.Checkbox(False, label="Enable automatic f0 prediction", info="Check to enable; works better together with the cluster model's f0 prediction, and should only be used for speech conversion", visible=False),
+            gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="Cluster model mix ratio", info="Between 0 and 1; 0 disables clustering. The cluster model improves timbre similarity but weakens articulation (if used, around 0.5 is recommended)"),
+            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (recommended to leave unchanged)", visible=False),
+            gr.Dropdown(
+                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+                value=default_f0_method,
+                label="Inference method (crepe works best)", visible=False
+            ),
+        ],
+        outputs="audio",
+        cache_examples=False,
+        title=f"🌊💕🎶 - 滔滔AI + Music: upload source material straight from Bilibili, no need to separate the backing track ({repo_id})",
+        description=description,
+        article=article,
+    )
+    interfaces.append(interface)
+
+# Combine the interfaces using a TabbedInterface
+interface = gr.TabbedInterface(interfaces, [f"Model {i+1}" for i in range(len(interfaces))])
 
500
  interface.launch(show_error=True)
 