nijisakai committed
Commit
eb7e85b
1 Parent(s): 43ee5dc

Update app.py

Files changed (1)
  1. app.py +69 -262
app.py CHANGED
@@ -1,6 +1,3 @@
- import torch.nn as nn
- import io
-
  import json
  import os
  import subprocess
@@ -17,273 +14,83 @@ from huggingface_hub import hf_hub_download, list_repo_files
  from so_vits_svc_fork.hparams import HParams
  from so_vits_svc_fork.inference.core import Svc

- repo_id = ["nijisakai/sunyanzi", "kevinwang676/talktalkai-qing"]
-
- ckpt_names = []
- latest_ids = []
- for repo in repo_id:
-     latest_id = sorted(
-         [
-             int(Path(x).stem.split("_")[1])
-             for x in list_repo_files(repo)
-             if x.startswith("G_") and x.endswith(".pth")
-         ]
-     )[-1]
-     ckpt_names.append(f"G_{latest_id}.pth")
-     latest_ids.append(latest_id)

- cluster_model_names = ["kmeans.pt" for _ in range(len(repo_id))]
- cluster_model_paths = [
-     hf_hub_download(repo, name) if name in list_repo_files(repo) else None for repo, name in zip(repo_id, cluster_model_names)
- ]
- device = "cuda" if torch.cuda.is_available() else "cpu"

- generator_paths = [hf_hub_download(repo, ckpt_name) for repo, ckpt_name in zip(repo_id, ckpt_names)]

- config_paths = [hf_hub_download(repo, "config.json") for repo in repo_id]
- hparams_list = [HParams(**json.loads(Path(config_path).read_text())) for config_path in config_paths]
- speakers = []
- for hparams in hparams_list:
-     speakers.extend(list(hparams.spk.keys()))
-
- models = [
-     Svc(net_g_path=gen_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
-     for gen_path, config_path, cluster_model_path in zip(generator_paths, config_paths, cluster_model_paths)
- ]
- demucs_model = get_model(DEFAULT_MODEL)

  duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
-
-
-
-
-
- def extract_vocal_demucs(model_path, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
-     model = nn.Module()
-     with open(model_path, "rb") as f:
-         buffer = io.BytesIO(f.read())
-     model_state_dict = torch.load(buffer)
-     model.load_state_dict(model_state_dict)
-     model.to(device)
-
-     wav, sr = librosa.load(filename, mono=False, sr=sr)
-     wav = torch.tensor(wav)
-     ref = wav.mean(0)
-     wav = (wav - ref.mean()) / ref.std()
-     sources = apply_model(
-         model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
-     )[0]
-     sources = sources * ref.std() + ref.mean()
-     vocal_wav = sources[-1]
-     vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
-     vocal_wav = vocal_wav.numpy()
-     vocal_wav = librosa.to_mono(vocal_wav)
-     vocal_wav = vocal_wav.T
-     instrumental_wav = sources[:-1].sum(0).numpy().T
-     return vocal_wav, instrumental_wav
-
-
- def predict(models, speaker, audio, transpose: int = 0, auto_predict_f0: bool = False, cluster_infer_ratio: float = 0,
-             noise_scale: float = 0.4, f0_method: str = "crepe", db_thresh: int = -40, pad_seconds: float = 0.5,
-             chunk_seconds: float = 0.5, absolute_thresh: bool = False):
-     audio, _ = librosa.load(audio, sr=models[0].target_sample, duration=duration_limit)
-     audio = model.infer_silence(
-         audio.astype(np.float32),
-         speaker=speaker,
-         transpose=transpose,
-         auto_predict_f0=auto_predict_f0,
-         cluster_infer_ratio=cluster_infer_ratio,
-         noise_scale=noise_scale,
-         f0_method=f0_method,
-         db_thresh=db_thresh,
-         pad_seconds=pad_seconds,
-         chunk_seconds=chunk_seconds,
-         absolute_thresh=absolute_thresh,
-     )
-     return model.target_sample, audio
-
-
- def predict_song_from_yt(
-     ytid_or_url,
-     start,
-     end,
-     speaker,
-     transpose: int = 0,
-     auto_predict_f0: bool = False,
-     cluster_infer_ratio: float = 0,
-     noise_scale: float = 0.4,
-     f0_method: str = "dio",
-     db_thresh: int = -40,
-     pad_seconds: float = 0.5,
-     chunk_seconds: float = 0.5,
-     absolute_thresh: bool = False,
- ):
-     # Check if start and end are valid numeric values
-     try:
-         start = float(start)
-         end = float(end)
-     except ValueError:
-         raise ValueError("Invalid start or end time. Please provide valid numeric values.")
-
-
-     end = min(start + duration_limit, end)
-     original_track_filepath = download_youtube_clip(
-         ytid_or_url,
-         start,
-         end,
-         "track.wav",
-         force=True,
-         url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
-     )
-     vox_wav, inst_wav = extract_vocal_demucs(models[0], original_track_filepath)
-     if transpose != 0:
-         inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=models[0].target_sample, n_steps=transpose).T
-     cloned_vox = models[0].infer_silence(
-         vox_wav.astype(np.float32),
-         speaker=speaker,
-         transpose=transpose,
-         auto_predict_f0=auto_predict_f0,
-         cluster_infer_ratio=cluster_infer_ratio,
-         noise_scale=noise_scale,
-         f0_method=f0_method,
-         db_thresh=db_thresh,
-         pad_seconds=pad_seconds,
-         chunk_seconds=chunk_seconds,
-         absolute_thresh=absolute_thresh,
-     )
-     full_song = inst_wav + np.expand_dims(cloned_vox, 1)
-     return (models[0].target_sample, full_song), (models[0].target_sample, cloned_vox)
-
-
-
- description = f"""
- <center>💡 - How to use this app: Select the "Predict from YouTube Video" tab above, fill in the YouTube video URL and the start and end times of the video, then click the "Submit" button!</center>
- """.strip()
-
- article = """
- <p style='text-align: center'> Note❗: Please do not generate content that may cause harm to individuals or organizations. This program is for research, learning, and personal entertainment purposes only.
- </p>
- """.strip()
-
- def download_youtube_clip(
-     video_identifier,
-     start_time,
-     end_time,
-     output_filename,
-     num_attempts=5,
-     url_base="https://www.youtube.com/watch?v=",
-     quiet=False,
-     force=False,
- ):
-     output_path = Path(output_filename)
-     if output_path.exists():
-         if not force:
-             return output_path
-         else:
-             output_path.unlink()
-
-     quiet = "--quiet --no-warnings" if quiet else ""
-     command = f"""
-         yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
-     """.strip()
-
-     attempts = 0
-     while True:
-         try:
-             _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
-         except subprocess.CalledProcessError:
-             attempts += 1
-             if attempts == num_attempts:
-                 return None
-         else:
-             break
-
-     if output_path.exists():
-         return output_path
-     else:
-         return None
-
- interface_mic = gr.Interface(
-     predict,
-     inputs=[
-         gr.Dropdown(speakers, label="🎤AI Singer Selection🎶"),
-         gr.Audio(type="filepath", source="microphone", label="Please upload the song you want to convert using the microphone"),
-         gr.Slider(-12, 12, value=0, step=1, label="Transpose (default is 0; positive values for pitch increase)"),
-         gr.Checkbox(False, label="Enable Automatic f0 Prediction", info="Check this box to enable; works best with clustering model for f0 prediction, use for voice conversion only", visible=False),
-         gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Cluster Inference Ratio", info="0-1 range, 0 for no clustering. Using clustering model can improve timbre similarity, but may affect articulation (recommended value around 0.5)"),
-         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Noise Scale (keep unchanged)", visible=False),
-         gr.Dropdown(
-             choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-             value="crepe",
-             label="Model Inference Method (crepe gives the best results)", visible=False
-         ),
-     ],
-     outputs="audio",
-     cache_examples=False,
-     title="🌊💕🎶 - AI Music Generation: Upload from Bilibili Directly, No Need to Separate Background Audio",
-     description=description,
-     article=article,
- )
-
- interface_file = gr.Interface(
-     predict,
-     inputs=[
-         gr.Dropdown(speakers, value=speakers[0], label="🎤AI Singer Selection🎶"),
-         gr.Audio(type="filepath", source="upload", label="Please upload the song you want to convert (vocals only)"),
-         gr.Slider(-12, 12, value=0, step=1, label="Transpose (default is 0; positive values for pitch increase)"),
-         gr.Checkbox(False, label="Enable Automatic f0 Prediction", info="Check this box to enable; works best with clustering model for f0 prediction, use for voice conversion only", visible=False),
-         gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Cluster Inference Ratio", info="0-1 range, 0 for no clustering. Using clustering model can improve timbre similarity, but may affect articulation (recommended value around 0.5)"),
-         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Noise Scale (keep unchanged)", visible=False),
-         gr.Dropdown(
-             choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-             value="crepe",
-             label="Model Inference Method (crepe gives the best results)", visible=False
-         ),
-     ],
-     outputs="audio",
-     cache_examples=False,
-     title="🌊💕🎶 Upload from Bilibili Directly, No Need to Separate Background Audio",
-     description=description,
-     article=article,
- )
-
- interface_yt = gr.Interface(
-     predict_song_from_yt,
-     inputs=[
-         gr.Textbox(
-             label="Bilibili URL", info="Please provide the Bilibili URL containing the song you like, you can also directly input the BV number"
-         ),
-         gr.Number(value=0, label="Start Time (seconds)"),
-         gr.Number(value=15, label="End Time (seconds)"),
-         gr.Dropdown(speakers, value=speakers[0], label="🎤AI Singer Selection🎶"),
-         gr.Slider(-12, 12, value=0, step=1, label="Transpose (default is 0; positive values for pitch increase)"),
-         gr.Checkbox(False, label="Enable Automatic f0 Prediction", info="Check this box to enable; works best with clustering model for f0 prediction, use for voice conversion only", visible=False),
-         gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Cluster Inference Ratio", info="0-1 range, 0 for no clustering. Using clustering model can improve timbre similarity, but may affect articulation"),
-         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Noise Scale (keep unchanged)", visible=False),
-         gr.Dropdown(
-             choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-             value="crepe",
-             label="Model Inference Method (crepe gives the best results)", visible=False
-         ),
-     ],
-     outputs=[gr.Audio(label="AI Singer + Accompaniment🎵"), gr.Audio(label="AI Singer Vocals🎤")],
-     title="🌊💕🎶 Upload from Bilibili Directly, No Need to Separate Background Audio",
-     description=description,
-     article=article,
-     cache_examples=False,
- )

  interfaces = []
- for idx, model in enumerate(models):
-     interfaces.append(
-         gr.TabbedInterface(
-             [interface_yt, interface_mic, interface_file],
-             ["📺 Predict from Bilibili Video ⭐Recommended⭐", "🎙️ Predict from Microphone", "🎵 Predict from File"],
-         )
      )

  if __name__ == "__main__":
-     for idx, interface in enumerate(interfaces):
-         print(f"Launching Interface {idx + 1}")
-         interface.launch(show_error=True)
 
 
 
 
  import json
  import os
  import subprocess

  from so_vits_svc_fork.hparams import HParams
  from so_vits_svc_fork.inference.core import Svc

+ ###################################################################
+ # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+ ###################################################################
+ # The Hugging Face Hub repo IDs - modify repo_id here; it can be swapped for any already-trained model!
+ repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay"]

+ # If None, uses the latest ckpt in the repo
+ ckpt_name = None

+ # If None, uses "kmeans.pt" if it exists in the repo
+ cluster_model_name = None

+ # Set the default f0 type to use - use the one it was trained on.
+ # The default for so-vits-svc-fork is "dio".
+ # Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+ default_f0_method = "crepe"

+ # The default ratio of cluster inference to SVC inference.
+ # If cluster_model_name is not found in the repo, this is set to 0.
+ default_cluster_infer_ratio = 0.5

+ # Limit on the duration of audio at inference time. Increase it if you can.
+ # In this parent app, we set the limit to 30 seconds with an env var.
+ # If you didn't set the env var and you go OOM, try changing 9e9 to <= 300 or so.
  duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
+ ###################################################################

  interfaces = []
+ for repo_id in repo_ids:
+     # Figure out the latest generator by taking the highest-numbered one.
+     # Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
+     if ckpt_name is None:
+         latest_id = sorted(
+             [
+                 int(Path(x).stem.split("_")[1])
+                 for x in list_repo_files(repo_id)
+                 if x.startswith("G_") and x.endswith(".pth")
+             ]
+         )[-1]
+         ckpt_name = f"G_{latest_id}.pth"
+
+     cluster_model_name = cluster_model_name or "kmeans.pt"
+     if cluster_model_name in list_repo_files(repo_id):
+         print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
+         cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+     else:
+         print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
+         cluster_model_path = None
+     default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+
+     generator_path = hf_hub_download(repo_id, ckpt_name)
+     config_path = hf_hub_download(repo_id, "config.json")
+     hparams = HParams(**json.loads(Path(config_path).read_text()))
+     speakers = list(hparams.spk.keys())
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
+     demucs_model = get_model(DEFAULT_MODEL)
+
+     # ... (same code as before to define the functions)
+
+     interface = gr.Interface(
+         predict,
+         inputs=[
+             gr.Dropdown(speakers, label="🎤AI Singer Selection🎶"),
+             gr.Audio(type="filepath", source="microphone", label="Please upload the song you want to convert using the microphone"),
+             # ... (same inputs as before)
+         ],
+         outputs="audio",
+         cache_examples=False,
+         title=f"🌊💕🎶 - 滔滔AI + Music: Upload from Bilibili Directly, No Need to Separate Background Audio ({repo_id})",
+         description=description,
+         article=article,
+     )
+     interfaces.append(interface)
+
+ # Combine the interfaces using a TabbedInterface
+ interface = gr.TabbedInterface(interfaces, [f"Model {i+1}" for i in range(len(interfaces))])

  if __name__ == "__main__":
+     interface.launch(show_error=True)
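
For anyone adapting this Space, the per-repo setup that the new loop performs (resolve the newest G_*.pth, fetch config.json, optionally fetch kmeans.pt, then build an Svc instance) can be expressed as one small helper. The sketch below is illustrative only and is not part of the commit: the function name load_svc_model and the standalone layout are assumptions, while the huggingface_hub and so_vits_svc_fork calls are the same ones app.py already uses.

import json
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc


def load_svc_model(repo_id, ckpt_name=None, cluster_model_name="kmeans.pt"):
    """Sketch: download one so-vits-svc repo and build an Svc instance for it."""
    files = list_repo_files(repo_id)
    if ckpt_name is None:
        # Pick the generator with the highest step count, e.g. G_200.pth
        # when the repo holds G_0.pth, G_100.pth and G_200.pth.
        latest_step = max(
            int(Path(f).stem.split("_")[1])
            for f in files
            if f.startswith("G_") and f.endswith(".pth")
        )
        ckpt_name = f"G_{latest_step}.pth"
    # The cluster model is optional; fall back to None when the repo has none.
    cluster_model_path = (
        hf_hub_download(repo_id, cluster_model_name) if cluster_model_name in files else None
    )
    generator_path = hf_hub_download(repo_id, ckpt_name)
    config_path = hf_hub_download(repo_id, "config.json")
    hparams = HParams(**json.loads(Path(config_path).read_text()))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Svc(
        net_g_path=generator_path,
        config_path=config_path,
        device=device,
        cluster_model_path=cluster_model_path,
    )
    return model, list(hparams.spk.keys())

Called once per entry in repo_ids, a helper like this keeps checkpoint and cluster-model resolution local to each repo, mirroring what the loop above does for each tab.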