use new models for now
notebooks/audio_diffusion_pipeline.ipynb
CHANGED
@@ -46,7 +46,7 @@
 "from datasets import load_dataset\n",
 "from IPython.display import Audio\n",
 "from librosa.beat import beat_track\n",
-"from diffusers import DiffusionPipeline
+"from diffusers import DiffusionPipeline"
 ]
 },
 {
@@ -56,8 +56,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"mel = Mel()\n",
-"sample_rate = mel.get_sample_rate()\n",
 "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
 "generator = torch.Generator(device=device)"
 ]
@@ -91,7 +89,7 @@
 "\n",
 "#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop\n",
 "\n",
-"model_id = \"teticio/audio-diffusion-256\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]"
+"model_id = \"teticio/audio-diffusion-256-new\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]"
 ]
 },
 {
@@ -101,7 +99,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -150,7 +150,7 @@
 " seed = generator.seed()\n",
 " print(f'Seed = {seed}')\n",
 " generator.manual_seed(seed)\n",
-" output = audio_diffusion(
+" output = audio_diffusion(generator=generator)\n",
 " image = output.images[0]\n",
 " audio = output.audios[0, 0]\n",
 " display(image)\n",
@@ -187,7 +187,7 @@
 "source": [
 "seed = 2391504374279719 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed)\n",
-"output = audio_diffusion(
+"output = audio_diffusion(generator=generator)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -206,7 +206,7 @@
 "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
 "track = loop_it(audio, sample_rate, loops=1)\n",
 "for variation in range(12):\n",
-" output = audio_diffusion(
+" output = audio_diffusion(raw_audio=audio, start_step=start_step)\n",
 " image2 = output.images[0]\n",
 " audio2 = output.audios[0, 0]\n",
 " display(image2)\n",
@@ -235,8 +235,7 @@
 "overlap_samples = overlap_secs * sample_rate\n",
 "track = audio\n",
 "for variation in range(12):\n",
-" output = audio_diffusion(
-" raw_audio=audio[-overlap_samples:],\n",
+" output = audio_diffusion(raw_audio=audio[-overlap_samples:],\n",
 " start_step=start_step,\n",
 " mask_start_secs=overlap_secs)\n",
 " image2 = output.images[0]\n",
@@ -306,8 +305,7 @@
 " # Normalize and re-insert generated audio\n",
 " audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
 " audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
-" output = audio_diffusion(
-" raw_audio=audio,\n",
+" output = audio_diffusion(raw_audio=audio,\n",
 " start_step=start_step,\n",
 " generator=generator,\n",
 " mask_start_secs=overlap_secs * not_first)\n",
@@ -334,8 +332,7 @@
 "source": [
 "sample = 3 #@param {type:\"integer\"}\n",
 "raw_audio = track_audio[sample * stride:sample * stride + slice_size]\n",
-"output = audio_diffusion(
-" raw_audio=raw_audio,\n",
+"output = audio_diffusion(raw_audio=raw_audio,\n",
 " mask_start_secs=1,\n",
 " mask_end_secs=1,\n",
 " step_generator=torch.Generator(device=device))\n",
@@ -359,7 +356,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained('teticio/audio-diffusion-ddim-256').to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained('teticio/audio-diffusion-ddim-256-new').to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -381,7 +380,7 @@
 " seed = generator.seed()\n",
 " print(f'Seed = {seed}')\n",
 " generator.manual_seed(seed)\n",
-" output = audio_diffusion(
+" output = audio_diffusion(generator=generator)\n",
 " image = output.images[0]\n",
 " audio = output.audios[0, 0]\n",
 " display(image)\n",
@@ -410,7 +409,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"output = audio_diffusion(
+"output = audio_diffusion(steps=1000, generator=generator, eta=1)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -509,7 +508,6 @@
 "source": [
 "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
 "output = audio_diffusion(\n",
-" mel=mel,\n",
 " noise=audio_diffusion.slerp(noise, noise2, alpha),\n",
 " generator=generator)\n",
 "audio = output.audios[0, 0]\n",
@@ -534,7 +532,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"model_id = \"teticio/latent-audio-diffusion-ddim-256\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
+"model_id = \"teticio/latent-audio-diffusion-ddim-256-new\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
 ]
 },
 {
@@ -544,7 +542,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -556,7 +556,7 @@
 "source": [
 "seed = 3412253600050855 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed)\n",
-"output = audio_diffusion(
+"output = audio_diffusion(generator=generator)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -572,7 +572,7 @@
 "source": [
 "seed2 = 7016114633369557 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed2)\n",
-"output = audio_diffusion(
+"output = audio_diffusion(generator=generator)\n",
 "image2 = output.images[0]\n",
 "audio2 = output.audios[0, 0]\n",
 "display(image2)\n",
@@ -628,7 +628,6 @@
 "source": [
 "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
 "output = audio_diffusion(\n",
-" mel=mel,\n",
 " noise=audio_diffusion.slerp(latents, latents2, alpha),\n",
 " generator=generator)\n",
 "audio3 = output.audios[0, 0]\n",
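In short, the commit points the notebook at the "-new" model repos and takes the Mel converter and sample rate from the loaded pipeline instead of constructing Mel() separately. A minimal sketch of the resulting setup and generation flow, assembled from the added lines above (the seed value is arbitrary, and loading these community pipelines through DiffusionPipeline.from_pretrained assumes a diffusers install that resolves the custom audio-diffusion pipeline class):

import torch
from IPython.display import Audio
from diffusers import DiffusionPipeline

# Load the updated model; the Mel converter now lives on the pipeline itself
device = "cuda" if torch.cuda.is_available() else "cpu"
audio_diffusion = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256-new").to(device)
mel = audio_diffusion.mel
sample_rate = mel.get_sample_rate()

# Generate one spectrogram/audio pair from a seeded generator
generator = torch.Generator(device=device).manual_seed(42)
output = audio_diffusion(generator=generator)
image = output.images[0]
audio = output.audios[0, 0]
Audio(audio, rate=sample_rate)  # listen back in the notebook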