fix pipelines
Files changed:
- audiodiffusion/__init__.py +16 -11
- notebooks/test_vae.ipynb +2 -2
- scripts/train_unconditional.py +8 -8
audiodiffusion/__init__.py CHANGED

@@ -5,7 +5,7 @@ import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 from librosa.beat import beat_track
-from diffusers import
+from diffusers import DiffusionPipeline

 from .mel import Mel

@@ -42,13 +42,14 @@ class AudioDiffusion:
                        hop_length=hop_length,
                        top_db=top_db)
         self.model_id = model_id
-        self.
+        self.pipe = DiffusionPipeline.from_pretrained(self.model_id)
         if cuda:
-            self.
+            self.pipe.to("cuda")
         self.progress_bar = progress_bar or (lambda _: _)

     def generate_spectrogram_and_audio(
             self,
+            steps: int = None,
             generator: torch.Generator = None
     ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
         """Generate random mel spectrogram and convert to audio.
@@ -60,7 +61,10 @@ class AudioDiffusion:
             PIL Image: mel spectrogram
             (float, np.ndarray): sample rate and raw audio
         """
-        images = self.
+        images = self.pipe(output_type="numpy",
+                           generator=generator,
+                           num_inference_steps=self.pipe.scheduler.
+                           num_train_timesteps)["sample"]
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
         audio = self.mel.image_to_audio(image)
@@ -95,16 +99,17 @@ class AudioDiffusion:
             (float, np.ndarray): sample rate and raw audio
         """

-        # It would be better to derive a class from
+        # It would be better to derive a class from DiffusionPipeline
         # but currently the return type ImagePipelineOutput cannot be imported.
         if steps is None:
-            steps = self.
-
+            steps = self.pipe.scheduler.num_train_timesteps
+        # Unfortunately, the schedule is set up in the constructor.
+        scheduler = self.pipe.scheduler.__class__(num_train_timesteps=steps)
         scheduler.set_timesteps(steps)
         mask = None
         images = noise = torch.randn(
-            (1, self.
-             self.
+            (1, self.pipe.unet.in_channels, self.pipe.unet.sample_size,
+             self.pipe.unet.sample_size),
             generator=generator)

         if audio_file is not None or raw_audio is not None:
@@ -129,10 +134,10 @@ class AudioDiffusion:
             torch.tensor(input_image[np.newaxis, np.newaxis, :]), noise,
             torch.tensor(scheduler.timesteps[start_step:]))

-        images = images.to(self.
+        images = images.to(self.pipe.device)
         for step, t in enumerate(
                 self.progress_bar(scheduler.timesteps[start_step:])):
-            model_output = self.
+            model_output = self.pipe.unet(images, t)['sample']
             images = scheduler.step(model_output,
                                     t,
                                     images,
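For reference, a minimal usage sketch of the refactored class, assuming the constructor arguments and return type shown in the hunks above; the model id and output path are illustrative and not part of this commit.

# Sketch only: model id and output path are assumptions, not part of this commit.
from audiodiffusion import AudioDiffusion

audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")
# Per the type hint above, this returns a mel-spectrogram image plus (sample_rate, raw audio).
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
image.save("spectrogram.png")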
notebooks/test_vae.ipynb CHANGED

@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "bcbbe26c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -57,7 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ds = load_dataset('teticio/audio-diffusion-
+    "ds = load_dataset('teticio/audio-diffusion-256')"
    ]
   },
   {
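The notebook cell now points at the 256x256 dataset; below is a minimal sketch of inspecting what that call returns, assuming the public teticio/audio-diffusion-256 dataset on the Hugging Face Hub (the split name is an assumption).

# Sketch only: the "train" split and the column layout are assumptions.
from datasets import load_dataset

ds = load_dataset("teticio/audio-diffusion-256")
print(ds)                 # available splits and their sizes
example = ds["train"][0]  # assumes a "train" split exists
print(example.keys())     # inspect the column names directly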
scripts/train_unconditional.py CHANGED

@@ -231,15 +231,14 @@ def main(args):

         # Generate sample images for visual inspection
         if accelerator.is_main_process:
-            if
+            if (
+                epoch + 1
+            ) % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 if args.vae is not None:
-                    pipeline = LDMPipeline(
-
-
-
-                        vqvae=vqvae,
-                        scheduler=noise_scheduler,
-                    )
+                    pipeline = LDMPipeline(unet=accelerator.unwrap_model(
+                        ema_model.averaged_model if args.use_ema else model),
+                                           vqvae=vqvae,
+                                           scheduler=noise_scheduler)
                 else:
                     pipeline = DDPMPipeline(
                         unet=accelerator.unwrap_model(
@@ -269,6 +268,7 @@ def main(args):
                     generator=generator,
                     batch_size=args.eval_batch_size,
                     output_type="numpy",
+                    num_inference_steps=args.num_train_steps,
                 )["sample"]

             # denormalize the images and save to tensorboard
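As a sanity check on the first hunk above, a minimal stand-alone sketch of the new sampling trigger, with made-up values in place of args.save_model_epochs and args.num_epochs.

# Sketch only: illustrative values, not the script's defaults.
save_model_epochs = 5
num_epochs = 12

for epoch in range(num_epochs):
    if (epoch + 1) % save_model_epochs == 0 or epoch == num_epochs - 1:
        print(f"epoch {epoch}: generate sample images")
# Fires on epochs 4 and 9 (every 5th epoch, counting from 1) and on the final epoch 11.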