fixes
- README.md +2 -2
- audiodiffusion/__init__.py +23 -7
- scripts/train_unconditional.py +25 -15
- scripts/train_vae.py +1 -0
README.md CHANGED

@@ -71,7 +71,7 @@ python scripts/audio_to_images.py \
    --output_dir data/audio-diffusion-256 \
    --push_to_hub teticio/audio-diffusion-256
 ```
-
+
 ## Train model
 #### Run training on local machine.
 ```bash
@@ -123,7 +123,7 @@ accelerate launch --config_file config/accelerate_sagemaker.yaml \
    --mixed_precision no
 ```
 ## Latent Audio Diffusion
-Rather than denoising images directly, it is interesting to work in the "latent space" after first encoding images using an autoencoder. This has a number of advantages. Firstly, the information in the images is compressed into a latent space of a much lower dimension, so it is much faster to train denoising diffusion models and run inference with them. Secondly,
+Rather than denoising images directly, it is interesting to work in the "latent space" after first encoding images using an autoencoder. This has a number of advantages. Firstly, the information in the images is compressed into a latent space of a much lower dimension, so it is much faster to train denoising diffusion models and run inference with them. Secondly, similar images tend to be clustered together and interpolating between two images in latent space can produce meaningful combinations.
 
 At the time of writing, the Hugging Face `diffusers` library is geared towards inference and lacking in training functionality, rather like its cousin `transformers` in the early days of development. In order to train a VAE (Variational Autoencoder), I use the [stable-diffusion](https://github.com/CompVis/stable-diffusion) repo from CompVis and convert the checkpoints to `diffusers` format. Note that it uses a perceptual loss function for images; it would be nice to try a perceptual *audio* loss function.
 
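The "Latent Audio Diffusion" paragraph added to the README describes compressing spectrogram images into a lower-dimensional latent space with an autoencoder before running diffusion. As a rough, standalone illustration of that idea (not code from this commit; the checkpoint path and image file below are placeholders), a round trip through a `diffusers` `AutoencoderKL` looks roughly like this:

```python
# Minimal sketch: round-trip a spectrogram image through a VAE latent space.
# "path/to/vae" and "slice.png" are placeholders, not files from this repo.
import numpy as np
import torch
from PIL import Image
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("path/to/vae")  # VAE already in diffusers format

image = Image.open("slice.png").convert("RGB")      # a mel spectrogram slice
x = torch.tensor(np.array(image)).permute(2, 0, 1)[None].float()
x = x / 255 * 2 - 1                                 # scale pixels to [-1, 1]

with torch.no_grad():
    latents = vae.encode(x).latent_dist.sample()    # much smaller than the image
    recon = vae.decode(latents).sample              # back to image space

# e.g. (1, 3, 256, 256) -> (1, 4, 32, 32) with the usual 8x downsampling factor
print(x.shape, "->", latents.shape)
```

Denoising then happens on `latents` rather than on the full-resolution image, which is what makes training and inference faster.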
audiodiffusion/__init__.py CHANGED

@@ -100,10 +100,10 @@ class AudioDiffusion:
         """
 
         # It would be better to derive a class from DiffusionPipeline
-        # but currently the return type ImagePipelineOutput cannot be imported
+        # but currently the return type ImagePipelineOutput cannot be imported
         if steps is None:
             steps = self.pipe.scheduler.num_train_timesteps
-        # Unfortunately, the schedule is set up in the constructor
+        # Unfortunately, the schedule is set up in the constructor
         scheduler = self.pipe.scheduler.__class__(num_train_timesteps=steps)
         scheduler.set_timesteps(steps)
         mask = None
@@ -121,15 +121,21 @@ class AudioDiffusion:
                                                  input_image.width))
             input_image = ((input_image / 255) * 2 - 1)
 
+            if hasattr(self.pipe, 'vqvae'):
+                input_image = self.pipe.vqvae.encode(
+                    input_image).latent_dist.sample(generator=generator)
+                input_image = 0.18215 * input_image
+
             if start_step > 0:
                 images[0, 0] = scheduler.add_noise(
                     torch.tensor(input_image[np.newaxis, np.newaxis, :]),
                     noise, torch.tensor(steps - start_step))
 
-
-
-
-
+            pixels_per_second = (self.mel.get_sample_rate() *
+                                 self.pipe.unet.sample_size /
+                                 self.mel.hop_length / self.mel.x_res)
+            mask_start = int(mask_start_secs * pixels_per_second)
+            mask_end = int(mask_end_secs * pixels_per_second)
             mask = scheduler.add_noise(
                 torch.tensor(input_image[np.newaxis, np.newaxis, :]), noise,
                 torch.tensor(scheduler.timesteps[start_step:]))
@@ -150,11 +156,21 @@ class AudioDiffusion:
                 if mask_end > 0:
                     images[0, 0, :, -mask_end:] = mask[step, 0, :, -mask_end:]
 
+        if hasattr(self.pipe, 'vqvae'):
+            # 0.18215 was scaling factor used in training to ensure unit variance
+            # This is also currently hardcoded in diffusers pipeline
+            images = 1 / 0.18215 * images
+            images = self.pipe.vqvae.decode(images)['sample']
+
         images = (images / 2 + 0.5).clamp(0, 1)
         images = images.cpu().permute(0, 2, 3, 1).numpy()
-
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
+
+        if hasattr(self.pipe,
+                   'vqvae') and self.pipe.vqvae.config['out_channels'] == 3:
+            image = image.convert('L')
+
         audio = self.mel.image_to_audio(image)
         return image, (self.mel.get_sample_rate(), audio)
 
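Two details in the hunks above are worth spelling out. Encoded latents are multiplied by 0.18215 before diffusion and divided by it before decoding, mirroring the scaling used when the VAE was trained. And `mask_start_secs`/`mask_end_secs` are converted from seconds into image columns: each spectrogram column covers `hop_length` audio samples, and the width is rescaled from `x_res` to the UNet's `sample_size` when working in latent space. A small sketch of that arithmetic with illustrative values (not read from this repo's configs):

```python
# Sketch of the seconds -> columns conversion used for masking.
# All numbers are illustrative, not values taken from this repository.
sample_rate = 22050   # audio samples per second
hop_length = 512      # audio samples represented by one spectrogram column
x_res = 256           # width of the mel spectrogram image, in columns
sample_size = 256     # width the UNet operates at (e.g. 32 when diffusing latents)

# columns of the (possibly rescaled) image per second of audio
pixels_per_second = sample_rate * sample_size / hop_length / x_res

mask_start_secs = 1.0
mask_start = int(mask_start_secs * pixels_per_second)
print(round(pixels_per_second, 2), mask_start)  # 43.07 columns/second -> 43 columns
```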
scripts/train_unconditional.py CHANGED

@@ -50,8 +50,10 @@ def main(args):
     model = UNet2DModel(
         sample_size=args.resolution
         if args.vae is None else args.latent_resolution,
-        in_channels=1
-
+        in_channels=1
+        if args.vae is None else vqvae.config['latent_channels'],
+        out_channels=1
+        if args.vae is None else vqvae.config['latent_channels'],
         layers_per_block=2,
         block_out_channels=(128, 128, 256, 256, 512, 512),
         down_block_types=(
@@ -115,9 +117,9 @@ def main(args):
     )
 
     def transforms(examples):
-        if args.vae is not None:
+        if args.vae is not None and vqvae.config['in_channels'] == 3:
             images = [
-                augmentations(image.convert(
+                augmentations(image.convert('RGB'))
                 for image in examples["image"]
             ]
         else:
@@ -182,6 +184,8 @@ def main(args):
                 with torch.no_grad():
                     clean_images = vqvae.encode(
                         clean_images).latent_dist.sample()
+                    # Scale latent images to ensure approximately unit variance
+                    clean_images = clean_images * 0.18215
 
             # Sample noise that we'll add to the images
             noise = torch.randn(clean_images.shape).to(clean_images.device)
@@ -231,9 +235,7 @@ def main(args):
 
         # Generate sample images for visual inspection
         if accelerator.is_main_process:
-            if (
-                    epoch + 1
-            ) % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
+            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 if args.vae is not None:
                     pipeline = LDMPipeline(unet=accelerator.unwrap_model(
                         ema_model.averaged_model if args.use_ema else model),
@@ -262,14 +264,16 @@ def main(args):
                 else:
                     pipeline.save_pretrained(output_dir)
 
-
+            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
+                generator = torch.manual_seed(42)
                 # run pipeline in inference (sample random noise and denoise)
-
-
-
-
-
-
+                with torch.no_grad():
+                    images = pipeline(
+                        generator=generator,
+                        batch_size=args.eval_batch_size,
+                        output_type="numpy",
+                        num_inference_steps=args.num_train_steps,
+                    )["sample"]
 
                 # denormalize the images and save to tensorboard
                 images_processed = ((images *
@@ -278,7 +282,13 @@ def main(args):
                 accelerator.trackers[0].writer.add_images(
                     "test_samples", images_processed, epoch)
                 for _, image in enumerate(images_processed):
-
+                    image = Image.fromarray(image[0])
+
+                    if args.vae is not None and vqvae.config[
+                            'out_channels'] == 3:
+                        image = image.convert('L')
+
+                    audio = mel.image_to_audio(image)
                     accelerator.trackers[0].writer.add_audio(
                         f"test_audio_{_}",
                         normalize(audio),
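The training-script changes hinge on the shape of the VAE's latent space: the UNet's `in_channels`/`out_channels` must match the VAE's `latent_channels` instead of the single spectrogram channel, and encoded latents are scaled by 0.18215 so they have roughly unit variance before noise is added. A minimal sketch of that wiring, assuming a VAE already converted to `diffusers` format at a placeholder path:

```python
# Sketch: size the UNet from the VAE's latent configuration and scale latents.
# "path/to/vae" is a placeholder; the other values are illustrative.
import torch
from diffusers import AutoencoderKL, UNet2DModel

vae = AutoencoderKL.from_pretrained("path/to/vae")
latent_channels = vae.config["latent_channels"]   # typically 4

model = UNet2DModel(
    sample_size=32,                # latent resolution, not the image resolution
    in_channels=latent_channels,
    out_channels=latent_channels,
)

images = torch.randn(2, 3, 256, 256)              # stand-in batch of spectrogram images
with torch.no_grad():
    latents = vae.encode(images).latent_dist.sample()
    latents = latents * 0.18215                   # approximately unit variance
noise = torch.randn_like(latents)                 # the script adds this via the scheduler
```

In the script itself these values come from `args.vae` and `args.latent_resolution` rather than being hard-coded.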
scripts/train_vae.py CHANGED

@@ -4,6 +4,7 @@
 
 # TODO
 # grayscale
+# update generate from audio to include vae step
 
 import os
 import argparse