teticio committed
Commit: 96e542f
Parent: 001a426

fix pipelines

audiodiffusion/__init__.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 from librosa.beat import beat_track
-from diffusers import DDPMPipeline, DDPMScheduler
+from diffusers import DiffusionPipeline
 
 from .mel import Mel
 
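Note: the import swap above is what enables the rest of the commit. DiffusionPipeline.from_pretrained reads the checkpoint's model_index.json and instantiates whichever concrete pipeline class the model was saved with, so the same loading code serves both plain DDPM and latent-diffusion checkpoints. A minimal sketch (the model id is the one used in the notebook below):

    from diffusers import DiffusionPipeline

    # from_pretrained inspects the checkpoint's model_index.json and returns
    # the concrete pipeline class registered there, so callers no longer have
    # to hard-code DDPMPipeline.
    pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256")
    print(type(pipe).__name__)  # e.g. DDPMPipeline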
@@ -42,13 +42,14 @@ class AudioDiffusion:
                        hop_length=hop_length,
                        top_db=top_db)
         self.model_id = model_id
-        self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
+        self.pipe = DiffusionPipeline.from_pretrained(self.model_id)
         if cuda:
-            self.ddpm.to("cuda")
+            self.pipe.to("cuda")
         self.progress_bar = progress_bar or (lambda _: _)
 
     def generate_spectrogram_and_audio(
             self,
+            steps: int = None,
             generator: torch.Generator = None
     ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
         """Generate random mel spectrogram and convert to audio.
@@ -60,7 +61,10 @@ class AudioDiffusion:
             PIL Image: mel spectrogram
             (float, np.ndarray): sample rate and raw audio
         """
-        images = self.ddpm(output_type="numpy", generator=generator)["sample"]
+        images = self.pipe(output_type="numpy",
+                           generator=generator,
+                           num_inference_steps=self.pipe.scheduler.
+                           num_train_timesteps)["sample"]
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
         audio = self.mel.image_to_audio(image)
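Note: the generation call now passes num_inference_steps explicitly rather than relying on the pipeline's default. A hedged sketch of calling the loaded pipeline directly, with an illustrative seed ("pipe" is the pipeline loaded above; the dict-style "sample" return matches the diffusers version used in this file, newer versions return a dataclass instead):

    import torch

    generator = torch.Generator().manual_seed(42)  # illustrative seed
    images = pipe(output_type="numpy",
                  generator=generator,
                  num_inference_steps=pipe.scheduler.num_train_timesteps
                  )["sample"]
    # numpy output is channels-last, hence the transpose before Image.fromarray
    print(images.shape)  # e.g. (1, 256, 256, 1) for a single-channel mel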
@@ -95,16 +99,17 @@ class AudioDiffusion:
             (float, np.ndarray): sample rate and raw audio
         """
 
-        # It would be better to derive a class from DDPMDiffusionPipeline
+        # It would be better to derive a class from DiffusionPipeline
         # but currently the return type ImagePipelineOutput cannot be imported.
         if steps is None:
-            steps = self.ddpm.scheduler.num_train_timesteps
-        scheduler = DDPMScheduler(num_train_timesteps=steps)
+            steps = self.pipe.scheduler.num_train_timesteps
+        # Unfortunately, the schedule is set up in the constructor.
+        scheduler = self.pipe.scheduler.__class__(num_train_timesteps=steps)
         scheduler.set_timesteps(steps)
         mask = None
         images = noise = torch.randn(
-            (1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
-             self.ddpm.unet.sample_size),
+            (1, self.pipe.unet.in_channels, self.pipe.unet.sample_size,
+             self.pipe.unet.sample_size),
             generator=generator)
 
         if audio_file is not None or raw_audio is not None:
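Note: rebuilding the scheduler through self.pipe.scheduler.__class__ keeps the code scheduler-agnostic, but passing only num_train_timesteps reverts every other constructor argument (beta schedule, variance type, ...) to its default, which is fine as long as the checkpoint uses the defaults. A small sketch of the pattern with an illustrative step count:

    # The noise schedule is laid out in the constructor, so a custom step
    # count needs a fresh instance of the same scheduler class.
    steps = 50  # illustrative value
    scheduler = pipe.scheduler.__class__(num_train_timesteps=steps)
    scheduler.set_timesteps(steps)
    assert len(scheduler.timesteps) == steps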
@@ -129,10 +134,10 @@ class AudioDiffusion:
                 torch.tensor(input_image[np.newaxis, np.newaxis, :]), noise,
                 torch.tensor(scheduler.timesteps[start_step:]))
 
-        images = images.to(self.ddpm.device)
+        images = images.to(self.pipe.device)
         for step, t in enumerate(
                 self.progress_bar(scheduler.timesteps[start_step:])):
-            model_output = self.ddpm.unet(images, t)['sample']
+            model_output = self.pipe.unet(images, t)['sample']
             images = scheduler.step(model_output,
                                     t,
                                     images,
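Note: this loop is a hand-rolled reverse-diffusion pass: the UNet predicts the noise residual at each timestep and scheduler.step applies one denoising update. A condensed sketch of the same skeleton, assuming "pipe" and "scheduler" from the sketches above (the dict-style outputs match the usage in this file):

    import torch

    images = torch.randn(
        (1, pipe.unet.in_channels, pipe.unet.sample_size,
         pipe.unet.sample_size)).to(pipe.device)
    for t in scheduler.timesteps:
        with torch.no_grad():
            model_output = pipe.unet(images, t)['sample']  # predicted noise
        images = scheduler.step(model_output, t, images)['prev_sample']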
notebooks/test_vae.ipynb CHANGED
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3c8663ed",
+   "id": "bcbbe26c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -57,7 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ds = load_dataset('teticio/audio-diffusion-breaks-256')"
+    "ds = load_dataset('teticio/audio-diffusion-256')"
    ]
   },
   {
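Note: the second change simply points the notebook at the teticio/audio-diffusion-256 dataset instead of the breaks variant; both are Hub datasets and load the same way:

    from datasets import load_dataset

    ds = load_dataset('teticio/audio-diffusion-256')
    print(ds)  # inspect splits and features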
scripts/train_unconditional.py CHANGED
@@ -231,15 +231,14 @@ def main(args):
 
         # Generate sample images for visual inspection
         if accelerator.is_main_process:
-            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
+            if (
+                    epoch + 1
+            ) % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 if args.vae is not None:
-                    pipeline = LDMPipeline(
-                        unet=accelerator.unwrap_model(
-                            ema_model.averaged_model if args.use_ema else model
-                        ),
-                        vqvae=vqvae,
-                        scheduler=noise_scheduler,
-                    )
+                    pipeline = LDMPipeline(unet=accelerator.unwrap_model(
+                        ema_model.averaged_model if args.use_ema else model),
+                                           vqvae=vqvae,
+                                           scheduler=noise_scheduler)
                 else:
                     pipeline = DDPMPipeline(
                         unet=accelerator.unwrap_model(
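Note: the reworked condition fixes an off-by-one. With epochs counted from zero, the old test fired at the start-aligned epochs 0, 10, 20, ... (wasting a sampling pass on the untrained model at epoch 0), while the new one fires after every save_model_epochs-th completed epoch. A quick illustration with made-up values:

    # Made-up values for illustration.
    save_model_epochs, num_epochs = 10, 25
    old = [e for e in range(num_epochs)
           if e % save_model_epochs == 0 or e == num_epochs - 1]
    new = [e for e in range(num_epochs)
           if (e + 1) % save_model_epochs == 0 or e == num_epochs - 1]
    print(old)  # [0, 10, 20, 24]
    print(new)  # [9, 19, 24]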
@@ -269,6 +268,7 @@ def main(args):
                     generator=generator,
                     batch_size=args.eval_batch_size,
                     output_type="numpy",
+                    num_inference_steps=args.num_train_steps,
                 )["sample"]
 
                 # denormalize the images and save to tensorboard
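Note: passing num_inference_steps explicitly keeps the evaluation samples on the same schedule the model trains with, rather than whatever default the pipeline ships with (args.num_train_steps is assumed to be the training-schedule length defined elsewhere in the script's argument parser). With the argument threaded through, cheaper previews with a shorter schedule also become possible; an illustrative variant of the same call:

    # Illustrative: a quick preview with a 50-step schedule instead of the
    # full training schedule.
    preview = pipeline(generator=generator,
                       batch_size=1,
                       output_type="numpy",
                       num_inference_steps=50)["sample"]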