Commit d68360d · 1 Parent(s): 0954180 · committed by teticio

sync with latest version of diffusers

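In practice, the sync means the pipeline `__call__` now accepts `return_dict` (returning diffusers pipeline outputs by default), noise is created directly on the pipeline's device, and the notebook builds its generators on that device. A minimal usage sketch of the high-level wrapper after this change; the model id and the `model_id=` constructor keyword are assumptions taken from the repo's notebook, not anything this commit guarantees:

    import torch
    from audiodiffusion import AudioDiffusion

    # Assumed checkpoint name; any audio-diffusion checkpoint on the Hub should behave the same way.
    audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")

    # Generators should now live on the same device as the pipeline (see the notebook changes below).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    generator = torch.Generator(device=device)

    # The wrapper passes return_dict=False internally, so it still returns the old tuple form.
    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(generator=generator)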
audiodiffusion/__init__.py CHANGED
@@ -8,6 +8,8 @@ from tqdm.auto import tqdm
 from librosa.beat import beat_track
 from diffusers import (DiffusionPipeline, UNet2DConditionModel, DDIMScheduler,
                        DDPMScheduler, AutoencoderKL)
+from diffusers.pipeline_utils import (AudioPipelineOutput, BaseOutput,
+                                      ImagePipelineOutput)
 
 from .mel import Mel
 
@@ -83,7 +85,8 @@ class AudioDiffusion:
             generator=generator,
             step_generator=step_generator,
             eta=eta,
-            noise=noise)
+            noise=noise,
+            return_dict=False)
         return images[0], (sample_rate, audios[0])
 
     def generate_spectrogram_and_audio_from_audio(
@@ -133,7 +136,8 @@ class AudioDiffusion:
             mask_end_secs=mask_end_secs,
             step_generator=step_generator,
             eta=eta,
-            noise=noise)
+            noise=noise,
+            return_dict=False)
         return images[0], (sample_rate, audios[0])
 
     @staticmethod
@@ -158,9 +162,7 @@ class AudioDiffusion:
 
 
 class AudioDiffusionPipeline(DiffusionPipeline):
-
-    def __init__(self, unet: UNet2DConditionModel,
-                 scheduler: Union[DDIMScheduler, DDPMScheduler]):
+    def __init__(self, unet: UNet2DConditionModel, scheduler: Union[DDIMScheduler, DDPMScheduler]):
         super().__init__()
         self.register_modules(unet=unet, scheduler=scheduler)
 
@@ -170,11 +172,13 @@ class AudioDiffusionPipeline(DiffusionPipeline):
         Returns:
             Tuple: (height, width)
         """
-        input_module = self.vqvae if hasattr(self, 'vqvae') else self.unet
+        input_module = self.vqvae if hasattr(self, "vqvae") else self.unet
         # For backwards compatibility
         sample_size = (
-            input_module.sample_size, input_module.sample_size) if type(
-                input_module.sample_size) == int else input_module.sample_size
+            (input_module.sample_size, input_module.sample_size)
+            if type(input_module.sample_size) == int
+            else input_module.sample_size
+        )
         return sample_size
 
     def get_default_steps(self) -> int:
@@ -200,8 +204,11 @@ class AudioDiffusionPipeline(DiffusionPipeline):
         mask_end_secs: float = 0,
         step_generator: torch.Generator = None,
         eta: float = 0,
-        noise: torch.Tensor = None
-    ) -> Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]]:
+        noise: torch.Tensor = None,
+        return_dict=True,
+    ) -> Union[
+        Union[AudioPipelineOutput, ImagePipelineOutput], Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]]
+    ]:
         """Generate random mel spectrogram from audio input and convert to audio.
 
         Args:
@@ -218,10 +225,10 @@ class AudioDiffusionPipeline(DiffusionPipeline):
             step_generator (torch.Generator): random number generator used to de-noise or None
             eta (float): parameter between 0 and 1 used with DDIM scheduler
             noise (torch.Tensor): noise tensor of shape (batch_size, 1, height, width) or None
+            return_dict (bool): if True return AudioPipelineOutput, ImagePipelineOutput else Tuple
 
         Returns:
-            List[PIL Image]: mel spectrograms
-            (float, List[np.ndarray]): sample rate and raw audios
+            List[PIL Image]: mel spectrograms (float, List[np.ndarray]): sample rate and raw audios
         """
 
         steps = steps or self.get_default_steps()
@@ -229,89 +236,78 @@ class AudioDiffusionPipeline(DiffusionPipeline):
         step_generator = step_generator or generator
         # For backwards compatibility
         if type(self.unet.sample_size) == int:
-            self.unet.sample_size = (self.unet.sample_size,
-                                     self.unet.sample_size)
+            self.unet.sample_size = (self.unet.sample_size, self.unet.sample_size)
+        input_dims = self.get_input_dims()
+        mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0])
         if noise is None:
             noise = torch.randn(
-                (batch_size, self.unet.in_channels, self.unet.sample_size[0],
-                 self.unet.sample_size[1]),
-                generator=generator)
+                (batch_size, self.unet.in_channels, self.unet.sample_size[0], self.unet.sample_size[1]),
+                generator=generator,
+                device=self.device,
+            )
         images = noise
         mask = None
 
         if audio_file is not None or raw_audio is not None:
             mel.load_audio(audio_file, raw_audio)
             input_image = mel.audio_slice_to_image(slice)
-            input_image = np.frombuffer(input_image.tobytes(),
-                                        dtype="uint8").reshape(
-                                            (input_image.height,
-                                             input_image.width))
-            input_image = ((input_image / 255) * 2 - 1)
-            input_images = torch.tensor(input_image[np.newaxis, :, :],
-                                        dtype=torch.float)
-
-            if hasattr(self, 'vqvae'):
-                input_images = self.vqvae.encode(
-                    torch.unsqueeze(input_images,
-                                    0).to(self.device)).latent_dist.sample(
-                                        generator=generator).cpu()[0]
+            input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape(
+                (input_image.height, input_image.width)
+            )
+            input_image = (input_image / 255) * 2 - 1
+            input_images = torch.tensor(input_image[np.newaxis, :, :], dtype=torch.float).to(self.device)
+
+            if hasattr(self, "vqvae"):
+                input_images = self.vqvae.encode(torch.unsqueeze(input_images, 0)).latent_dist.sample(
+                    generator=generator
+                )[0]
                 input_images = 0.18215 * input_images
 
             if start_step > 0:
-                images[0, 0] = self.scheduler.add_noise(
-                    input_images, noise,
-                    self.scheduler.timesteps[start_step - 1])
+                images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1])
 
-            pixels_per_second = (self.unet.sample_size[1] *
-                                 mel.get_sample_rate() / mel.x_res /
-                                 mel.hop_length)
+            pixels_per_second = self.unet.sample_size[1] * mel.get_sample_rate() / mel.x_res / mel.hop_length
             mask_start = int(mask_start_secs * pixels_per_second)
             mask_end = int(mask_end_secs * pixels_per_second)
-            mask = self.scheduler.add_noise(
-                input_images, noise,
-                torch.tensor(self.scheduler.timesteps[start_step:]))
+            mask = self.scheduler.add_noise(input_images, noise, torch.tensor(self.scheduler.timesteps[start_step:]))
 
-        images = images.to(self.device)
-        for step, t in enumerate(
-                self.progress_bar(self.scheduler.timesteps[start_step:])):
-            model_output = self.unet(images, t)['sample']
+        for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])):
+            model_output = self.unet(images, t)["sample"]
 
             if isinstance(self.scheduler, DDIMScheduler):
                 images = self.scheduler.step(
-                    model_output=model_output,
-                    timestep=t,
-                    sample=images,
-                    eta=eta,
-                    generator=step_generator)['prev_sample']
+                    model_output=model_output, timestep=t, sample=images, eta=eta, generator=step_generator
+                )["prev_sample"]
             else:
                 images = self.scheduler.step(
-                    model_output=model_output,
-                    timestep=t,
-                    sample=images,
-                    generator=step_generator)['prev_sample']
+                    model_output=model_output, timestep=t, sample=images, generator=step_generator
+                )["prev_sample"]
 
             if mask is not None:
                 if mask_start > 0:
-                    images[:, :, :, :mask_start] = mask[:,
-                                                        step, :, :mask_start]
+                    images[:, :, :, :mask_start] = mask[:, step, :, :mask_start]
                 if mask_end > 0:
                     images[:, :, :, -mask_end:] = mask[:, step, :, -mask_end:]
 
-        if hasattr(self, 'vqvae'):
+        if hasattr(self, "vqvae"):
             # 0.18215 was scaling factor used in training to ensure unit variance
             images = 1 / 0.18215 * images
-            images = self.vqvae.decode(images)['sample']
+            images = self.vqvae.decode(images)["sample"]
 
         images = (images / 2 + 0.5).clamp(0, 1)
         images = images.cpu().permute(0, 2, 3, 1).numpy()
         images = (images * 255).round().astype("uint8")
         images = list(
-            map(lambda _: Image.fromarray(_[:, :, 0]), images) if images.
-            shape[3] == 1 else map(
-                lambda _: Image.fromarray(_, mode='RGB').convert('L'), images))
+            map(lambda _: Image.fromarray(_[:, :, 0]), images)
+            if images.shape[3] == 1
+            else map(lambda _: Image.fromarray(_, mode="RGB").convert("L"), images)
+        )
 
         audios = list(map(lambda _: mel.image_to_audio(_), images))
-        return images, (mel.get_sample_rate(), audios)
+        if not return_dict:
+            return images, (mel.get_sample_rate(), audios)
+
+        return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images))
 
     @torch.no_grad()
     def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray:
@@ -328,35 +324,30 @@ class AudioDiffusionPipeline(DiffusionPipeline):
         # Only works with DDIM as this method is deterministic
         assert isinstance(self.scheduler, DDIMScheduler)
         self.scheduler.set_timesteps(steps)
-        sample = np.array([
-            np.frombuffer(image.tobytes(), dtype="uint8").reshape(
-                (1, image.height, image.width)) for image in images
-        ])
-        sample = ((sample / 255) * 2 - 1)
+        sample = np.array(
+            [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images]
+        )
+        sample = (sample / 255) * 2 - 1
         sample = torch.Tensor(sample).to(self.device)
 
-        for t in self.progress_bar(torch.flip(self.scheduler.timesteps,
-                                              (0, ))):
-            prev_timestep = (t - self.scheduler.num_train_timesteps //
-                             self.scheduler.num_inference_steps)
+        for t in self.progress_bar(torch.flip(self.scheduler.timesteps, (0,))):
+            prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps
             alpha_prod_t = self.scheduler.alphas_cumprod[t]
-            alpha_prod_t_prev = (self.scheduler.alphas_cumprod[prev_timestep]
-                                 if prev_timestep >= 0 else
-                                 self.scheduler.final_alpha_cumprod)
+            alpha_prod_t_prev = (
+                self.scheduler.alphas_cumprod[prev_timestep]
+                if prev_timestep >= 0
+                else self.scheduler.final_alpha_cumprod
+            )
             beta_prod_t = 1 - alpha_prod_t
-            model_output = self.unet(sample, t)['sample']
-            pred_sample_direction = (1 -
-                                     alpha_prod_t_prev)**(0.5) * model_output
-            sample = (sample -
-                      pred_sample_direction) * alpha_prod_t_prev**(-0.5)
-            sample = sample * alpha_prod_t**(0.5) + beta_prod_t**(
-                0.5) * model_output
+            model_output = self.unet(sample, t)["sample"]
+            pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output
+            sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5)
+            sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output
 
         return sample
 
     @staticmethod
-    def slerp(x0: torch.Tensor, x1: torch.Tensor,
-              alpha: float) -> torch.Tensor:
+    def slerp(x0: torch.Tensor, x1: torch.Tensor, alpha: float) -> torch.Tensor:
         """Spherical Linear intERPolation
 
         Args:
@@ -368,18 +359,14 @@ class AudioDiffusionPipeline(DiffusionPipeline):
             torch.Tensor: interpolated tensor
         """
 
-        theta = acos(
-            torch.dot(torch.flatten(x0), torch.flatten(x1)) / torch.norm(x0) /
-            torch.norm(x1))
-        return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(
-            alpha * theta) * x1 / sin(theta)
+        theta = acos(torch.dot(torch.flatten(x0), torch.flatten(x1)) / torch.norm(x0) / torch.norm(x1))
+        return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta)
 
 
 class LatentAudioDiffusionPipeline(AudioDiffusionPipeline):
-
-    def __init__(self, unet: UNet2DConditionModel,
-                 scheduler: Union[DDIMScheduler,
-                                  DDPMScheduler], vqvae: AutoencoderKL):
+    def __init__(
+        self, unet: UNet2DConditionModel, scheduler: Union[DDIMScheduler, DDPMScheduler], vqvae: AutoencoderKL
+    ):
         super().__init__(unet=unet, scheduler=scheduler)
         self.register_modules(vqvae=vqvae)
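For reference, `slerp` (reformatted above) interpolates between two noise tensors along the unit sphere rather than linearly. A minimal sketch of calling it standalone; the tensor shapes are illustrative only:

    import torch
    from audiodiffusion import AudioDiffusionPipeline

    # Two noise tensors of the shape the UNet expects (batch, channels, height, width).
    noise1 = torch.randn((1, 1, 256, 256))
    noise2 = torch.randn((1, 1, 256, 256))

    # slerp is a staticmethod, so no pipeline instance is needed;
    # alpha=0 returns noise1 and alpha=1 returns noise2.
    halfway = AudioDiffusionPipeline.slerp(noise1, noise2, alpha=0.5)

In the repo's notebooks the inputs typically come from `encode()`, which runs the DDIM process in reverse to recover the noise behind an existing spectrogram, and the interpolated tensor is then fed back through the pipeline's `noise` argument to render the in-between audio.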
audiodiffusion/mel.py CHANGED
@@ -1,22 +1,25 @@
 import warnings
 
-warnings.filterwarnings('ignore')
-
-import librosa
-import numpy as np
-from PIL import Image
+
+warnings.filterwarnings("ignore")
+
+import numpy as np  # noqa: E402
+
+import librosa  # noqa: E402
+from PIL import Image  # noqa: E402
 
 
-class Mel:
-
-    def __init__(self,
-                 x_res: int = 256,
-                 y_res: int = 256,
-                 sample_rate: int = 22050,
-                 n_fft: int = 2048,
-                 hop_length: int = 512,
-                 top_db: int = 80,
-                 n_iter: int = 32):
+class Mel:
+    def __init__(
+        self,
+        x_res: int = 256,
+        y_res: int = 256,
+        sample_rate: int = 22050,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        top_db: int = 80,
+        n_iter: int = 32,
+    ):
         """Class to convert audio to mel spectrograms and vice versa.
 
         Args:
@@ -28,17 +31,26 @@ class Mel:
            top_db (int): loudest in decibels
            n_iter (int): number of iterations for Griffin Linn mel inversion
        """
-        self.x_res = x_res
-        self.y_res = y_res
+        self.hop_length = hop_length
         self.sr = sample_rate
         self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.n_mels = self.y_res
-        self.slice_size = self.x_res * self.hop_length - 1
         self.top_db = top_db
         self.n_iter = n_iter
+        self.set_resolution(x_res, y_res)
         self.audio = None
 
+    def set_resolution(self, x_res: int, y_res: int):
+        """Set resolution.
+
+        Args:
+            x_res (int): x resolution of spectrogram (time)
+            y_res (int): y resolution of spectrogram (frequency bins)
+        """
+        self.x_res = x_res
+        self.y_res = y_res
+        self.n_mels = self.y_res
+        self.slice_size = self.x_res * self.hop_length - 1
+
     def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
         """Load audio.
 
@@ -53,10 +65,7 @@ class Mel:
 
         # Pad with silence if necessary.
         if len(self.audio) < self.x_res * self.hop_length:
-            self.audio = np.concatenate([
-                self.audio,
-                np.zeros((self.x_res * self.hop_length - len(self.audio), ))
-            ])
+            self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))])
 
     def get_number_of_slices(self) -> int:
         """Get number of slices in audio.
@@ -75,8 +84,7 @@ class Mel:
         Returns:
             np.ndarray: audio as numpy array
         """
-        return self.audio[self.slice_size * slice:self.slice_size *
-                          (slice + 1)]
+        return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]
 
     def get_sample_rate(self) -> int:
         """Get sample rate:
@@ -95,14 +103,11 @@ class Mel:
         Returns:
             PIL Image: grayscale image of x_res x y_res
         """
-        S = librosa.feature.melspectrogram(y=self.get_audio_slice(slice),
-                                           sr=self.sr,
-                                           n_fft=self.n_fft,
-                                           hop_length=self.hop_length,
-                                           n_mels=self.n_mels)
+        S = librosa.feature.melspectrogram(
+            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
+        )
         log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
-        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
-                    0.5).astype(np.uint8)
+        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
         image = Image.fromarray(bytedata)
         return image
 
@@ -115,14 +120,10 @@ class Mel:
         Returns:
             audio (np.ndarray): raw audio
         """
-        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape(
-            (image.height, image.width))
+        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
         log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
         S = librosa.db_to_power(log_S)
         audio = librosa.feature.inverse.mel_to_audio(
-            S,
-            sr=self.sr,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            n_iter=self.n_iter)
+            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
+        )
         return audio
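The refactor above moves the x_res/y_res bookkeeping into `set_resolution` so the pipeline can re-size a `Mel` instance to the UNet's sample size at call time. A small roundtrip sketch, assuming a local audio file; the path is a placeholder:

    from audiodiffusion.mel import Mel

    mel = Mel(x_res=256, y_res=256, hop_length=512)

    # The pipeline now does this itself, matching the Mel resolution to the UNet's sample size.
    mel.set_resolution(x_res=256, y_res=256)

    mel.load_audio("example.wav")          # placeholder path
    image = mel.audio_slice_to_image(0)    # slice 0 -> x_res by y_res grayscale PIL image
    audio = mel.image_to_audio(image)      # Griffin-Lim inversion back to a waveform
    print(mel.get_sample_rate(), mel.get_number_of_slices(), image.size, audio.shape)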
notebooks/test_model.ipynb CHANGED
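The cells below only swap plain `torch.Generator()` instances for device-bound ones and refresh the example seeds, which were presumably re-chosen for the new generators. A small self-contained illustration of why the device matters for reproducibility (pure PyTorch, nothing specific to this repo):

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    generator = torch.Generator(device=device)

    # Re-seeding reproduces the same noise, but only for generators on the same device type;
    # a CPU generator and a CUDA generator seeded identically produce different streams.
    generator.manual_seed(42)
    noise_a = torch.randn((1, 1, 256, 256), generator=generator, device=device)
    generator.manual_seed(42)
    noise_b = torch.randn((1, 1, 256, 256), generator=generator, device=device)
    assert torch.equal(noise_a, noise_b)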
@@ -61,7 +61,8 @@
     "outputs": [],
     "source": [
      "mel = Mel(x_res=256, y_res=256)\n",
-     "generator = torch.Generator()"
+     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+     "generator = torch.Generator(device=device)"
     ]
    },
    {
@@ -160,7 +161,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-     "seed = 16183389798189209330 #@param {type:\"integer\"}\n",
+     "seed = 2391504374279719 #@param {type:\"integer\"}\n",
      "generator.manual_seed(seed)\n",
      "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
      "    generator=generator)\n",
@@ -270,7 +271,7 @@
     "overlap_samples = overlap_secs * mel.get_sample_rate()\n",
     "slice_size = mel.x_res * mel.hop_length\n",
     "stride = slice_size - overlap_samples\n",
-    "generator = torch.Generator()\n",
+    "generator = torch.Generator(device=device)\n",
     "seed = generator.seed()\n",
     "print(f'Seed = {seed}')\n",
     "track = np.array([])\n",
@@ -315,7 +316,7 @@
     "    raw_audio=raw_audio,\n",
     "    mask_start_secs=1,\n",
     "    mask_end_secs=1,\n",
-    "    step_generator=torch.Generator())\n",
+    "    step_generator=torch.Generator(device=device))\n",
     "display(Audio(audio, rate=sample_rate))\n",
     "display(Audio(audio2, rate=sample_rate))"
     ]
@@ -526,7 +527,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "seed = 6015487092443227811 #@param {type:\"integer\"}\n",
+    "seed = 3412253600050855 #@param {type:\"integer\"}\n",
     "generator.manual_seed(seed)\n",
     "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
     "    generator=generator)\n",
@@ -541,7 +542,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "seed2 = 5623685468252603494 #@param {type:\"integer\"}\n",
+    "seed2 = 7016114633369557 #@param {type:\"integer\"}\n",
     "generator.manual_seed(seed2)\n",
     "image2, (sample_rate, audio2) = audio_diffusion.generate_spectrogram_and_audio(\n",
     "    generator=generator)\n",