File size: 2,002 Bytes
d533c9c
c17b696
 
 
d533c9c
c17b696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d533c9c
 
 
b7f49a5
 
 
 
 
 
 
 
 
 
d533c9c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import numpy as np
from PIL import Image
from torch import cuda
from diffusers import DDPMPipeline
from librosa.beat import beat_track

from .mel import Mel

# Version string exposed by this module (presumably the package release
# version; it is not referenced elsewhere in the visible code).
VERSION = "1.0.1"


class AudioDiffusion:
    """Generate audio by sampling mel spectrogram images from a DDPM pipeline.

    A pretrained ``DDPMPipeline`` produces an image and a ``Mel`` instance
    converts that image to raw audio.
    """

    def __init__(self,
                 model_id="teticio/audio-diffusion-256",
                 resolution=256,
                 cuda=cuda.is_available()):
        """Class for generating audio using Denoising Diffusion Probabilistic Models.

        Args:
            model_id (String): name of model (local directory or Hugging Face Hub)
            resolution (int): size of square mel spectrogram in pixels
            cuda (bool): use CUDA? The default is computed once, when the
                class is defined, from ``torch.cuda.is_available()``.
        """
        # NOTE: inside this method ``cuda`` is the boolean argument; it
        # shadows the ``torch.cuda`` module imported at the top of the file.
        self.mel = Mel(x_res=resolution, y_res=resolution)
        self.model_id = model_id
        self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
        if cuda:
            self.ddpm.to("cuda")

    def generate_spectrogram_and_audio(self):
        """Generate random mel spectrogram and convert to audio.

        Returns:
            PIL Image: mel spectrogram
            (sample rate, array): the value of ``self.mel.get_sample_rate()``
                and the raw audio produced by ``self.mel.image_to_audio``
        """
        images = self.ddpm(output_type="numpy")["sample"]
        # Scale to 8-bit pixel values (presumably the pipeline yields floats
        # in [0, 1] -- confirm) and move the last axis to position 1
        # (presumably channels-last -> channels-first).
        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
        # Only the first image of the batch, and its first plane, is used.
        image = Image.fromarray(images[0][0])
        audio = self.mel.image_to_audio(image)
        return image, (self.mel.get_sample_rate(), audio)

    @staticmethod
    def loop_it(audio, sample_rate, loops=12):
        """Loop audio

        Beats are detected with ``librosa.beat.beat_track``. If more than 8
        beats are found, the audio between the first beat and the ninth beat
        is repeated; otherwise, if more than 4 beats are found, the audio
        between the first beat and the fifth beat is repeated.

        Args:
            audio (array): audio as numpy array
            sample_rate (int): sample rate of audio
            loops (int): number of times to loop

        Returns:
            array: the selected segment tiled ``loops`` times, or None if 4
                or fewer beats were detected. The sample rate is NOT
                returned; callers keep using the ``sample_rate`` they
                passed in.
        """
        # units='samples' makes the beat positions usable directly as
        # indices into ``audio``; the estimated tempo is not needed.
        _, beats = beat_track(y=audio, sr=sample_rate, units='samples')
        # Prefer the longer (8-beat) segment, fall back to the 4-beat one.
        for n_beats in (8, 4):
            if len(beats) > n_beats:
                return np.tile(audio[beats[0]:beats[n_beats]], loops)
        return None