migrate to diffusers!
- README.md +11 -1
- audiodiffusion/__init__.py +7 -86
- notebooks/test_model.ipynb +0 -0
- requirements.txt +1 -1
- scripts/audio_to_images.py +1 -2
- scripts/train_unconditional.py +2 -1
- scripts/train_vae.py +2 -4
- setup.cfg +1 -1
README.md
CHANGED
@@ -23,6 +23,8 @@ Go to https://soundcloud.com/teticio2/sets/audio-diffusion-loops for more exampl
 ---
 #### Updates
 
+**5/12/2022** 🤗 Exciting news! `AudioDiffusionPipeline` has been migrated to the Hugging Face `diffusers` package so that it is even easier for others to use and contribute.
+
 **2/12/2022**. Added Mel to pipeline and updated the pretrained models to save Mel config (they are now no longer compatible with previous versions of this repo). It is relatively straightforward to migrate previously trained models to the new format (see https://huggingface.co/teticio/audio-diffusion-256).
 
 **7/11/2022**. Added pre-trained latent audio diffusion models [teticio/latent-audio-diffusion-256](https://huggingface.co/teticio/latent-audio-diffusion-256) and [teticio/latent-audio-diffusion-ddim-256](https://huggingface.co/teticio/latent-audio-diffusion-ddim-256). You can use the pre-trained VAE to train your own latent diffusion models on a different set of audio files.
@@ -62,12 +64,20 @@ You can play around with some pre-trained models on [Google Colab](https://colab
 
 ## Generate Mel spectrogram dataset from directory of audio files
 
-#### Install
+#### Install from GitHub (includes training scripts)
 
 ```bash
+git clone https://github.com/teticio/audio-diffusion.git
+cd audio-diffusion
 pip install .
 ```
 
+#### Install from PyPI
+
+```bash
+pip install audiodiffusion
+```
+
 #### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results
 
 ```bash
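For reference, generating audio with one of the pre-trained models after this migration looks roughly like the sketch below. This is a minimal sketch assuming `diffusers>=0.10.0`; the model id `teticio/audio-diffusion-256` comes from the README, while the output attribute names and `get_sample_rate` call are assumptions about the `diffusers` pipeline and should be checked against the installed version.

```python
# Minimal usage sketch of the migrated pipeline (assumptions noted below).
import torch
from diffusers import AudioDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)

output = pipe()              # denoise random noise into a mel spectrogram
image = output.images[0]     # PIL image of the generated spectrogram (attribute name assumed)
audio = output.audios[0, 0]  # reconstructed waveform as a numpy array (attribute name assumed)
sample_rate = pipe.mel.get_sample_rate()  # method name assumed from the Mel helper
```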
audiodiffusion/__init__.py
CHANGED
@@ -1,13 +1,13 @@
-from typing import Iterable, Tuple
+from typing import Iterable, Tuple
 
 import torch
 import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 from librosa.beat import beat_track
-
+from diffusers import AudioDiffusionPipeline
 
-VERSION = "1.3.
+VERSION = "1.3.2"
 
 
 class AudioDiffusion:
@@ -131,6 +131,7 @@ class AudioDiffusion:
         return None
 
 
+'''
 # This code will be migrated to diffusers shortly
 
 #-----------------------------------------------------------------------------#
@@ -140,6 +141,7 @@ import warnings
 from typing import Any, Dict, Optional, Union
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
 
 
 warnings.filterwarnings("ignore")
@@ -150,7 +152,7 @@ import librosa  # noqa: E402
 from PIL import Image  # noqa: E402
 
 
-class Mel(ConfigMixin):
+class Mel(ConfigMixin, SchedulerMixin):
     """
     Parameters:
         x_res (`int`): x resolution of spectrogram (time)
@@ -272,88 +274,6 @@ class Mel(ConfigMixin):
         )
         return audio
 
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Dict[str, Any] = None,
-        subfolder: Optional[str] = None,
-        return_unused_kwargs=False,
-        **kwargs,
-    ):
-        r"""
-        Instantiate a Mel class from a pre-defined JSON configuration file inside a directory or Hub repo.
-
-        Parameters:
-            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-
-                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
-                      organization name, like `google/ddpm-celebahq-256`.
-                    - A path to a *directory* containing the mel configurations saved using [`~Mel.save_pretrained`],
-                      e.g., `./my_model_directory/`.
-            subfolder (`str`, *optional*):
-                In case the relevant files are located inside a subfolder of the model repo (either remote in
-                huggingface.co or downloaded locally), you can specify the folder name here.
-            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
-                Whether kwargs that are not consumed by the Python class should be returned or not.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory in which a downloaded pretrained model configuration should be cached if the
-                standard cache should not be used.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
-                file exists.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(`bool`, *optional*, defaults to `False`):
-                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (i.e., do not try to download the model).
-            use_auth_token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `transformers-cli login` (stored in `~/.huggingface`).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-
-        <Tip>
-
-        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-        models](https://huggingface.co/docs/hub/models-gated#gated-models).
-
-        </Tip>
-
-        <Tip>
-
-        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
-        use this method in a firewalled environment.
-
-        </Tip>
-
-        """
-        config, kwargs = cls.load_config(
-            pretrained_model_name_or_path=pretrained_model_name_or_path,
-            subfolder=subfolder,
-            return_unused_kwargs=True,
-            **kwargs,
-        )
-        return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)
-
-    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
-        """
-        Save a mel configuration object to the directory `save_directory`, so that it can be re-loaded using the
-        [`~Mel.from_pretrained`] class method.
-
-        Args:
-            save_directory (`str` or `os.PathLike`):
-                Directory where the configuration JSON file will be saved (will be created if it does not exist).
-        """
-        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
-
 #-----------------------------------------------------------------------------#
 
 from math import acos, sin
@@ -603,3 +523,4 @@ diffusers.AudioDiffusionPipeline = AudioDiffusionPipeline
 setattr(diffusers, AudioDiffusionPipeline.__name__, AudioDiffusionPipeline)
 diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion'] = {}
 diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion']['Mel'] = ["save_pretrained", "from_pretrained"]
+'''
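With the old pipeline code fenced off in a `'''` block, the remaining `AudioDiffusion` class is a thin wrapper over the imported `diffusers.AudioDiffusionPipeline`. A rough sketch of that delegation pattern is below; the class and method names are illustrative, not copied from the file.

```python
# Illustrative sketch only: names below mirror the wrapper idea, they are assumptions.
from diffusers import AudioDiffusionPipeline


class AudioDiffusionWrapper:
    def __init__(self, model_id: str = "teticio/audio-diffusion-256"):
        # load the full pipeline (UNet, scheduler and Mel config) from the Hub
        self.pipe = AudioDiffusionPipeline.from_pretrained(model_id)

    def generate_spectrogram_and_audio(self, generator=None):
        # run the denoising loop and return the spectrogram image plus waveform
        output = self.pipe(generator=generator)
        return output.images[0], (self.pipe.mel.get_sample_rate(), output.audios[0, 0])
```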
notebooks/test_model.ipynb
CHANGED
The diff for this file is too large to render.
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
 torch
 numpy
 Pillow
-diffusers>=0.
+diffusers>=0.10.0
 librosa
 datasets
 gradio
scripts/audio_to_images.py
CHANGED
@@ -7,10 +7,9 @@ import argparse
 import numpy as np
 import pandas as pd
 from tqdm.auto import tqdm
+from diffusers.pipelines.audio_diffusion import Mel
 from datasets import Dataset, DatasetDict, Features, Image, Value
 
-from audiodiffusion import Mel
-
 logging.basicConfig(level=logging.WARN)
 logger = logging.getLogger('audio_to_images')
 
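The script now takes `Mel` from `diffusers.pipelines.audio_diffusion` rather than from the local package. In outline, slicing an audio file into mel spectrogram images works like the sketch below; the resolution, hop length and file names are illustrative, and the method names are assumed from the Mel helper ported to `diffusers`.

```python
# Sketch of the Mel usage audio_to_images.py relies on, assuming diffusers>=0.10.0.
from diffusers.pipelines.audio_diffusion import Mel

mel = Mel(x_res=256, y_res=256, hop_length=512)   # example parameters, not script defaults
mel.load_audio("track.wav")                       # hypothetical input file
for i in range(mel.get_number_of_slices()):
    image = mel.audio_slice_to_image(i)           # PIL image, one per audio slice
    image.save(f"slice_{i}.png")                  # hypothetical output path
```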
scripts/train_unconditional.py
CHANGED
@@ -11,11 +11,13 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_from_disk, load_dataset
 from diffusers import (
+    AudioDiffusionPipeline,
     DDPMScheduler,
     UNet2DModel,
     DDIMScheduler,
     AutoencoderKL,
 )
+from diffusers.pipelines.audio_diffusion import Mel
 from huggingface_hub import HfFolder, Repository, whoami
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
@@ -27,7 +29,6 @@ from torchvision.transforms import (
 import numpy as np
 from tqdm.auto import tqdm
 from librosa.util import normalize
-from audiodiffusion import AudioDiffusionPipeline, Mel
 
 logger = get_logger(__name__)
 
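The training script now assembles its pipeline from the `diffusers` classes imported above. A hedged sketch of how those pieces fit together when saving a checkpoint follows; the keyword names track the `AudioDiffusionPipeline` constructor, while the UNet configuration, scheduler settings and output directory are illustrative.

```python
# Rough sketch of assembling and saving the pipeline from the classes imported
# in train_unconditional.py (argument values are illustrative assumptions).
from diffusers import AudioDiffusionPipeline, DDPMScheduler, UNet2DModel
from diffusers.pipelines.audio_diffusion import Mel

pipeline = AudioDiffusionPipeline(
    unet=UNet2DModel(sample_size=64, in_channels=1, out_channels=1),
    scheduler=DDPMScheduler(num_train_timesteps=1000),
    mel=Mel(x_res=64, y_res=64, hop_length=1024),  # hop_length=1024 as suggested in the README
    vqvae=None,  # only set when training a latent (VAE-based) model
)
pipeline.save_pretrained("audio-diffusion-64")  # hypothetical output directory
```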
scripts/train_vae.py
CHANGED
@@ -14,13 +14,11 @@ from ldm.util import instantiate_from_config
 from pytorch_lightning.trainer import Trainer
 from torch.utils.data import DataLoader, Dataset
 from datasets import load_from_disk, load_dataset
+from diffusers.pipelines.audio_diffusion import Mel
+from audiodiffusion.utils import convert_ldm_to_hf_vae
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
-#from diffusers import Mel
-from audiodiffusion import Mel
-from audiodiffusion.utils import convert_ldm_to_hf_vae
-
 
 class AudioDiffusion(Dataset):
 
setup.cfg
CHANGED
@@ -15,6 +15,6 @@ install_requires =
     torch
     numpy
     Pillow
-    diffusers>=0.
+    diffusers>=0.10.0
     librosa
     datasets