migrate to diffusers!
- README.md +11 -1
- audiodiffusion/__init__.py +7 -86
- notebooks/test_model.ipynb +0 -0
- requirements.txt +1 -1
- scripts/audio_to_images.py +1 -2
- scripts/train_unconditional.py +2 -1
- scripts/train_vae.py +2 -4
- setup.cfg +1 -1
README.md
CHANGED
@@ -23,6 +23,8 @@ Go to https://soundcloud.com/teticio2/sets/audio-diffusion-loops for more exampl
 ---
 #### Updates
 
+**5/12/2022** 🤗 Exciting news! `AudioDiffusionPipeline` has been migrated to the Hugging Face `diffusers` package so that it is even easier for others to use and contribute.
+
 **2/12/2022**. Added Mel to pipeline and updated the pretrained models to save Mel config (they are now no longer compatible with previous versions of this repo). It is relatively straightforward to migrate previously trained models to the new format (see https://huggingface.co/teticio/audio-diffusion-256).
 
 **7/11/2022**. Added pre-trained latent audio diffusion models [teticio/latent-audio-diffusion-256](https://huggingface.co/teticio/latent-audio-diffusion-256) and [teticio/latent-audio-diffusion-ddim-256](https://huggingface.co/teticio/latent-audio-diffusion-ddim-256). You can use the pre-trained VAE to train your own latent diffusion models on a different set of audio files.
@@ -62,12 +64,20 @@ You can play around with some pre-trained models on [Google Colab](https://colab
 
 ## Generate Mel spectrogram dataset from directory of audio files
 
-#### Install
+#### Install from GitHub (includes training scripts)
 
 ```bash
+git clone https://github.com/teticio/audio-diffusion.git
+cd audio-diffusion
 pip install .
 ```
 
+#### Install from PyPI
+
+```bash
+pip install audiodiffusion
+```
+
 #### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results
 
 ```bash
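For reference, generating audio with one of the pre-trained models after this migration looks roughly like the sketch below. This is a minimal sketch assuming `diffusers>=0.10.0`; the model id `teticio/audio-diffusion-256` comes from the README, while the output attribute names and `get_sample_rate` call are assumptions about the `diffusers` pipeline and should be checked against the installed version.

```python
# Minimal usage sketch of the migrated pipeline (assumptions noted below).
import torch
from diffusers import AudioDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)

output = pipe()              # denoise random noise into a mel spectrogram
image = output.images[0]     # PIL image of the generated spectrogram (attribute name assumed)
audio = output.audios[0, 0]  # reconstructed waveform as a numpy array (attribute name assumed)
sample_rate = pipe.mel.get_sample_rate()  # method name assumed from the Mel helper
```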
audiodiffusion/__init__.py
CHANGED
@@ -1,13 +1,13 @@
-from typing import Iterable, Tuple
+from typing import Iterable, Tuple
 
 import torch
 import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 from librosa.beat import beat_track
-
+from diffusers import AudioDiffusionPipeline
 
-VERSION = "1.3.
+VERSION = "1.3.2"
 
 
 class AudioDiffusion:
@@ -131,6 +131,7 @@ class AudioDiffusion:
         return None
 
 
+'''
 # This code will be migrated to diffusers shortly
 
 #-----------------------------------------------------------------------------#
@@ -140,6 +141,7 @@ import warnings
 from typing import Any, Dict, Optional, Union
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
 
 
 warnings.filterwarnings("ignore")
@@ -150,7 +152,7 @@ import librosa  # noqa: E402
 from PIL import Image  # noqa: E402
 
 
-class Mel(ConfigMixin):
+class Mel(ConfigMixin, SchedulerMixin):
     """
     Parameters:
         x_res (`int`): x resolution of spectrogram (time)
@@ -272,88 +274,6 @@ class Mel(ConfigMixin):
         )
         return audio
 
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Dict[str, Any] = None,
-        subfolder: Optional[str] = None,
-        return_unused_kwargs=False,
-        **kwargs,
-    ):
-        r"""
-        Instantiate a Mel class from a pre-defined JSON configuration file inside a directory or Hub repo.
-
-        Parameters:
-            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-
-                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
-                      organization name, like `google/ddpm-celebahq-256`.
-                    - A path to a *directory* containing the mel configurations saved using [`~Mel.save_pretrained`],
-                      e.g., `./my_model_directory/`.
-            subfolder (`str`, *optional*):
-                In case the relevant files are located inside a subfolder of the model repo (either remote in
-                huggingface.co or downloaded locally), you can specify the folder name here.
-            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
-                Whether kwargs that are not consumed by the Python class should be returned or not.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory in which a downloaded pretrained model configuration should be cached if the
-                standard cache should not be used.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
-                file exists.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(`bool`, *optional*, defaults to `False`):
-                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (i.e., do not try to download the model).
-            use_auth_token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `transformers-cli login` (stored in `~/.huggingface`).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-
-        <Tip>
-
-        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-        models](https://huggingface.co/docs/hub/models-gated#gated-models).
-
-        </Tip>
-
-        <Tip>
-
-        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
-        use this method in a firewalled environment.
-
-        </Tip>
-
-        """
-        config, kwargs = cls.load_config(
-            pretrained_model_name_or_path=pretrained_model_name_or_path,
-            subfolder=subfolder,
-            return_unused_kwargs=True,
-            **kwargs,
-        )
-        return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)
-
-    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
-        """
-        Save a mel configuration object to the directory `save_directory`, so that it can be re-loaded using the
-        [`~Mel.from_pretrained`] class method.
-
-        Args:
-            save_directory (`str` or `os.PathLike`):
-                Directory where the configuration JSON file will be saved (will be created if it does not exist).
-        """
-        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
-
 #-----------------------------------------------------------------------------#
 
 from math import acos, sin
@@ -603,3 +523,4 @@ diffusers.AudioDiffusionPipeline = AudioDiffusionPipeline
 setattr(diffusers, AudioDiffusionPipeline.__name__, AudioDiffusionPipeline)
 diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion'] = {}
 diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion']['Mel'] = ["save_pretrained", "from_pretrained"]
+'''
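With the old pipeline code fenced off in a `'''` block, the remaining `AudioDiffusion` class is a thin wrapper over the imported `diffusers.AudioDiffusionPipeline`. A rough sketch of that delegation pattern is below; the class and method names are illustrative, not copied from the file.

```python
# Illustrative sketch only: names below mirror the wrapper idea, they are assumptions.
from diffusers import AudioDiffusionPipeline


class AudioDiffusionWrapper:
    def __init__(self, model_id: str = "teticio/audio-diffusion-256"):
        # load the full pipeline (UNet, scheduler and Mel config) from the Hub
        self.pipe = AudioDiffusionPipeline.from_pretrained(model_id)

    def generate_spectrogram_and_audio(self, generator=None):
        # run the denoising loop and return the spectrogram image plus waveform
        output = self.pipe(generator=generator)
        return output.images[0], (self.pipe.mel.get_sample_rate(), output.audios[0, 0])
```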
notebooks/test_model.ipynb
CHANGED
The diff for this file is too large to render.
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
 torch
 numpy
 Pillow
-diffusers>=0.
+diffusers>=0.10.0
 librosa
 datasets
 gradio
scripts/audio_to_images.py
CHANGED
@@ -7,10 +7,9 @@ import argparse
 import numpy as np
 import pandas as pd
 from tqdm.auto import tqdm
+from diffusers.pipelines.audio_diffusion import Mel
 from datasets import Dataset, DatasetDict, Features, Image, Value
 
-from audiodiffusion import Mel
-
 logging.basicConfig(level=logging.WARN)
 logger = logging.getLogger('audio_to_images')
 
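The script now takes `Mel` from `diffusers.pipelines.audio_diffusion` rather than from the local package. In outline, slicing an audio file into mel spectrogram images works like the sketch below; the resolution, hop length and file names are illustrative, and the method names are assumed from the Mel helper ported to `diffusers`.

```python
# Sketch of the Mel usage audio_to_images.py relies on, assuming diffusers>=0.10.0.
from diffusers.pipelines.audio_diffusion import Mel

mel = Mel(x_res=256, y_res=256, hop_length=512)   # example parameters, not script defaults
mel.load_audio("track.wav")                       # hypothetical input file
for i in range(mel.get_number_of_slices()):
    image = mel.audio_slice_to_image(i)           # PIL image, one per audio slice
    image.save(f"slice_{i}.png")                  # hypothetical output path
```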
scripts/train_unconditional.py
CHANGED
@@ -11,11 +11,13 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_from_disk, load_dataset
 from diffusers import (
+    AudioDiffusionPipeline,
     DDPMScheduler,
     UNet2DModel,
     DDIMScheduler,
     AutoencoderKL,
 )
+from diffusers.pipelines.audio_diffusion import Mel
 from huggingface_hub import HfFolder, Repository, whoami
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
@@ -27,7 +29,6 @@ from torchvision.transforms import (
 import numpy as np
 from tqdm.auto import tqdm
 from librosa.util import normalize
-from audiodiffusion import AudioDiffusionPipeline, Mel
 
 logger = get_logger(__name__)
 
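The training script now assembles its pipeline from the `diffusers` classes imported above. A hedged sketch of how those pieces fit together when saving a checkpoint follows; the keyword names track the `AudioDiffusionPipeline` constructor, while the UNet configuration, scheduler settings and output directory are illustrative.

```python
# Rough sketch of assembling and saving the pipeline from the classes imported
# in train_unconditional.py (argument values are illustrative assumptions).
from diffusers import AudioDiffusionPipeline, DDPMScheduler, UNet2DModel
from diffusers.pipelines.audio_diffusion import Mel

pipeline = AudioDiffusionPipeline(
    unet=UNet2DModel(sample_size=64, in_channels=1, out_channels=1),
    scheduler=DDPMScheduler(num_train_timesteps=1000),
    mel=Mel(x_res=64, y_res=64, hop_length=1024),  # hop_length=1024 as suggested in the README
    vqvae=None,  # only set when training a latent (VAE-based) model
)
pipeline.save_pretrained("audio-diffusion-64")  # hypothetical output directory
```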
scripts/train_vae.py
CHANGED
@@ -14,13 +14,11 @@ from ldm.util import instantiate_from_config
 from pytorch_lightning.trainer import Trainer
 from torch.utils.data import DataLoader, Dataset
 from datasets import load_from_disk, load_dataset
+from diffusers.pipelines.audio_diffusion import Mel
+from audiodiffusion.utils import convert_ldm_to_hf_vae
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
-#from diffusers import Mel
-from audiodiffusion import Mel
-from audiodiffusion.utils import convert_ldm_to_hf_vae
-
 
 class AudioDiffusion(Dataset):
 
setup.cfg
CHANGED
@@ -15,6 +15,6 @@ install_requires =
     torch
     numpy
     Pillow
-    diffusers>=0.
+    diffusers>=0.10.0
     librosa
     datasets