teticio committed
Commit 825c8bf
1 parent: a1960dc

update README

README.md CHANGED
@@ -1,16 +1,76 @@
  # audio-diffusion
+
+ ### Apply [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) using the new Hugging Face [diffusers](https://github.com/huggingface/diffusers) package to synthesize music instead of images.
+
+ ---
+
+ ![mel spectrogram](mel.png)
+
+ Audio can be represented as images by transforming it to a [mel spectrogram](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum), such as the one shown above. The class `Mel` in `mel.py` can convert a slice of audio into a mel spectrogram of `x_res` x `y_res` and vice versa. The higher the resolution, the less audio information is lost. You can see how this works in the `test-mel.ipynb` notebook.
+
+ A DDPM model is trained on a set of mel spectrograms that have been generated from a directory of audio files. It is then used to synthesize similar mel spectrograms, which are then converted back into audio. See the `test-model.ipynb` notebook for an example.
+
+ ## Generate mel spectrogram dataset from directory of audio files
+ ### Training can be run with mel spectrograms of resolution 64x64 on a single commercial-grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results.
+
  ```bash
- accelerate config
+ python src/audio_to_images.py \
+ --resolution 64 \
+ --hop_length 1024 \
+ --input_dir path-to-audio-files \
+ --output_dir data-test
  ```
+
+ ### Generate a dataset of 256x256 mel spectrograms and push it to the hub (you will need to be authenticated with `huggingface-cli login`).
+
  ```bash
  python src/audio_to_images.py \
  --resolution 256 \
  --input_dir path-to-audio-files \
- --output_dir data-256
+ --output_dir data-256 \
+ --push_to_hub teticio/audio-diffusion-256
+ ```
+ ## Train model
+ ### Run training on local machine.
+
+ ```bash
+ accelerate launch --config_file accelerate_local.yaml \
+ src/train_unconditional.py \
+ --dataset_name data-64 \
+ --resolution 64 \
+ --hop_length 1024 \
+ --output_dir ddpm-ema-audio-64 \
+ --train_batch_size 16 \
+ --num_epochs 100 \
+ --gradient_accumulation_steps 1 \
+ --learning_rate 1e-4 \
+ --lr_warmup_steps 500 \
+ --mixed_precision no
+ ```
+
+ ### Run training on local machine with a `train_batch_size` of 1 and `gradient_accumulation_steps` of 16 to compensate, so that the 256x256 resolution model fits on a commercial-grade GPU.
+
+ ```bash
+ accelerate launch --config_file accelerate_local.yaml \
+ src/train_unconditional.py \
+ --dataset_name teticio/audio-diffusion-256 \
+ --resolution 256 \
+ --output_dir ddpm-ema-audio-256 \
+ --num_epochs 100 \
+ --train_batch_size 1 \
+ --eval_batch_size 1 \
+ --gradient_accumulation_steps 16 \
+ --learning_rate 1e-4 \
+ --lr_warmup_steps 500 \
+ --mixed_precision no
  ```
+
+ ### Run training on SageMaker.
+
  ```bash
- accelerate launch src/train_unconditional.py \
- --dataset_name data-256 \
+ accelerate launch --config_file accelerate_sagemaker.yaml \
+ src/train_unconditional.py \
+ --dataset_name teticio/audio-diffusion-256 \
  --resolution 256 \
  --output_dir ddpm-ema-audio-256 \
  --train_batch_size 16 \
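
The `Mel` round trip the new README describes (and which `notebooks/test-mel.ipynb` below exercises) could be driven roughly as follows. This is a minimal sketch only: the constructor arguments and the method names `load_audio`, `audio_slice_to_image` and `image_to_audio` are assumptions drawn from the README's description, not the verified API of `mel.py` at this commit.

```python
# Illustrative sketch; method names and defaults are assumed, not taken from mel.py.
import sys

sys.path.append("src")
from mel import Mel  # the class described in the README

# x_res/y_res set the spectrogram image size; higher resolution loses less audio information.
mel = Mel(x_res=256, y_res=256, hop_length=512)

mel.load_audio("path-to-audio-files/example.wav")  # hypothetical input file
image = mel.audio_slice_to_image(0)                # first slice -> mel spectrogram image
audio = mel.image_to_audio(image)                  # and back again to a waveform
```

As the README notes, the higher the resolution the less audio information is lost, at the cost of a larger image for the diffusion model to learn.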
mel.png ADDED
notebooks/test-mel.ipynb CHANGED
@@ -49,7 +49,7 @@
  "id": "b2178c3f",
  "metadata": {},
  "source": [
- "### Transform slice of audio to Mel spectrogram"
+ "### Transform slice of audio to mel spectrogram"
  ]
  },
  {
@@ -120,7 +120,7 @@
  "id": "fe112fef",
  "metadata": {},
  "source": [
- "### Transform Mel spectrogram back to audio"
+ "### Transform mel spectrogram back to audio"
  ]
  },
  {
notebooks/test-model.ipynb CHANGED
@@ -42,7 +42,7 @@
  "id": "011fb5a1",
  "metadata": {},
  "source": [
- "### Run model inference to generate Mel spectrogram"
+ "### Run model inference to generate mel spectrogram"
  ]
  },
  {
@@ -76,7 +76,7 @@
  {
  "cell_type": "code",
  "execution_count": 6,
- "id": "7c24cc95",
+ "id": "75db4b7c",
  "metadata": {},
  "outputs": [
  {
@@ -101,7 +101,7 @@
  "id": "7230c280",
  "metadata": {},
  "source": [
- "### Transform Mel spectrogram to audio"
+ "### Transform mel spectrogram to audio"
  ]
  },
  {
@@ -155,7 +155,7 @@
  {
  "cell_type": "code",
  "execution_count": 8,
- "id": "af947751",
+ "id": "b9023846",
  "metadata": {},
  "outputs": [
  {
@@ -208,7 +208,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "2ab3d0d2",
+ "id": "acf96aba",
  "metadata": {},
  "outputs": [],
  "source": []
src/train_unconditional.py CHANGED
@@ -1,3 +1,5 @@
+ # based on https://github.com/huggingface/diffusers/blob/main/examples/train_unconditional.py
+
  import argparse
  import os

@@ -30,7 +32,8 @@ logger = get_logger(__name__)


  def main(args):
-     logging_dir = os.path.join(args.output_dir, args.logging_dir)
+     output_dir = os.environ.get("SM_MODEL_DIR", None) or args.output_dir
+     logging_dir = os.path.join(output_dir, args.logging_dir)
      accelerator = Accelerator(
          mixed_precision=args.mixed_precision,
          log_with="tensorboard",
@@ -122,7 +125,7 @@ def main(args):
      )

      ema_model = EMAModel(
-         model,
+         getattr(model, "module", model),
          inv_gamma=args.ema_inv_gamma,
          power=args.ema_power,
          max_value=args.ema_max_decay,
@@ -234,7 +237,7 @@ def main(args):
                          blocking=False,
                      )
                  else:
-                     pipeline.save_pretrained(args.output_dir)
+                     pipeline.save_pretrained(output_dir)
          accelerator.wait_for_everyone()

      accelerator.end_training()
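
Two of the changes above deserve a note: `SM_MODEL_DIR` is the environment variable a SageMaker training job sets to the directory (typically `/opt/ml/model`) whose contents are saved as the model artifact, and `accelerate` may hand back the model wrapped in `DistributedDataParallel`, whose underlying network lives in `.module`; the EMA is now built on that unwrapped module. A standalone illustration of both patterns (not the repository's code):

```python
import os

import torch

# Write to SageMaker's model directory when running as a SageMaker training job
# (SM_MODEL_DIR is typically /opt/ml/model), otherwise use the CLI argument's value.
output_dir = os.environ.get("SM_MODEL_DIR", None) or "ddpm-ema-audio-64"


def unwrap(model: torch.nn.Module) -> torch.nn.Module:
    # DistributedDataParallel exposes the original network as .module;
    # an unwrapped model is returned unchanged.
    return getattr(model, "module", model)
```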