diff --git "a/notebooks/test-model.ipynb" "b/notebooks/test-model.ipynb" deleted file mode 100644--- "a/notebooks/test-model.ipynb" +++ /dev/null @@ -1,440 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0fd939b0", - "metadata": {}, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "6c7800a6", - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " # are we running on Google Colab?\n", - " import google.colab\n", - " !git clone -q https://github.com/teticio/audio-diffusion.git\n", - " %cd audio-diffusion\n", - " !pip install -q -r requirements.txt\n", - "except:\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b447e2c4", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c2fc0e7a", - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "from datasets import load_dataset\n", - "from IPython.display import Audio\n", - "from audiodiffusion.mel import Mel\n", - "from audiodiffusion import AudioDiffusion" - ] - }, - { - "cell_type": "markdown", - "id": "011fb5a1", - "metadata": {}, - "source": [ - "### Run model inference to generate mel spectrogram and audios" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a3d45c36", - "metadata": {}, - "outputs": [], - "source": [ - "audio_diffusion = AudioDiffusion(model_id=\"teticio/audio-diffusion-256\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "b809fed5", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "71abbf1ea35d4180bf97bdc94dd66ef1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1000 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ad5c8b3eaad240a6869ee788740d75e9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1000 [00:00\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m----> 2\u001b[0m image, (sample_rate, audio) \u001b[38;5;241m=\u001b[39m \u001b[43maudio_diffusion\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_spectrogram_and_audio\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m display(image)\n\u001b[1;32m 4\u001b[0m display(Audio(audio, rate\u001b[38;5;241m=\u001b[39msample_rate))\n", - "File \u001b[0;32m~/ML/huggingface/audio-diffusion/audiodiffusion/__init__.py:36\u001b[0m, in \u001b[0;36mAudioDiffusion.generate_spectrogram_and_audio\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgenerate_spectrogram_and_audio\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 30\u001b[0m \u001b[38;5;124;03m\"\"\"Generate random mel spectrogram and convert to audio.\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \n\u001b[1;32m 32\u001b[0m \u001b[38;5;124;03m Returns:\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;124;03m PIL Image: mel spectrogram\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;124;03m (Float, Array): sample rate and raw audio\u001b[39;00m\n\u001b[1;32m 
-      "KeyboardInterrupt                         Traceback (most recent call last)",
-      "Input In [13], in <cell line: 1>()",
-      "      1 while True:",
-      "----> 2     image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()",
-      "File ~/ML/huggingface/audio-diffusion/audiodiffusion/__init__.py:36, in AudioDiffusion.generate_spectrogram_and_audio(self)",
-      "---> 36 images = self.ddpm(output_type=\"numpy\")[\"sample\"]",
-      "File ~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)",
-      "---> 27 return func(*args, **kwargs)",
-      "File ~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/diffusers/pipelines/ddpm/pipeline_ddpm.py:58, in DDPMPipeline.__call__(self, batch_size, generator, output_type, **kwargs)",
-      "---> 58 model_output = self.unet(image, t)[\"sample\"]",
-      "File ~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)",
-      "-> 1130 return forward_call(*input, **kwargs)",
-      "File ~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/diffusers/models/unet_2d.py:147, in UNet2DModel.forward(self, sample, timestep)",
-      "--> 147 for downsample_block in self.down_blocks:",
-      "File ~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/container.py:219, in ModuleList.__iter__(self)",
-      "--> 219 return iter(self._modules.values())",
-      "KeyboardInterrupt: "
-     ]
-    }
-   ],
-   "source": [
-    "while True:\n",
-    "    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()\n",
-    "    display(image)\n",
-    "    display(Audio(audio, rate=sample_rate))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ef54cef3",
-   "metadata": {},
-   "source": [
-    "### Compare results with a random sample from the training set"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "f028a3c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mel = Mel(x_res=256, y_res=256)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "269ee816",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using custom data configuration teticio--audio-diffusion-256-90642b08dc2c6e33\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset None/None (download: 910.09 MiB, generated: 911.42 MiB, post-processed: Unknown size, total: 1.78 GiB) to /home/teticio/.cache/huggingface/datasets/teticio___parquet/teticio--audio-diffusion-256-90642b08dc2c6e33/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c91ed8a7a45f45ffbc1452a7219311fc",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "ds = load_dataset(\"teticio/audio-diffusion-256\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       ""
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "image = random.choice(ds['train'])['image']\n",
-    "image"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "492e2334",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "        \n",
-       "      "
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
"source": [ - "audio = mel.image_to_audio(image)\n", - "Audio(data=audio, rate=mel.get_sample_rate())" - ] - }, - { - "cell_type": "markdown", - "id": "946fdb4d", - "metadata": {}, - "source": [ - "### Push model to hub" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37c0564e", - "metadata": {}, - "outputs": [], - "source": [ - "from diffusers.hub_utils import init_git_repo, push_to_hub\n", - "\n", - "\n", - "class AttributeDict(dict):\n", - "\n", - " def __getattr__(self, attr):\n", - " return self[attr]\n", - "\n", - " def __setattr__(self, attr, value):\n", - " self[attr] = value\n", - "\n", - "\n", - "args = AttributeDict({\n", - " \"hub_model_id\":\n", - " \"teticio/audio-diffusion-256\",\n", - " \"output_dir\":\n", - " \"../ddpm-ema-audio-256-repo\",\n", - " \"local_rank\":\n", - " -1,\n", - " \"hub_token\":\n", - " open(os.path.join(os.environ['HOME'], '.huggingface/token'), 'rt').read(),\n", - " \"hub_private_repo\":\n", - " False,\n", - " \"overwrite_output_dir\":\n", - " False\n", - "})\n", - "\n", - "repo = init_git_repo(args, at_init=True)\n", - "ddpm = DDPMPipeline.from_pretrained('../ddpm-ema-audio-256')\n", - "push_to_hub(args, ddpm, repo)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c8261a0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "huggingface", - "language": "python", - "name": "huggingface" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "colab": { - "provenance": [] - }, - "accelerator": "GPU", - "gpuClass": "standard" - }, - "nbformat": 4, - "nbformat_minor": 5 -}