BeveledCube committed on
Commit
3b3a783
1 Parent(s): 953f815

Trying sum

Dockerfile.fastapi ADDED
@@ -0,0 +1,14 @@
+ FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y git
6
+
7
+ COPY . /app
8
+
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+ RUN pip install --no-cache-dir uvicorn gunicorn fastapi pytest ruff pytest-asyncio httpx
11
+
12
+ EXPOSE 80
13
+
14
+ CMD ["uvicorn", "tld.app:app", "--host", "0.0.0.0", "--port", "80"]
Dockerfile.gradio ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y git
6
+
7
+ COPY . /app
8
+
9
+ RUN pip install --no-cache-dir gradio Pillow
10
+
11
+ EXPOSE 80
12
+
13
+ CMD ["python", "tld/gradio_app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
2
+
3
+ Copyright (c) 2023 Alexandru Papiu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
docker-compose.yml ADDED
@@ -0,0 +1,17 @@
+ version: '3.8'
2
+ services:
3
+ fastapi:
4
+ image: apapiu89/tld-app:latest
5
+ ports:
6
+ - "80:80"
7
+ environment:
8
+ - API_TOKEN=${API_TOKEN}
9
+
10
+ gradio:
11
+ image: apapiu89/gradio-app:latest
12
+ ports:
13
+ - "7860:7860"
14
+ environment:
15
+ - API_URL=http://fastapi:80
16
+ depends_on:
17
+ - fastapi
main.py DELETED
@@ -1,38 +0,0 @@
1
- import os
2
- from transformers import CLIPProcessor, CLIPModel
3
- import torch
4
- from PIL import Image
5
-
6
- # Get the directory of the script
7
- script_directory = os.path.dirname(os.path.realpath(__file__))
8
- # Specify the directory where the cache will be stored (same folder as the script)
9
- cache_directory = os.path.join(script_directory, "cache")
10
- # Create the cache directory if it doesn't exist
11
- os.makedirs(cache_directory, exist_ok=True)
12
-
13
- # Load the CLIP processor and model
14
- clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=cache_directory)
15
- clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", cache_dir=cache_directory)
16
-
17
- # Text description to generate image
18
- text = "a cat sitting on a table"
19
-
20
- # Tokenize text and get features
21
- inputs = clip_processor(text, return_tensors="pt", padding=True)
22
-
23
- # Generate image from text
24
- generated_image = clip_model.generate_images(
25
- input_ids=inputs.input_ids,
26
- attention_mask=inputs.attention_mask,
27
- visual_input=None, # We don't provide image input
28
- return_tensors="pt" # Return PyTorch tensor
29
- )
30
-
31
- # Convert the generated image tensor to a NumPy array
32
- generated_image_np = generated_image[0].cpu().numpy()
33
-
34
- # Save the generated image
35
- output_image_path = "generated_image.png"
36
- Image.fromarray(generated_image_np).save(output_image_path)
37
-
38
- print("Image generated and saved as:", output_image_path)
mainHistory.py DELETED
@@ -1,46 +0,0 @@
1
- from fastapi.staticfiles import StaticFiles
2
- from fastapi.responses import FileResponse
3
- from pydantic import BaseModel
4
- from fastapi import FastAPI
5
-
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
-
8
- model_name = "facebook/blenderbot-1B-distill"
9
-
10
- # https://huggingface.co/models?sort=trending&search=facebook%2Fblenderbot
11
- # facebook/blenderbot-3B
12
- # facebook/blenderbot-1B-distill
13
- # facebook/blenderbot-400M-distill
14
- # facebook/blenderbot-90M
15
- # facebook/blenderbot_small-90M
16
-
17
- # https://www.youtube.com/watch?v=irjYqV6EebU
18
-
19
- app = FastAPI()
20
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
21
- tokenizer = AutoTokenizer.from_pretrained(model_name)
22
-
23
- class req(BaseModel):
24
- prompt: str
25
-
26
- @app.get("/")
27
- def read_root():
28
- return FileResponse(path="templates/index.html", media_type="text/html")
29
-
30
- @app.post("/api")
31
- def read_root(data: req):
32
- print("Prompt:", data.prompt)
33
-
34
- input_text = data.prompt
35
-
36
- # Tokenize the input text
37
- input_ids = tokenizer.encode(input_text, return_tensors="pt")
38
-
39
- # Generate output using the model
40
- output_ids = model.generate(input_ids, num_beams=5, no_repeat_ngram_size=2)
41
- generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
42
-
43
- answer_data = { "answer": generated_text }
44
- print("Answer:", generated_text)
45
-
46
- return answer_data
models.md DELETED
@@ -1,7 +0,0 @@
1
- # Model list
2
- * [microsoft/DialoGPT-small](https://huggingface.co/microsoft/DialoGPT-small)
3
- * [microsoft/DialoGPT-medium](https://huggingface.co/microsoft/DialoGPT-medium)
4
- * [microsoft/DialoGPT-large](https://huggingface.co/microsoft/DialoGPT-large)
5
-
6
- # Download locations
7
- * Github Codespaces: /home/codespace/.local/lib/python3.10/site-packages/transformers/models/
og readme.md ADDED
@@ -0,0 +1,211 @@
+ # Transformer Latent Diffusion
2
+ Text to Image Latent Diffusion using a Transformer core in PyTorch.
3
+
4
+ [Original Github](https://github.com/apapiu/transformer_latent_diffusion)
5
+
6
+ **Try with own inputs**: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1VaCe01YG9rnPwAfwVLBKdXEX7D_tk1U5?usp=sharing)
7
+
8
+ Below are some random examples (at 256 resolution) from a 100MM model trained from scratch for 260k iterations (about 32 hours on 1 A100):
9
+
10
+ <img width="760" alt="image" src="https://github.com/apapiu/transformer_latent_diffusion/assets/13619417/e01e3094-2487-4c04-bc0f-d9b03eeaed00">
11
+
12
+ #### Clip interpolation Examples:
13
+
14
+ a photo of a cat → an anime drawing of a super saiyan cat, artstation:
15
+
16
+ <img width="1361" alt="image" src="https://github.com/apapiu/transformer_latent_diffusion/assets/13619417/a079458b-9bd5-4557-aa7a-5a3e78f31b53">
17
+
18
+ a cute great gray owl → starry night by van gogh:
19
+
20
+ <img width="1399" alt="image" src="https://github.com/apapiu/transformer_latent_diffusion/assets/13619417/8731d87a-89fa-43a2-847d-c7ff772de286">
21
+
22
+ Note that the model has not converged yet and could use more training.
23
+
24
+ #### High(er) Resolution:
25
+ By upsampling the positional encoding, the model can also generate 512 or 1024 px images with minimal fine-tuning. See below for some examples from a model fine-tuned on 100k extra 512 px images and 30k 1024 px images for about 2 hours on an A100. The images do sometimes lack global coherence at 1024 px - more to come here:
26
+
27
+ <img width="600" alt="image" src="https://github.com/apapiu/transformer_latent_diffusion/assets/13619417/adba64f0-b43c-423e-9a7d-033a4afea207">
28
+ <img width="600" alt="image" src="https://github.com/apapiu/transformer_latent_diffusion/assets/13619417/5a94515b-313e-420d-89d4-6bdc376d9a00">
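+
+ The exact upsampling procedure isn't shown in this snapshot, but one common way to do it (a sketch, assuming a learned 16x16 grid of positional embeddings as in `tld/denoiser.py`) is to bilinearly interpolate the embedding table to the larger token grid before fine-tuning:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ pos_embed = torch.randn(256, 768)  # learned table: (16*16 tokens) x embed_dim
+ grid = pos_embed.reshape(16, 16, -1).permute(2, 0, 1).unsqueeze(0)  # 1 x embed_dim x 16 x 16
+ grid = F.interpolate(grid, size=(32, 32), mode="bilinear", align_corners=False)
+ new_pos_embed = grid.squeeze(0).permute(1, 2, 0).reshape(32 * 32, -1)  # (32*32 tokens) x embed_dim
+ ```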
29
+
30
+
31
+
32
+ ### Intro:
33
+
34
+ The main goal of this project is to build an accessible diffusion model in PyTorch that is:
35
+ - fast (close to real time generation)
36
+ - small (~100MM params)
37
+ - reasonably good (of course not SOTA)
38
+ - can be trained in a reasonable amount of time on a single GPU (under 50 hours on an A100 or equivalent).
39
+ - simple self-contained codebase (model + train loop is ~400 lines of PyTorch with few dependencies)
40
+ - uses ~ 1 million images with a focus on data quality over quantity
41
+
42
+ This is part II of a previous [project](https://github.com/apapiu/guided-diffusion-keras) I did where I trained a pixel level diffusion model in Keras. Even though this model outputs 4x higher resolution images (256px vs 64px), it's actually faster to both train and sample from, which shows the power of training in the latent space and speed of transformer architectures.
43
+
44
+ ## Table of Contents:
45
+ - [Codebase](#codebase)
46
+ - [Usage](#usage)
47
+ - [Examples](#examples)
48
+ - [Data Processing](#data-processing)
49
+ - [Architecture](#architecture)
50
+ - [TO-DOs](#todos)
51
+
52
+
53
+ ## Codebase:
54
+ The code is written in pure PyTorch with as few dependencies as possible.
55
+
56
+ - [transformer_blocks.py](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/transformer_blocks.py) - basic transformer building blocks relevant to the transformer denoiser
57
+ - [denoiser.py](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/denoiser.py) - the architecture of the denoiser transformer
58
+ - [train.py](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/train.py). The train loop uses `accelerate` so training can scale to multiple GPUs if needed.
59
+ - [diffusion.py](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/diffusion.py). Class to generate image from noise using reverse diffusion. Short (~60 lines) and self-contained.
60
+ - [data.py](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/data.py). Data utils to download images/text and process necessary features for the diffusion model.
61
+
62
+ ### Usage:
63
+ If you have your own dataset of URLs + captions, the process to train a model on the data consists of two steps:
64
+
65
+ 1. Use `train.download_and_process_data` to obtain the latent and text encodings as numpy files. See [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1BPDFDBdsP9SSKBNEFJysmlBjfoxKK13r?usp=sharing) for a notebook example downloading and processing 2000 images from this HuggingFace [dataset](https://huggingface.co/datasets/zzliang/GRIT).
66
+
67
+ 2. Use the `train.main` function in an accelerate `notebook_launcher` - see [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sKk0usxEF4bmdCDcNQJQNMt4l9qBOeAM?usp=sharing) for a Colab notebook that trains a model on 100k images from scratch. Note that this downloads already preprocessed latents and embeddings from [here](https://huggingface.co/apapiu/small_ldt/tree/main), but you could just use whatever `.npy` files you saved from step 1.
68
+
69
+ #### Fine-Tuning - TODO, but it is the same as step 2 above, except you start from a pre-trained model (a sketch follows the training example below).
70
+
71
+ ```python
72
+ !wandb login
73
+ import os
74
+ from tld.train import main, DataConfig, ModelConfig
75
+ from accelerate import notebook_launcher
76
+
77
+ data_config = DataConfig(latent_path='path/to/image_latents.npy',
78
+ text_emb_path='path/to/text_encodings.npy',
79
+ val_path='path/to/val_encodings.npy')
80
+
81
+ model_config = ModelConfig(embed_dim=512, n_layers=6) #see ModelConfig for more params
82
+
83
+ #run the training process on 2 GPUs:
84
+ notebook_launcher(main, (model_config, data_config), num_processes=2)
85
+ ```
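+
+ A hedged sketch of the fine-tuning path (same launcher as step 2, but with `from_scratch=False` so `train.main` restores a checkpoint via `wandb.restore`; `run_id` and `model_name` below are placeholders):
+
+ ```python
+ from tld.train import main, DataConfig, ModelConfig
+ from accelerate import notebook_launcher
+
+ data_config = DataConfig(latent_path='path/to/image_latents.npy',
+                          text_emb_path='path/to/text_encodings.npy',
+                          val_path='path/to/val_encodings.npy')
+
+ model_config = ModelConfig(embed_dim=768, n_layers=12,  # must match the pre-trained architecture
+                            from_scratch=False,
+                            run_id='your_run_id',        # placeholder: wandb run holding the checkpoint
+                            model_name='state_dict.pth') # placeholder: checkpoint file name
+
+ notebook_launcher(main, (model_config, data_config), num_processes=1)
+ ```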
86
+
87
+ ### Dependencies:
88
+ - `PyTorch` `numpy` `einops` for model building
89
+ - `wandb` `tqdm` for logging + progress bars
90
+ - `accelerate` for train loop and multi-GPU support
91
+ - `img2dataset` `webdataset` `torchvision` for data downloading and image processing
92
+ - `diffusers` `clip` for pretrained VAE and CLIP text model
93
+
94
+ ### Codebases used for inspiration:
95
+ - [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)
96
+ - [k-diffusion](https://github.com/crowsonkb/k-diffusion)
97
+ - [nanoGPT](https://github.com/karpathy/nanoGPT/tree/master)
98
+ - [LocalViT](https://github.com/ofsoundof/LocalViT)
99
+
100
+ #### Speed:
101
+
102
+ I try to speed up training and inference as much as possible by:
103
+ - using mixed precision for training + scaled dot-product attention (SDPA)
104
+ - precomputing all latent and text embeddings
105
+ - using float16 precision for inference
106
+ - using native SDPA for the attention + torch.compile() (compile doesn't always work).
107
+ - using a highly performant sampler (DPM-Solver++(2M)) that gets good results in ~15 steps.
108
+ - TODO: would distillation or something like LCM work here?
109
+ - TODO: use flash-attention2?
110
+ - TODO: use smaller vae?
111
+
112
+ The time to generate a batch of 36 images (15 iterations) on a:
113
+ - T4: ~ 3.5 seconds
114
+ - A100: ~ 0.6 seconds
115
+ In fact, on an A100 the VAE becomes the bottleneck even though it is only used once.
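+
+ A rough way to reproduce these timings (a sketch; the default `LTDConfig` builds a small randomly initialized model, so point `file_url` at a real checkpoint for real outputs, and expect numbers to vary by GPU):
+
+ ```python
+ import time
+ from tld.diffusion import DiffusionTransformer, LTDConfig
+
+ dt = DiffusionTransformer(LTDConfig())  # pass LTDConfig(file_url=...) to load trained weights
+ start = time.perf_counter()
+ dt.generate_image_from_text(prompt="a cute cat", num_imgs=36, n_iter=15)
+ print(f"36 images in {time.perf_counter() - start:.2f}s")
+ ```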
116
+
117
+
118
+ ## Examples:
119
+
120
+ More examples generated with the 100MM model - click the photo to see the prompt and other params like cfg and seed:
121
+ ![image](tld/img_examples/a%20cute%20grey%20great%20owl_cfg_8_seed_11.png)
122
+ ![image](tld/img_examples/watercolor%20of%20a%20cute%20cat%20riding%20a%20motorcycle_cfg_7_seed_11.png)
123
+ ![image](tld/img_examples/painting%20of%20a%20cyberpunk%20market_cfg_7_seed_11.png)
124
+ ![image](tld/img_examples/isometric%20view%20of%20small%20japanese%20village%20with%20blooming%20trees_cfg_7_seed_11.png)
125
+ ![image](tld/img_examples/a%20beautiful%20woman%20with%20blonde%20hair%20in%20her%2050s_cfg_7_seed_11.png)
126
+ ![image](tld/img_examples/painting%20of%20a%20cute%20fox%20in%20a%20suit%20in%20a%20field%20of%20poppies_cfg_8_seed_11.png)
127
+ ![image](tld/img_examples/an%20aerial%20view%20of%20manhattan%2C%20isometric%20view%2C%20as%20pantinted%20by%20mondrian_cfg_7_seed_11.png)
128
+
129
+ ## Outpainting model:
130
+
131
+ I also fine-tuned an outpainting model on top of the original 101MM model. I had to modify the input conv2d patch layer to 8 channels and initialize the mask channels' parameters to zero. The rest of the architecture remained the same.
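+
+ A hedged sketch of that 4 → 8 channel surgery (it assumes the patchify `nn.Conv2d` is the first layer of `patchify_and_embed`, as in `tld/denoiser.py`; the four new mask-channel filters are zero-initialized so fine-tuning starts from the original model's behavior):
+
+ ```python
+ import torch
+ from torch import nn
+ from tld.denoiser import Denoiser
+
+ denoiser = Denoiser(image_size=32, noise_embed_dims=128, patch_size=2,
+                     embed_dim=768, dropout=0.1, n_layers=12)
+ old_conv = denoiser.denoiser_trans_block.patchify_and_embed[0]
+ new_conv = nn.Conv2d(8, old_conv.out_channels,
+                      kernel_size=old_conv.kernel_size, stride=old_conv.stride)
+ with torch.no_grad():
+     new_conv.weight.zero_()                    # new mask channels start at zero
+     new_conv.weight[:, :4] = old_conv.weight   # copy the original latent-channel filters
+     new_conv.bias.copy_(old_conv.bias)
+ denoiser.denoiser_trans_block.patchify_and_embed[0] = new_conv
+ ```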
132
+
133
+ Below I apply the outpainting model repeatedly to generate a somewhat consistent scene from the prompt "a cyberpunk marketplace":
134
+
135
+ <img width="1440" alt="image" src="https://github.com/apapiu/transformer_latent_diffusion/assets/13619417/4451719f-d45a-4a86-a7bb-06c021b34996">
136
+
137
+ ## Data Processing:
138
+
139
+ In [data.py](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/data.py), I have some helper functions to process images and captions. The flow is as follows:
140
+ - Use `img2dataset` to download images from a dataframe containing URLs and captions.
141
+ - Use `CLIP` to encode the prompts and the `VAE` to encode images to latents on a web2dataset data generator.
142
+ - Save the latents and text embedding for future training.
143
+
144
+ There are two advantages to this approach. One is that the VAE encoding is somewhat expensive, so doing it every epoch would affect training times. The other is that we can discard the images after processing. For `3*256*256` images, the latent dimension is `4*32*32`, so every latent is around 4KB (when quantized in uint8; see [here](https://pub.towardsai.net/stable-diffusion-based-image-compresssion-6f1f0a399202?gi=1f45c6522d3b)). This means that 1 million latents will be "only" 4GB in size, which is easy to handle even in RAM. Storing the raw images would have been 48x larger in size.
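+
+ A quick sanity check of the numbers above:
+
+ ```python
+ latent_bytes = 4 * 32 * 32        # uint8 latent -> 4096 bytes ~ 4 KB
+ image_bytes = 3 * 256 * 256       # uint8 RGB image -> 196608 bytes ~ 192 KB
+ print(image_bytes / latent_bytes)        # 48.0 -> the "48x larger" figure
+ print(1_000_000 * latent_bytes / 1e9)    # ~4.1 GB for 1 million latents
+ ```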
145
+
146
+ ## Architecture:
147
+
148
+ See [here](https://github.com/apapiu/transformer_latent_diffusion/blob/main/tld/denoiser.py) for the denoiser class.
149
+
150
+ The denoiser is a Transformer-based model built on the architecture in [DiT](https://arxiv.org/abs/2203.02378) and [Pixart-Alpha](https://pixart-alpha.github.io/), albeit with quite a few modifications and simplifications. Using a Transformer as the denoiser differs from most diffusion models, which typically use a CNN-based U-Net as the denoising backbone. I decided to use a Transformer for a few reasons. One was that I just wanted to experiment and learn how to build and train Transformers from the ground up. Secondly, Transformers are fast both to train and to run inference on, and they will benefit most from future advances (in both hardware and software) in performance.
151
+
152
+ Transformers are not natively built for spatial data, and at first I found a lot of the outputs to be very "patchy". To remedy that, I added a depth-wise convolution in the FFN layer of the transformer (this was introduced in the [Local ViT](https://arxiv.org/abs/2104.05707) paper). This allows the model to mix pixels that are close to each other with very little added compute cost.
153
+
154
+
155
+ ### Img+Text+Noise Encoding:
156
+
157
+ The image latent inputs are `4*32*32` and we use a patch size of 2 to build 256 flattened `4*2*2=16` dimensional input "pixels". These are then projected into the embedding dimension and fed through the transformer blocks.
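+
+ A shape check for these numbers (the actual model does the patchify with a strided `nn.Conv2d`, but the resulting token layout is the same):
+
+ ```python
+ import torch
+ from einops import rearrange
+
+ latent = torch.randn(1, 4, 32, 32)  # bs, channels, h, w
+ tokens = rearrange(latent, "b c (h p1) (w p2) -> b (h w) (c p1 p2)", p1=2, p2=2)
+ print(tokens.shape)  # torch.Size([1, 256, 16]) -> 256 tokens of dimension 16
+ ```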
158
+
159
+ The text and noise conditioning is very simple - we concatenate a pooled CLIP text embedding (`ViT/L14` - 768-dimensional) and the sinusoidal noise embedding and feed it as input in the cross-attention layer in each transformer block. No unpooled CLIP embeddings are used.
160
+
161
+ ### Training:
162
+ The base model has 101MM parameters, 12 layers, and an embedding dimension of 768. I train it with a batch size of 256 on an A100 with a learning rate of `3e-4` and 1000 warmup steps. Due to computational constraints I did not do any ablations for this configuration.
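+
+ Expressed with the `ModelConfig` from `tld/train.py`, that base configuration is roughly the following (warmup is handled outside the config, so it is not shown):
+
+ ```python
+ from tld.train import ModelConfig
+
+ base_config = ModelConfig(embed_dim=768, n_layers=12, batch_size=256, lr=3e-4)
+ ```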
163
+
164
+
165
+ ## Train and Diffusion Setup:
166
+
167
+ We train a denoising transformer that takes the following three inputs:
168
+ - `noise_level` (sampled from 0 to 1 with more values concentrated close to 0 - I use a beta distribution)
169
+ - Image latent (x) corrupted with a level of random noise
170
+ - For a given `noise_level` between 0 and 1, the corruption is as follows:
171
+ - `x_noisy = x*(1-noise_level) + eps*noise_level where eps ~ np.random.normal(0, 1)`
172
+ - CLIP embeddings of a text prompt
173
+ - You can think of this as a numerical representation of a text prompt.
174
+ - We use the pooled text embedding here (768 dimensional for `ViT/L14`)
175
+
176
+ The output is a prediction of the denoised image latent - call it `f(x_noisy)`.
177
+
178
+ The model is trained to minimize the mean squared error `|f(x_noisy) - x|` between the prediction and the actual image latent
179
+ (you can also use absolute error here). Note that I don't reparameterize the loss in terms of the noise here to keep things simple.
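+
+ A condensed sketch of this objective (it mirrors the loop in `tld/train.py`; `x` is a batch of latents already divided by the VAE scale factor, and `text_emb` the pooled CLIP embeddings):
+
+ ```python
+ import torch
+ from torch import nn
+
+ def train_step(model, x, text_emb, beta_a=0.75, beta_b=0.75):
+     noise_level = torch.distributions.Beta(beta_a, beta_b).sample((x.size(0),)).to(x)
+     noise = torch.randn_like(x)
+     x_noisy = noise_level.view(-1, 1, 1, 1) * noise + (1 - noise_level).view(-1, 1, 1, 1) * x
+     pred = model(x_noisy, noise_level.view(-1, 1), text_emb)
+     return nn.functional.mse_loss(pred, x)  # predict the clean latent, not the noise
+ ```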
180
+
181
+ Using this model, we then iteratively generate an image from random noise as follows:
182
+
183
+ for i in range(len(self.noise_levels) - 1):
184
+
185
+ curr_noise, next_noise = self.noise_levels[i], self.noise_levels[i + 1]
186
+
187
+ # Predict original denoised image:
188
+ x0_pred = predict_x_zero(new_img, label, curr_noise)
189
+
190
+ # New image at next_noise level is a weighted average of old image and predicted x0:
191
+ new_img = ((curr_noise - next_noise) * x0_pred + next_noise * new_img) / curr_noise
192
+
193
+ The `predict_x_zero` method uses classifier-free guidance by combining the conditional and unconditional
194
+ prediction: `x0_pred = class_guidance * x0_pred_conditional + (1 - class_guidance) * x0_pred_unconditional`
195
+
196
+ A bit of math: the approach above falls within the VDM parametrization; see Section 3.1 in [Kingma et al.](https://arxiv.org/pdf/2107.00630.pdf):
197
+
198
+ $$z_t = \alpha_t x + \sigma_t \epsilon, \epsilon \sim \mathcal{N}(0,1)$$
199
+
200
+ Where $z_t$ is the noisy version of $x$ at time $t$.
201
+
202
+ Generally, $\alpha_t$ is chosen to be $\sqrt{1-\sigma_t^2}$ so that the process is variance preserving. Here, I chose $\alpha_t=1-\sigma_t$ so that we linearly interpolate between the image and random noise. Why? For one, it simplifies the updating equation quite a bit, and it's easier to understand what the noise to signal ratio will look like. I also found that the model produces sharper images faster - more validation here is needed. The updating equation above is the DDIM model for this parametrization, which simplifies to a simple weighted average. Note that the DDIM model deterministically maps random normal noise to images - this has two benefits: we can interpolate in the random normal latent space, and it generally takes fewer steps to achieve decent image quality.
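+
+ For completeness, the one-line derivation of that weighted-average update: with $\hat{x}$ the model's estimate of $x$ and $\hat{\epsilon} = (z_t - (1-\sigma_t)\hat{x})/\sigma_t$ the implied noise, the DDIM step to a lower noise level $\sigma_s$ is
+
+ $$z_s = (1-\sigma_s)\hat{x} + \sigma_s\hat{\epsilon} = \left(1 - \frac{\sigma_s}{\sigma_t}\right)\hat{x} + \frac{\sigma_s}{\sigma_t} z_t = \frac{(\sigma_t-\sigma_s)\,\hat{x} + \sigma_s\, z_t}{\sigma_t},$$
+
+ which is exactly the `new_img = ((curr_noise - next_noise) * x0_pred + next_noise * new_img) / curr_noise` update in the sampling loop above.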
203
+
204
+ ## TODOS:
205
+ - better config in the train file
206
+ - how to speed up generation even more - LCMs or other sampling strategies?
207
+ - add script to compute FID
208
+
209
+
210
+
211
+
pyproject.toml DELETED
@@ -1,17 +0,0 @@
1
- [tool.poetry]
2
- name = "img gen"
3
- version = "0.0.1"
4
- description = "A project to test image generation with AI models"
5
- authors = ["CubeBeveled <[email protected]>"]
6
- readme = "README.md"
7
-
8
- [tool.poetry.dependencies]
9
- python = "^3.11"
10
- transformers = "^4.39.1"
11
- torch = "^2.2.1"
12
- pillow = "^10.2.0"
13
-
14
-
15
- [build-system]
16
- requires = ["poetry-core"]
17
- build-backend = "poetry.core.masonry.api"
requirements.txt CHANGED
@@ -1,4 +1,10 @@
- transformers
  torch
- poetry
- pillow
+ numpy
+ einops
+ torchvision
+ tqdm
+ diffusers
+ accelerate
+ transformers
+ Pillow
+ git+https://github.com/openai/CLIP.git
setup.py ADDED
@@ -0,0 +1,36 @@
+ from setuptools import setup, find_packages
2
+
3
+
4
+ def load_requirements(filename="requirements.txt"):
5
+ with open(filename, "r") as file:
6
+ lines = [line.strip() for line in file.readlines() if line.strip() and not line.startswith("#")]
7
+ return lines
8
+
9
+
10
+ setup(
11
+ name="tld",
12
+ version="0.1.0",
13
+ author="Alexandru Papiu",
14
+ author_email="[email protected]",
15
+ description="Transformer Latent Diffusion",
16
+ url="https://github.com/apapiu/transformer_latent_diffusion",
17
+ packages=find_packages(exclude=["tests*"]),
18
+ classifiers=[
19
+ "Programming Language :: Python :: 3",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Operating System :: OS Independent",
22
+ ],
23
+ python_requires=">=3.6",
24
+ install_requires=[
25
+ "torch",
26
+ "numpy",
27
+ "einops",
28
+ "torchvision",
29
+ "tqdm",
30
+ "diffusers",
31
+ "accelerate",
32
+ "transformers",
33
+ "Pillow",
34
+ "clip @ git+https://github.com/openai/CLIP.git",
35
+ ],
36
+ )
start.sh DELETED
@@ -1,5 +0,0 @@
1
- pip install --upgrade pip
2
- pip install -r requirements.txt
3
- poetry install --no-root
4
-
5
- python main.py
tests/__init__.py ADDED
File without changes
tests/test_api.py ADDED
@@ -0,0 +1,31 @@
+ import os
2
+
3
+ from fastapi.testclient import TestClient
4
+ from tld.app import app
5
+ import PIL
6
+ from PIL import Image
7
+ from io import BytesIO
8
+
9
+ client = TestClient(app)
10
+
11
+ def test_read_main():
12
+ response = client.get("/")
13
+ assert response.status_code == 200
14
+ assert response.json() == {"message": "Welcome to Image Generator"}
15
+
16
+
17
+ def test_generate_image_unauthorized():
18
+ response = client.post("/generate-image/", json={})
19
+ assert response.status_code == 401
20
+ assert response.json() == {"detail": "Not authenticated"}
21
+
22
+
23
+ def test_generate_image_authorized():
24
+ api_token = os.getenv("API_TOKEN")
25
+ response = client.post(
26
+ "/generate-image/", json={"prompt": "a cute cat"}, headers={"Authorization": f"Bearer {api_token}"}
27
+ )
28
+ assert response.status_code == 200
29
+
30
+ image = Image.open(BytesIO(response.content))
31
+ assert type(image) == PIL.JpegImagePlugin.JpegImageFile
tests/test_diffuser.py ADDED
@@ -0,0 +1,99 @@
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
5
+ import time
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torchvision.transforms as transforms
10
+ import torchvision.utils as vutils
11
+ from diffusers import AutoencoderKL
12
+
13
+ from tld.denoiser import Denoiser
14
+ from tld.diffusion import DiffusionGenerator, DiffusionTransformer, LTDConfig
15
+ from PIL.Image import Image
16
+
17
+ to_pil = transforms.ToPILImage()
18
+
19
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
20
+
21
+
22
+ def test_outputs(num_imgs=4):
23
+ model = Denoiser(
24
+ image_size=32, noise_embed_dims=128, patch_size=2, embed_dim=768, dropout=0.1, n_layers=12
25
+ )
26
+ x = torch.rand(num_imgs, 4, 32, 32)
27
+ noise_level = torch.rand(num_imgs, 1)
28
+ label = torch.rand(num_imgs, 768)
29
+
30
+ print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")
31
+
32
+ with torch.no_grad():
33
+ start_time = time.time()
34
+ output = model(x, noise_level, label)
35
+ end_time = time.time()
36
+
37
+ execution_time = end_time - start_time
38
+ print(f"Model execution took {execution_time:.4f} seconds.")
39
+
40
+ assert output.shape == torch.Size([num_imgs, 4, 32, 32])
41
+ print("Basic tests passed.")
42
+
43
+ # model = Denoiser(image_size=16, noise_embed_dims=128, patch_size=2, embed_dim=256, dropout=0.1, n_layers=6)
44
+ # x = torch.rand(8, 4, 32, 32)
45
+ # noise_level = torch.rand(8, 1)
46
+ # label = torch.rand(8, 768)
47
+
48
+ # with torch.no_grad():
49
+ # output = model(x, noise_level, label)
50
+
51
+ # assert output.shape == torch.Size([8, 4, 32, 32])
52
+ # print("Upscale tests passed.")
53
+
54
+
55
+ def test_diffusion_generator():
56
+ model_dtype = torch.float32 ##float 16 will not work on cpu
57
+ num_imgs = 1
58
+ nrow = int(np.sqrt(num_imgs))
59
+
60
+ denoiser = Denoiser(
61
+ image_size=32, noise_embed_dims=128, patch_size=2, embed_dim=256, dropout=0.1, n_layers=3
62
+ )
63
+ print(f"Model has {sum(p.numel() for p in denoiser.parameters())} parameters")
64
+
65
+ denoiser.to(model_dtype)
66
+
67
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=model_dtype).to(device)
68
+
69
+ labels = torch.rand(num_imgs, 768)
70
+
71
+ diffuser = DiffusionGenerator(denoiser, vae, device, model_dtype)
72
+
73
+ out, _ = diffuser.generate(
74
+ labels=labels,
75
+ num_imgs=num_imgs,
76
+ class_guidance=3,
77
+ seed=1,
78
+ n_iter=5,
79
+ exponent=1,
80
+ scale_factor=8,
81
+ sharp_f=0,
82
+ bright_f=0,
83
+ )
84
+
85
+ out = to_pil((vutils.make_grid((out + 1) / 2, nrow=nrow, padding=4)).float().clip(0, 1))
86
+ out.save("test.png")
87
+ print("Images generated at test.png")
88
+
89
+
90
+ def test_full_generation_pipeline():
91
+ ltdconfig = LTDConfig()
92
+ diffusion_transformer = DiffusionTransformer(ltdconfig)
93
+
94
+ out = diffusion_transformer.generate_image_from_text(prompt="a cute cat")
95
+ print(out)
96
+ assert type(out) == Image
97
+
98
+
99
+ # TODO: should add tests for train loop and data processing
tld/__init__.py ADDED
File without changes
tld/app.py ADDED
@@ -0,0 +1,67 @@
+ import io
2
+ import os
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torchvision.transforms as transforms
7
+ from fastapi import Depends, FastAPI, HTTPException, status
8
+ from fastapi.responses import StreamingResponse
9
+ from fastapi.security import OAuth2PasswordBearer
10
+ from pydantic import BaseModel
11
+
12
+ from tld.diffusion import DiffusionTransformer, LTDConfig
13
+
14
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
15
+ to_pil = transforms.ToPILImage()
16
+
17
+ ltdconfig = LTDConfig()
18
+ diffusion_transformer = DiffusionTransformer(ltdconfig)
19
+
20
+ app = FastAPI()
21
+
22
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
23
+
24
+
25
+ def validate_token(token: str = Depends(oauth2_scheme)):
26
+ if token != os.getenv("API_TOKEN"):
27
+ raise HTTPException(
28
+ status_code=status.HTTP_401_UNAUTHORIZED,
29
+ detail="Invalid authentication credentials",
30
+ headers={"WWW-Authenticate": "Bearer"},
31
+ )
32
+
33
+
34
+ class ImageRequest(BaseModel):
35
+ prompt: str
36
+ class_guidance: Optional[int] = 6
37
+ seed: Optional[int] = 11
38
+ num_imgs: Optional[int] = 1
39
+ img_size: Optional[int] = 32
40
+
41
+
42
+ @app.get("/")
43
+ def read_root():
44
+ return {"message": "Welcome to Image Generator"}
45
+
46
+
47
+ @app.post("/generate-image/")
48
+ async def generate_image(request: ImageRequest, token: str = Depends(validate_token)):
49
+ try:
50
+ img = diffusion_transformer.generate_image_from_text(
51
+ prompt=request.prompt,
52
+ class_guidance=request.class_guidance,
53
+ seed=request.seed,
54
+ num_imgs=request.num_imgs,
55
+ img_size=request.img_size,
56
+ )
57
+ # Convert PIL image to byte stream suitable for HTTP response
58
+ img_byte_arr = io.BytesIO()
59
+ img.save(img_byte_arr, format="JPEG")
60
+ img_byte_arr.seek(0)
61
+
62
+ return StreamingResponse(img_byte_arr, media_type="image/jpeg")
63
+ except Exception as e:
64
+ raise HTTPException(status_code=500, detail=str(e))
65
+
66
+
67
+ # build job to test and deploy the API on a docker image (maybe in Azure?)
tld/data.py ADDED
@@ -0,0 +1,243 @@
+ ####data util to get and preprocess data from a text and image pair to latents and text embeddings.
2
+ ### all that is required is a csv file with an image url and text caption:
3
+ #!pip install datasets img2dataset accelerate diffusers
4
+ #!pip install git+https://github.com/openai/CLIP.git
5
+
6
+ import json
7
+ import os
8
+ from dataclasses import dataclass
9
+ from typing import List, Union
10
+
11
+ import clip
12
+ import h5py
13
+ import numpy as np
14
+ import pandas as pd
15
+ import torch
16
+ import torchvision.transforms as transforms
17
+ import webdataset as wds
18
+ from diffusers import AutoencoderKL
19
+ from img2dataset import download
20
+ from torch import Tensor, nn
21
+ from torch.utils.data import DataLoader
22
+ from tqdm import tqdm
23
+
24
+
25
+ @torch.no_grad()
26
+ def encode_text(label: Union[str, List[str]], model: nn.Module, device: str) -> Tensor:
27
+ text_tokens = clip.tokenize(label, truncate=True).to(device)
28
+ text_encoding = model.encode_text(text_tokens)
29
+ return text_encoding.cpu()
30
+
31
+
32
+ @torch.no_grad()
33
+ def encode_image(img: Tensor, vae: AutoencoderKL) -> Tensor:
34
+ x = img.to("cuda").to(torch.float16)
35
+
36
+ x = x * 2 - 1 # to make it between -1 and 1.
37
+ encoded = vae.encode(x, return_dict=False)[0].sample()
38
+ return encoded.cpu()
39
+
40
+
41
+ @torch.no_grad()
42
+ def decode_latents(out_latents: torch.FloatTensor, vae: AutoencoderKL) -> Tensor:
43
+ # expected to be in the unscaled latent space
44
+ out = vae.decode(out_latents.cuda())[0].cpu()
45
+
46
+ return ((out + 1) / 2).clip(0, 1)
47
+
48
+
49
+ def quantize_latents(lat: Tensor, clip_val: float = 20) -> Tensor:
50
+ """scale and quantize latents to uint8"""
51
+ lat_norm = lat.clip(-clip_val, clip_val) / clip_val
52
+ return (((lat_norm + 1) / 2) * 255).to(torch.uint8)
53
+
54
+
55
+ def dequantize_latents(lat: Tensor, clip_val: float = 20) -> Tensor:
56
+ lat_norm = (lat.to(torch.float16) / 255) * 2 - 1
57
+ return lat_norm * clip_val
58
+
59
+
60
+ def append_to_dataset(dataset: h5py.File, new_data: Tensor) -> None:
61
+ """Appends new data to an HDF5 dataset."""
62
+ new_size = dataset.shape[0] + new_data.shape[0]
63
+ dataset.resize(new_size, axis=0)
64
+ dataset[-new_data.shape[0] :] = new_data
65
+
66
+
67
+ def get_text_and_latent_embeddings_hdf5(
68
+ dataloader: DataLoader, vae: AutoencoderKL, model: nn.Module, drive_save_path: str
69
+ ) -> None:
70
+ """Process image/text inputs into latent and text embeddings (plus a metadata CSV of prompts/urls), saving encodings as float16."""
71
+
72
+ img_latent_path = os.path.join(drive_save_path, "image_latents.hdf5")
73
+ text_embed_path = os.path.join(drive_save_path, "text_encodings.hdf5")
74
+ metadata_csv_path = os.path.join(drive_save_path, "metadata.csv")
75
+
76
+ with h5py.File(img_latent_path, "a") as img_file, h5py.File(text_embed_path, "a") as text_file:
77
+ if "image_latents" not in img_file:
78
+ img_ds = img_file.create_dataset(
79
+ "image_latents",
80
+ shape=(0, 4, 32, 32),
81
+ maxshape=(None, 4, 32, 32),
82
+ dtype="float16",
83
+ chunks=True,
84
+ )
85
+ else:
86
+ img_ds = img_file["image_latents"]
87
+
88
+ if "text_encodings" not in text_file:
89
+ text_ds = text_file.create_dataset(
90
+ "text_encodings", shape=(0, 768), maxshape=(None, 768), dtype="float16", chunks=True
91
+ )
92
+ else:
93
+ text_ds = text_file["text_encodings"]
94
+
95
+ for img, (label, url) in tqdm(dataloader):
96
+ text_encoding = encode_text(label, model, "cuda").cpu().numpy().astype(np.float16)  # encode_text also expects a device; "cuda" matches encode_image above
97
+ img_encoding = encode_image(img, vae).cpu().numpy().astype(np.float16)
98
+
99
+ append_to_dataset(img_ds, img_encoding)
100
+ append_to_dataset(text_ds, text_encoding)
101
+
102
+ metadata_df = pd.DataFrame({"text": label, "url": url})
103
+ if os.path.exists(metadata_csv_path):
104
+ metadata_df.to_csv(metadata_csv_path, mode="a", header=False, index=False)
105
+ else:
106
+ metadata_df.to_csv(metadata_csv_path, mode="w", header=True, index=False)
107
+
108
+
109
+ def download_and_process_data(
110
+ latent_save_path="latents",
111
+ raw_imgs_save_path="raw_imgs",
112
+ csv_path="imgs.csv",
113
+ image_size=256,
114
+ bs=64,
115
+ caption_col="captions",
116
+ url_col="url",
117
+ download_data=True,
118
+ number_sample_per_shard=10000,
119
+ ):
120
+ if not os.path.exists(raw_imgs_save_path):
121
+ os.mkdir(raw_imgs_save_path)
122
+
123
+ if not os.path.exists(latent_save_path):
124
+ os.mkdir(latent_save_path)
125
+
126
+ if download_data:
127
+ download(
128
+ processes_count=8,
129
+ thread_count=64,
130
+ url_list=csv_path,
131
+ image_size=image_size,
132
+ output_folder=raw_imgs_save_path,
133
+ output_format="webdataset",
134
+ input_format="csv",
135
+ url_col=url_col,
136
+ caption_col=caption_col,
137
+ enable_wandb=False,
138
+ number_sample_per_shard=number_sample_per_shard,
139
+ distributor="multiprocessing",
140
+ resize_mode="center_crop",
141
+ )
142
+
143
+ files = os.listdir(raw_imgs_save_path)
144
+ tar_files = [os.path.join(raw_imgs_save_path, file) for file in files if file.endswith(".tar")]
145
+ print(tar_files)
146
+ dataset = wds.WebDataset(tar_files)
147
+
148
+ transform = transforms.Compose(
149
+ [
150
+ transforms.ToTensor(),
151
+ ]
152
+ )
153
+
154
+ # output is (img_tensor, (caption , url_col)) per batch:
155
+ dataset = (
156
+ dataset.decode("pil")
157
+ .to_tuple("jpg;png", "json")
158
+ .map_tuple(transform, lambda x: (x["caption"], x[url_col]))
159
+ )
160
+
161
+ dataloader = DataLoader(dataset, batch_size=bs, shuffle=False)
162
+
163
+ model, _ = clip.load("ViT-L/14")
164
+
165
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
166
+ vae = vae.to("cuda")
167
+ model.to("cuda")
168
+
169
+ print("Starting to encode latents and text:")
170
+ get_text_and_latent_embeddings_hdf5(dataloader, vae, model, latent_save_path)
171
+ print("Finished encode latents and text:")
172
+
173
+
174
+ @dataclass
175
+ class DataConfiguration:
176
+ data_link: str
177
+ caption_col: str = "caption"
178
+ url_col: str = "url"
179
+ latent_save_path: str = "latents_folder"
180
+ raw_imgs_save_path: str = "raw_imgs_folder"
181
+ use_drive: bool = False
182
+ initial_csv_path: str = "imgs.csv"
183
+ number_sample_per_shard: int = 10000
184
+ image_size: int = 256
185
+ batch_size: int = 64
186
+ download_data: bool = True
187
+
188
+
189
+ if __name__ == "__main__":
190
+ use_wandb = False
191
+
192
+ if use_wandb:
193
+ import wandb
194
+
195
+ os.environ["WANDB_API_KEY"] = "key"
196
+ #!wandb login
197
+
198
+ data_link = "https://huggingface.co/datasets/zzliang/GRIT/resolve/main/grit-20m/coyo_0_snappy.parquet?download=true"
199
+
200
+ data_config = DataConfiguration(
201
+ data_link=data_link,
202
+ latent_save_path="latent_folder",
203
+ raw_imgs_save_path="raw_imgs_folder",
204
+ download_data=False,
205
+ number_sample_per_shard=1000,
206
+ )
207
+
208
+ if use_wandb:
209
+ wandb.init(project="image_vae_processing", entity="apapiu", config=data_config)
210
+
211
+ if not os.path.exists(data_config.latent_save_path):
212
+ os.mkdir(data_config.latent_save_path)
213
+
214
+ config_file_path = os.path.join(data_config.latent_save_path, "config.json")
215
+ with open(config_file_path, "w") as f:
216
+ json.dump(data_config.__dict__, f)
217
+
218
+ print("Config saved to:", config_file_path)
219
+
220
+ df = pd.read_parquet(data_link)
221
+ ### add additional data cleaning here if needed
222
+ df = df.iloc[:3000]
223
+ df[["key", "url", "caption"]].to_csv("imgs.csv", index=None)
224
+
225
+ if data_config.use_drive:
226
+ from google.colab import drive
227
+
228
+ drive.mount("/content/drive")
229
+
230
+ download_and_process_data(
231
+ latent_save_path=data_config.latent_save_path,
232
+ raw_imgs_save_path=data_config.raw_imgs_save_path,
233
+ csv_path=data_config.initial_csv_path,
234
+ image_size=data_config.image_size,
235
+ bs=data_config.batch_size,
236
+ caption_col=data_config.caption_col,
237
+ url_col=data_config.url_col,
238
+ download_data=data_config.download_data,
239
+ number_sample_per_shard=data_config.number_sample_per_shard,
240
+ )
241
+
242
+ if use_wandb:
243
+ wandb.finish()
tld/denoiser.py ADDED
@@ -0,0 +1,123 @@
+ """transformer based denoiser"""
2
+
3
+ import torch
4
+ from einops.layers.torch import Rearrange
5
+ from torch import nn
6
+
7
+ from tld.transformer_blocks import DecoderBlock, MLPSepConv, SinusoidalEmbedding
8
+
9
+
10
+ class DenoiserTransBlock(nn.Module):
11
+ def __init__(
12
+ self,
13
+ patch_size: int,
14
+ img_size: int,
15
+ embed_dim: int,
16
+ dropout: float,
17
+ n_layers: int,
18
+ mlp_multiplier: int = 4,
19
+ n_channels: int = 4,
20
+ ):
21
+ super().__init__()
22
+
23
+ self.patch_size = patch_size
24
+ self.img_size = img_size
25
+ self.n_channels = n_channels
26
+ self.embed_dim = embed_dim
27
+ self.dropout = dropout
28
+ self.n_layers = n_layers
29
+ self.mlp_multiplier = mlp_multiplier
30
+
31
+ seq_len = int((self.img_size / self.patch_size) * (self.img_size / self.patch_size))
32
+ patch_dim = self.n_channels * self.patch_size * self.patch_size
33
+
34
+ self.patchify_and_embed = nn.Sequential(
35
+ nn.Conv2d(
36
+ self.n_channels,
37
+ patch_dim,
38
+ kernel_size=self.patch_size,
39
+ stride=self.patch_size,
40
+ ),
41
+ Rearrange("bs d h w -> bs (h w) d"),
42
+ nn.LayerNorm(patch_dim),
43
+ nn.Linear(patch_dim, self.embed_dim),
44
+ nn.LayerNorm(self.embed_dim),
45
+ )
46
+
47
+ self.rearrange2 = Rearrange(
48
+ "b (h w) (c p1 p2) -> b c (h p1) (w p2)",
49
+ h=int(self.img_size / self.patch_size),
50
+ p1=self.patch_size,
51
+ p2=self.patch_size,
52
+ )
53
+
54
+ self.pos_embed = nn.Embedding(seq_len, self.embed_dim)
55
+ self.register_buffer("precomputed_pos_enc", torch.arange(0, seq_len).long())
56
+
57
+ self.decoder_blocks = nn.ModuleList(
58
+ [
59
+ DecoderBlock(
60
+ embed_dim=self.embed_dim,
61
+ mlp_multiplier=self.mlp_multiplier,
62
+ # note that this is a non-causal block since we are
63
+ # denoising the entire image no need for masking
64
+ is_causal=False,
65
+ dropout_level=self.dropout,
66
+ mlp_class=MLPSepConv,
67
+ )
68
+ for _ in range(self.n_layers)
69
+ ]
70
+ )
71
+
72
+ self.out_proj = nn.Sequential(nn.Linear(self.embed_dim, patch_dim), self.rearrange2)
73
+
74
+ def forward(self, x, cond):
75
+ x = self.patchify_and_embed(x)
76
+ pos_enc = self.precomputed_pos_enc[: x.size(1)].expand(x.size(0), -1)
77
+ x = x + self.pos_embed(pos_enc)
78
+
79
+ for block in self.decoder_blocks:
80
+ x = block(x, cond)
81
+
82
+ return self.out_proj(x)
83
+
84
+
85
+ class Denoiser(nn.Module):
86
+ def __init__(
87
+ self,
88
+ image_size: int,
89
+ noise_embed_dims: int,
90
+ patch_size: int,
91
+ embed_dim: int,
92
+ dropout: float,
93
+ n_layers: int,
94
+ text_emb_size: int = 768,
95
+ ):
96
+ super().__init__()
97
+
98
+ self.image_size = image_size
99
+ self.noise_embed_dims = noise_embed_dims
100
+ self.embed_dim = embed_dim
101
+
102
+ self.fourier_feats = nn.Sequential(
103
+ SinusoidalEmbedding(embedding_dims=noise_embed_dims),
104
+ nn.Linear(noise_embed_dims, self.embed_dim),
105
+ nn.GELU(),
106
+ nn.Linear(self.embed_dim, self.embed_dim),
107
+ )
108
+
109
+ self.denoiser_trans_block = DenoiserTransBlock(patch_size, image_size, embed_dim, dropout, n_layers)
110
+ self.norm = nn.LayerNorm(self.embed_dim)
111
+ self.label_proj = nn.Linear(text_emb_size, self.embed_dim)
112
+
113
+ def forward(self, x, noise_level, label):
114
+ noise_level = self.fourier_feats(noise_level).unsqueeze(1)
115
+
116
+ label = self.label_proj(label).unsqueeze(1)
117
+
118
+ noise_label_emb = torch.cat([noise_level, label], dim=1) # bs, 2, d
119
+ noise_label_emb = self.norm(noise_label_emb)
120
+
121
+ x = self.denoiser_trans_block(x, noise_label_emb)
122
+
123
+ return x
tld/diffusion.py ADDED
@@ -0,0 +1,198 @@
+ from dataclasses import dataclass
2
+
3
+ import clip
4
+ import numpy as np
5
+ import requests
6
+ import torch
7
+ import torchvision.transforms as transforms
8
+ import torchvision.utils as vutils
9
+ from diffusers import AutoencoderKL
10
+ from torch import Tensor
11
+ from tqdm import tqdm
12
+
13
+ from tld.denoiser import Denoiser
14
+
15
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
16
+ to_pil = transforms.ToPILImage()
17
+
18
+
19
+ @dataclass
20
+ class DiffusionGenerator:
21
+ model: Denoiser
22
+ vae: AutoencoderKL
23
+ device: torch.device
24
+ model_dtype: torch.dtype = torch.float32
25
+
26
+ @torch.no_grad()
27
+ def generate(
28
+ self,
29
+ labels: Tensor, # embeddings to condition on
30
+ n_iter: int = 30,
31
+ num_imgs: int = 16,
32
+ class_guidance: float = 3,
33
+ seed: int = 10,
34
+ scale_factor: int = 8, # latent scaling before decoding - should be ~ std of latent space
35
+ img_size: int = 32, # height, width of latent
36
+ sharp_f: float = 0.1,
37
+ bright_f: float = 0.1,
38
+ exponent: float = 1,
39
+ seeds: Tensor | None = None,
40
+ noise_levels=None,
41
+ use_ddpm_plus: bool = True,
42
+ ):
43
+ """Generate images via reverse diffusion.
44
+ if use_ddpm_plus=True uses Algorithm 2 DPM-Solver++(2M) here: https://arxiv.org/pdf/2211.01095.pdf
45
+ else use ddim with alpha = 1-sigma
46
+ """
47
+ if noise_levels is None:
48
+ noise_levels = (1 - torch.pow(torch.arange(0, 1, 1 / n_iter), exponent)).tolist()
49
+ noise_levels[0] = 0.99
50
+
51
+ if use_ddpm_plus:
52
+ lambdas = [np.log((1 - sigma) / sigma) for sigma in noise_levels] # log snr
53
+ hs = [lambdas[i] - lambdas[i - 1] for i in range(1, len(lambdas))]
54
+ rs = [hs[i - 1] / hs[i] for i in range(1, len(hs))]
55
+
56
+ x_t = self.initialize_image(seeds, num_imgs, img_size, seed)
57
+
58
+ labels = torch.cat([labels, torch.zeros_like(labels)])
59
+ self.model.eval()
60
+
61
+ x0_pred_prev = None
62
+
63
+ for i in tqdm(range(len(noise_levels) - 1)):
64
+ curr_noise, next_noise = noise_levels[i], noise_levels[i + 1]
65
+
66
+ x0_pred = self.pred_image(x_t, labels, curr_noise, class_guidance)
67
+
68
+ if x0_pred_prev is None:
69
+ x_t = ((curr_noise - next_noise) * x0_pred + next_noise * x_t) / curr_noise
70
+ else:
71
+ if use_ddpm_plus:
72
+ # x0_pred is a combination of the two previous x0_pred:
73
+ D = (1 + 1 / (2 * rs[i - 1])) * x0_pred - (1 / (2 * rs[i - 1])) * x0_pred_prev
74
+ else:
75
+ # ddim:
76
+ D = x0_pred
77
+
78
+ x_t = ((curr_noise - next_noise) * D + next_noise * x_t) / curr_noise
79
+
80
+ x0_pred_prev = x0_pred
81
+
82
+ x0_pred = self.pred_image(x_t, labels, next_noise, class_guidance)
83
+
84
+ # shifting latents works a bit like an image editor:
85
+ x0_pred[:, 3, :, :] += sharp_f
86
+ x0_pred[:, 0, :, :] += bright_f
87
+
88
+ x0_pred_img = self.vae.decode((x0_pred * scale_factor).to(self.model_dtype))[0].cpu()
89
+ return x0_pred_img, x0_pred
90
+
91
+ def pred_image(self, noisy_image, labels, noise_level, class_guidance):
92
+ num_imgs = noisy_image.size(0)
93
+ noises = torch.full((2 * num_imgs, 1), noise_level)
94
+ x0_pred = self.model(
95
+ torch.cat([noisy_image, noisy_image]),
96
+ noises.to(self.device, self.model_dtype),
97
+ labels.to(self.device, self.model_dtype),
98
+ )
99
+ x0_pred = self.apply_classifier_free_guidance(x0_pred, num_imgs, class_guidance)
100
+ return x0_pred
101
+
102
+ def initialize_image(self, seeds, num_imgs, img_size, seed):
103
+ """Initialize the seed tensor."""
104
+ if seeds is None:
105
+ generator = torch.Generator(device=self.device)
106
+ generator.manual_seed(seed)
107
+ return torch.randn(
108
+ num_imgs,
109
+ 4,
110
+ img_size,
111
+ img_size,
112
+ dtype=self.model_dtype,
113
+ device=self.device,
114
+ generator=generator,
115
+ )
116
+ else:
117
+ return seeds.to(self.device, self.model_dtype)
118
+
119
+ def apply_classifier_free_guidance(self, x0_pred, num_imgs, class_guidance):
120
+ """Apply classifier-free guidance to the predictions."""
121
+ x0_pred_label, x0_pred_no_label = x0_pred[:num_imgs], x0_pred[num_imgs:]
122
+ return class_guidance * x0_pred_label + (1 - class_guidance) * x0_pred_no_label
123
+
124
+
125
+ @dataclass
126
+ class LTDConfig:
127
+ vae_scale_factor: float = 8
128
+ img_size: int = 32
129
+ model_dtype: torch.dtype = torch.float32
130
+ file_url: str = None # = "https://huggingface.co/apapiu/small_ldt/resolve/main/state_dict_378000.pth"
131
+ local_filename: str = "state_dict_378000.pth"
132
+ vae_name: str = "madebyollin/sdxl-vae-fp16-fix"
133
+ clip_model_name: str = "ViT-L/14"
134
+ denoiser: Denoiser = Denoiser(
135
+ image_size=32,
136
+ noise_embed_dims=256,
137
+ patch_size=2,
138
+ embed_dim=256,
139
+ dropout=0,
140
+ n_layers=4,
141
+ )
142
+
143
+
144
+ def download_file(url, filename):
145
+ with requests.get(url, stream=True) as r:
146
+ r.raise_for_status()
147
+ with open(filename, "wb") as f:
148
+ for chunk in r.iter_content(chunk_size=8192):
149
+ f.write(chunk)
150
+
151
+
152
+ @torch.no_grad()
153
+ def encode_text(label, model):
154
+ text_tokens = clip.tokenize(label, truncate=True).to(device)
155
+ text_encoding = model.encode_text(text_tokens)
156
+ return text_encoding.cpu()
157
+
158
+
159
+ class DiffusionTransformer:
160
+ def __init__(self, config: LTDConfig):
161
+ denoiser = config.denoiser.to(config.model_dtype)
162
+
163
+ if config.file_url is not None:
164
+ print(f"Downloading model from {config.file_url}")
165
+ download_file(config.file_url, config.local_filename)
166
+ state_dict = torch.load(config.local_filename, map_location=torch.device("cpu"))
167
+ denoiser.load_state_dict(state_dict)
168
+
169
+ denoiser = denoiser.to(device)
170
+
171
+ vae = AutoencoderKL.from_pretrained(config.vae_name, torch_dtype=config.model_dtype).to(device)
172
+
173
+ self.clip_model, preprocess = clip.load(config.clip_model_name)
174
+ self.clip_model = self.clip_model.to(device)
175
+
176
+ self.diffuser = DiffusionGenerator(denoiser, vae, device, config.model_dtype)
177
+
178
+ def generate_image_from_text(
179
+ self, prompt: str, class_guidance=6, seed=11, num_imgs=1, img_size=32, n_iter=15
180
+ ):
181
+ nrow = int(np.sqrt(num_imgs))
182
+
183
+ cur_prompts = [prompt] * num_imgs
184
+ labels = encode_text(cur_prompts, self.clip_model)
185
+ out, out_latent = self.diffuser.generate(
186
+ labels=labels,
187
+ num_imgs=num_imgs,
188
+ class_guidance=class_guidance,
189
+ seed=seed,
190
+ n_iter=n_iter,
191
+ exponent=1,
192
+ scale_factor=8,
193
+ sharp_f=0,
194
+ bright_f=0,
195
+ )
196
+
197
+ out = to_pil((vutils.make_grid((out + 1) / 2, nrow=nrow, padding=4)).float().clip(0, 1))
198
+ return out
tld/gradio_app.py ADDED
@@ -0,0 +1,40 @@
+ import os
2
+ from io import BytesIO
3
+
4
+ import gradio as gr
5
+ import requests
6
+ from PIL import Image
7
+
8
+ # runpod_id = os.environ['RUNPOD_ID']
9
+ # token_id = os.environ['AUTH_TOKEN']
10
+ # url = f'https://{runpod_id}-8000.proxy.runpod.net/generate-image/'
11
+
12
+ url = os.getenv("API_URL")
13
+ token_id = os.getenv("API_TOKEN")
14
+
15
+
16
+ def generate_image_from_text(prompt, class_guidance):
17
+ headers = {"Authorization": f"Bearer {token_id}"}
18
+
19
+ data = {"prompt": prompt, "class_guidance": class_guidance, "seed": 11, "num_imgs": 1, "img_size": 32}
20
+
21
+ response = requests.post(url, json=data, headers=headers)
22
+
23
+ if response.status_code == 200:
+ image = Image.open(BytesIO(response.content))
+ else:
+ print("Failed to fetch image:", response.status_code, response.text)
+ raise gr.Error(f"Image generation failed with status {response.status_code}")
+
+ return image
29
+
30
+
31
+ iface = gr.Interface(
32
+ fn=generate_image_from_text,
33
+ inputs=["text", "slider"],
34
+ outputs="image",
35
+ title="Text-to-Image Generator",
36
+ description="Enter a text prompt to generate an image.",
37
+ )
38
+
39
+ # Launch the app
40
+ iface.launch()
tld/img_examples/a beautiful woman with blonde hair in her 50s_cfg_7_seed_11.png ADDED
tld/img_examples/a cute grey great owl_cfg_8_seed_11.png ADDED
tld/img_examples/a lake in mountains in the fall at sunset_cfg_7_seed_11.png ADDED
tld/img_examples/a woman cyborg with red curly hair, 8k_cfg_9.5_seed_11.png ADDED
tld/img_examples/an aerial view of manhattan, isometric view, as pantinted by mondrian_cfg_7_seed_11.png ADDED
tld/img_examples/isometric view of small japanese village with blooming trees_cfg_7_seed_11.png ADDED
tld/img_examples/painting of a cute fox in a suit in a field of poppies_cfg_8_seed_11.png ADDED
tld/img_examples/painting of a cyberpunk market_cfg_7_seed_11.png ADDED
tld/img_examples/watercolor of a cute cat riding a motorcycle_cfg_7_seed_11.png ADDED
tld/train.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/env python3
2
+
3
+ import copy
4
+ from dataclasses import asdict, dataclass
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torchvision
9
+ import torchvision.utils as vutils
10
+ import wandb
11
+ from accelerate import Accelerator
12
+ from diffusers import AutoencoderKL
13
+ from PIL.Image import Image
14
+ from torch import Tensor, nn
15
+ from torch.utils.data import DataLoader, TensorDataset
16
+ from tqdm import tqdm
17
+
18
+ from tld.denoiser import Denoiser
19
+ from tld.diffusion import DiffusionGenerator
20
+
21
+
22
+ def eval_gen(diffuser: DiffusionGenerator, labels: Tensor) -> Image:
23
+ class_guidance = 4.5
24
+ seed = 10
25
+ out, _ = diffuser.generate(
26
+ labels=torch.repeat_interleave(labels, 8, dim=0),
27
+ num_imgs=64,
28
+ class_guidance=class_guidance,
29
+ seed=seed,
30
+ n_iter=40,
31
+ exponent=1,
32
+ sharp_f=0.1,
33
+ )
34
+
35
+ out = to_pil((vutils.make_grid((out + 1) / 2, nrow=8, padding=4)).float().clip(0, 1))
36
+ out.save(f"emb_val_cfg:{class_guidance}_seed:{seed}.png")
37
+
38
+ return out
39
+
40
+
41
+ def count_parameters(model: nn.Module):
42
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
43
+
44
+
45
+ def count_parameters_per_layer(model: nn.Module):
46
+ for name, param in model.named_parameters():
47
+ print(f"{name}: {param.numel()} parameters")
48
+
49
+
50
+ to_pil = torchvision.transforms.ToPILImage()
51
+
52
+
53
+ def update_ema(ema_model: nn.Module, model: nn.Module, alpha: float = 0.999):
54
+ with torch.no_grad():
55
+ for ema_param, model_param in zip(ema_model.parameters(), model.parameters()):
56
+ ema_param.data.mul_(alpha).add_(model_param.data, alpha=1 - alpha)
57
+
58
+
59
+ @dataclass
60
+ class ModelConfig:
61
+ embed_dim: int = 512
62
+ n_layers: int = 6
63
+ clip_embed_size: int = 768
64
+ scaling_factor: int = 8
65
+ patch_size: int = 2
66
+ image_size: int = 32
67
+ n_channels: int = 4
68
+ dropout: float = 0
69
+ mlp_multiplier: int = 4
70
+ batch_size: int = 128
71
+ class_guidance: int = 3
72
+ lr: float = 3e-4
73
+ n_epoch: int = 100
74
+ alpha: float = 0.999
75
+ noise_embed_dims: int = 128
76
+ diffusion_n_iter: int = 35
77
+ from_scratch: bool = True
78
+ run_id: str = ""
79
+ model_name: str = ""
80
+ beta_a: float = 0.75
81
+ beta_b: float = 0.75
82
+ save_and_eval_every_iters: int = 1000
83
+
84
+
85
+ @dataclass
86
+ class DataConfig:
87
+ latent_path: str # path to a numpy file containing latents
88
+ text_emb_path: str
89
+ val_path: str
90
+
91
+
92
+ def main(config: ModelConfig, dataconfig: DataConfig) -> None:
93
+ """main train loop to be used with accelerate"""
94
+
95
+ accelerator = Accelerator(mixed_precision="fp16", log_with="wandb")
96
+
97
+ accelerator.print("Loading Data:")
98
+ latent_train_data = torch.tensor(np.load(dataconfig.latent_path), dtype=torch.float32)
99
+ train_label_embeddings = torch.tensor(np.load(dataconfig.text_emb_path), dtype=torch.float32)
100
+ emb_val = torch.tensor(np.load(dataconfig.val_path), dtype=torch.float32)
101
+ dataset = TensorDataset(latent_train_data, train_label_embeddings)
102
+ train_loader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
103
+
104
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
105
+
106
+ if accelerator.is_main_process:
107
+ vae = vae.to(accelerator.device)
108
+
109
+ model = Denoiser(
110
+ image_size=config.image_size,
111
+ noise_embed_dims=config.noise_embed_dims,
112
+ patch_size=config.patch_size,
113
+ embed_dim=config.embed_dim,
114
+ dropout=config.dropout,
115
+ n_layers=config.n_layers,
116
+ )
117
+
118
+ loss_fn = nn.MSELoss()
119
+ optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
120
+
121
+ accelerator.print("Compiling model:")
122
+ model = torch.compile(model)
123
+
124
+ if not config.from_scratch:
125
+ accelerator.print("Loading Model:")
126
+ wandb.restore(
127
+ config.model_name, run_path=f"apapiu/cifar_diffusion/runs/{config.run_id}", replace=True
128
+ )
129
+ full_state_dict = torch.load(config.model_name)
130
+ model.load_state_dict(full_state_dict["model_ema"])
131
+ optimizer.load_state_dict(full_state_dict["opt_state"])
132
+ global_step = full_state_dict["global_step"]
133
+ else:
134
+ global_step = 0
135
+
136
+ if accelerator.is_local_main_process:
137
+ ema_model = copy.deepcopy(model).to(accelerator.device)
138
+ diffuser = DiffusionGenerator(ema_model, vae, accelerator.device, torch.float32)
139
+
140
+ accelerator.print("model prep")
141
+ model, train_loader, optimizer = accelerator.prepare(model, train_loader, optimizer)
142
+
143
+ accelerator.init_trackers(project_name="cifar_diffusion", config=asdict(config))
144
+
145
+ accelerator.print(count_parameters(model))
146
+ count_parameters_per_layer(model)  # prints per-layer counts itself; returns None
147
+
148
+ # training loop:
149
+ for i in range(1, config.n_epoch + 1):
150
+ accelerator.print(f"epoch: {i}")
151
+
152
+ for x, y in tqdm(train_loader):
153
+ x = x / config.scaling_factor
154
+
155
+ noise_level = torch.tensor(
156
+ np.random.beta(config.beta_a, config.beta_b, len(x)), device=accelerator.device
157
+ )
158
+ signal_level = 1 - noise_level
159
+ noise = torch.randn_like(x)
160
+
161
+ x_noisy = noise_level.view(-1, 1, 1, 1) * noise + signal_level.view(-1, 1, 1, 1) * x
162
+
163
+ x_noisy = x_noisy.float()
164
+ noise_level = noise_level.float()
165
+ label = y
166
+
167
+ prob = 0.15  # probability of dropping the text conditioning (classifier-free guidance training)
168
+ mask = torch.rand(y.size(0), device=accelerator.device) < prob
169
+ label[mask] = 0  # zero out dropped conditioning (a learned replacement vector would also work)
170
+
171
+ if global_step % config.save_and_eval_every_iters == 0:
172
+ accelerator.wait_for_everyone()
173
+ if accelerator.is_main_process:
174
+ # evaluation and checkpoint saving:
175
+ out = eval_gen(diffuser=diffuser, labels=emb_val)
176
+ out.save("img.jpg")
177
+ accelerator.log({f"step: {global_step}": wandb.Image("img.jpg")})
178
+
179
+ opt_unwrapped = accelerator.unwrap_model(optimizer)
180
+ full_state_dict = {
181
+ "model_ema": ema_model.state_dict(),
182
+ "opt_state": opt_unwrapped.state_dict(),
183
+ "global_step": global_step,
184
+ }
185
+ accelerator.save(full_state_dict, config.model_name)
186
+ wandb.save(config.model_name)
187
+
188
+ model.train()
189
+
190
+ with accelerator.accumulate(model):  # pass the model so accelerate can manage gradient sync when accumulating
191
+ # training step:
192
+ optimizer.zero_grad()
193
+
194
+ pred = model(x_noisy, noise_level.view(-1, 1), label)
195
+ loss = loss_fn(pred, x)
196
+ accelerator.log({"train_loss": loss.item()}, step=global_step)
197
+ accelerator.backward(loss)
198
+ optimizer.step()
199
+
200
+ if accelerator.is_main_process:
201
+ update_ema(ema_model, model, alpha=config.alpha)
202
+
203
+ global_step += 1
204
+ accelerator.end_training()
205
+
206
+
207
+ # args = (config, data_path, val_path)
208
+ # notebook_launcher(training_loop)
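The main() above takes a ModelConfig and a DataConfig and is meant to run under Hugging Face accelerate (the Accelerator also logs to wandb, so a wandb login is assumed). A minimal launch sketch; the .npy paths and settings below are illustrative assumptions, not files shipped in this commit:

    from accelerate import notebook_launcher  # or run the script via `accelerate launch`

    data_config = DataConfig(
        latent_path="latents.npy",       # hypothetical precomputed VAE latents
        text_emb_path="text_emb.npy",    # hypothetical CLIP text embeddings
        val_path="val_emb.npy",          # hypothetical validation text embeddings
    )
    config = ModelConfig(model_name="model.pt", from_scratch=True)

    notebook_launcher(main, (config, data_config), num_processes=1)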
tld/transformer_blocks.py ADDED
@@ -0,0 +1,139 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+
7
+ class SinusoidalEmbedding(nn.Module):
8
+ def __init__(self, emb_min_freq=1.0, emb_max_freq=1000.0, embedding_dims=32):
9
+ super(SinusoidalEmbedding, self).__init__()
10
+
11
+ frequencies = torch.exp(
12
+ torch.linspace(np.log(emb_min_freq), np.log(emb_max_freq), embedding_dims // 2)
13
+ )
14
+
15
+ self.register_buffer("angular_speeds", 2.0 * torch.pi * frequencies)
16
+
17
+ def forward(self, x):
18
+ embeddings = torch.cat(
19
+ [torch.sin(self.angular_speeds * x), torch.cos(self.angular_speeds * x)], dim=-1
20
+ )
21
+ return embeddings
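A quick shape check for the noise-level embedding, assuming the denoiser passes one scalar noise level per example shaped (bs, 1), as noise_level.view(-1, 1) in the training loop above suggests; illustrative only:

    emb = SinusoidalEmbedding(embedding_dims=128)
    noise_levels = torch.rand(16, 1)     # one scalar noise level per example
    print(emb(noise_levels).shape)       # torch.Size([16, 128])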
22
+
23
+
24
+ class MHAttention(nn.Module):
25
+ def __init__(self, is_causal=False, dropout_level=0.0, n_heads=4):
26
+ super().__init__()
27
+ self.is_causal = is_causal
28
+ self.dropout_level = dropout_level
29
+ self.n_heads = n_heads
30
+
31
+ def forward(self, q, k, v, attn_mask=None):
32
+ assert q.size(-1) == k.size(-1)
33
+ assert k.size(-2) == v.size(-2)
34
+
35
+ q, k, v = [rearrange(x, "bs n (h d) -> bs h n d", h=self.n_heads) for x in [q, k, v]]
36
+
37
+ out = nn.functional.scaled_dot_product_attention(
38
+ q,
39
+ k,
40
+ v,
41
+ attn_mask=attn_mask,
42
+ is_causal=self.is_causal,
43
+ dropout_p=self.dropout_level if self.training else 0,
44
+ )
45
+
46
+ out = rearrange(out, "bs h n d -> bs n (h d)", h=self.n_heads)
47
+
48
+ return out
49
+
50
+
51
+ class SelfAttention(nn.Module):
52
+ def __init__(self, embed_dim, is_causal=False, dropout_level=0.0, n_heads=4):
53
+ super().__init__()
54
+ self.qkv_linear = nn.Linear(embed_dim, 3 * embed_dim, bias=False)
55
+ self.mha = MHAttention(is_causal, dropout_level, n_heads)
56
+
57
+ def forward(self, x):
58
+ q, k, v = self.qkv_linear(x).chunk(3, dim=2)
59
+ return self.mha(q, k, v)
60
+
61
+
62
+ class CrossAttention(nn.Module):
63
+ def __init__(self, embed_dim, is_causal=False, dropout_level=0, n_heads=4):
64
+ super().__init__()
65
+ self.kv_linear = nn.Linear(embed_dim, 2 * embed_dim, bias=False)
66
+ self.q_linear = nn.Linear(embed_dim, embed_dim, bias=False)
67
+ self.mha = MHAttention(is_causal, dropout_level, n_heads)
68
+
69
+ def forward(self, x, y):
70
+ q = self.q_linear(x)
71
+ k, v = self.kv_linear(y).chunk(2, dim=2)
72
+ return self.mha(q, k, v)
73
+
74
+
75
+ class MLP(nn.Module):
76
+ def __init__(self, embed_dim, mlp_multiplier, dropout_level):
77
+ super().__init__()
78
+ self.mlp = nn.Sequential(
79
+ nn.Linear(embed_dim, mlp_multiplier * embed_dim),
80
+ nn.GELU(),
81
+ nn.Linear(mlp_multiplier * embed_dim, embed_dim),
82
+ nn.Dropout(dropout_level),
83
+ )
84
+
85
+ def forward(self, x):
86
+ return self.mlp(x)
87
+
88
+
89
+ class MLPSepConv(nn.Module):
90
+ def __init__(self, embed_dim, mlp_multiplier, dropout_level):
91
+ """see: https://github.com/ofsoundof/LocalViT"""
92
+ super().__init__()
93
+ self.mlp = nn.Sequential(
94
+ # this Conv with kernel size 1 is equivalent to the Linear layer in a "regular" transformer MLP
95
+ nn.Conv2d(embed_dim, mlp_multiplier * embed_dim, kernel_size=1, padding="same"),
96
+ nn.Conv2d(
97
+ mlp_multiplier * embed_dim,
98
+ mlp_multiplier * embed_dim,
99
+ kernel_size=3,
100
+ padding="same",
101
+ groups=mlp_multiplier * embed_dim,
102
+ ), # <- depthwise conv
103
+ nn.GELU(),
104
+ nn.Conv2d(mlp_multiplier * embed_dim, embed_dim, kernel_size=1, padding="same"),
105
+ nn.Dropout(dropout_level),
106
+ )
107
+
108
+ def forward(self, x):
109
+ w = h = int(np.sqrt(x.size(1))) # only square images for now
110
+ x = rearrange(x, "bs (h w) d -> bs d h w", h=h, w=w)
111
+ x = self.mlp(x)
112
+ x = rearrange(x, "bs d h w -> bs (h w) d")
113
+ return x
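The comment above claims the kernel-size-1 convolution matches the Linear layer of a standard transformer MLP; a tiny equivalence check (illustrative, not part of the commit) that should print True up to floating-point tolerance:

    lin = nn.Linear(8, 16)
    conv = nn.Conv2d(8, 16, kernel_size=1)
    conv.weight.data = lin.weight.data.view(16, 8, 1, 1).clone()
    conv.bias.data = lin.bias.data.clone()
    tokens = torch.randn(2, 4 * 4, 8)                        # (bs, h*w, d)
    as_image = rearrange(tokens, "bs (h w) d -> bs d h w", h=4, w=4)
    out_conv = rearrange(conv(as_image), "bs d h w -> bs (h w) d")
    print(torch.allclose(lin(tokens), out_conv, atol=1e-6))  # True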
114
+
115
+
116
+ class DecoderBlock(nn.Module):
117
+ def __init__(
118
+ self,
119
+ embed_dim: int,
120
+ is_causal: bool,
121
+ mlp_multiplier: int,
122
+ dropout_level: float,
123
+ mlp_class: type[MLP] | type[MLPSepConv],
124
+ ):
125
+ super().__init__()
126
+ self.self_attention = SelfAttention(embed_dim, is_causal, dropout_level, n_heads=embed_dim // 64)
127
+ self.cross_attention = CrossAttention(
128
+ embed_dim, is_causal=False, dropout_level=0, n_heads=embed_dim // 64
129
+ )
130
+ self.mlp = mlp_class(embed_dim, mlp_multiplier, dropout_level)
131
+ self.norm1 = nn.LayerNorm(embed_dim)
132
+ self.norm2 = nn.LayerNorm(embed_dim)
133
+ self.norm3 = nn.LayerNorm(embed_dim)
134
+
135
+ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
136
+ x = self.self_attention(self.norm1(x)) + x
137
+ x = self.cross_attention(self.norm2(x), y) + x
138
+ x = self.mlp(self.norm3(x)) + x
139
+ return x
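A minimal sketch of how these blocks compose, assuming the conditioning sequence y has already been projected to embed_dim (presumably done by the Denoiser, whose code is not shown here) and that the token count is a perfect square as MLPSepConv requires; shapes below are assumptions based on ModelConfig above, not code from this commit:

    embed_dim = 512
    block = DecoderBlock(
        embed_dim=embed_dim,
        is_causal=False,
        mlp_multiplier=4,
        dropout_level=0.0,
        mlp_class=MLPSepConv,
    )
    x = torch.randn(2, 16 * 16, embed_dim)   # image tokens: (bs, seq, d); seq must be a square number
    y = torch.randn(2, 1, embed_dim)         # conditioning token(s), already at embed_dim
    print(block(x, y).shape)                 # torch.Size([2, 256, 512])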