File size: 8,956 Bytes

87c126b

import torch
import torch.nn as nn
import numpy as np
import math

from einops import rearrange
from pdb import set_trace as st

# from .dit_models import DiT, DiTBlock, DiT_models, get_2d_sincos_pos_embed, modulate, FinalLayer

from .dit_models_xformers import DiT, DiTBlock, DiT_models, get_2d_sincos_pos_embed, modulate, FinalLayer
# from .dit_models import DiT, DiTBlock, DiT_models, get_2d_sincos_pos_embed, modulate, FinalLayer


def modulate2(x, shift, scale):
    """Apply an affine modulation: scale ``x`` by ``1 + scale``, then add ``shift``.

    The operands are combined with plain ``*`` and ``+``; nothing is
    unsqueezed or reshaped here, so the arguments must already be
    compatible for element-wise arithmetic.
    """
    scaled = x * (1 + scale)
    return scaled + shift


class DiTBlock2(DiTBlock):
    """
    DiT block with adaLN-Zero style conditioning, where the shift / scale /
    gate terms are applied to the tokens through ``modulate2`` (plain
    element-wise arithmetic, no extra dimension inserted).
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4, **block_kwargs):
        # Pure pass-through: norm1/norm2, attn, mlp and adaLN_modulation used
        # in forward() are all provided by the parent class.
        super().__init__(hidden_size, num_heads, mlp_ratio, **block_kwargs)

    def forward(self, x, c):
        """
        Run the attention and MLP residual branches on ``x``, each modulated
        and gated by terms derived from the conditioning ``c``.

        ``self.adaLN_modulation(c)`` is split into six equal chunks along the
        last dimension: (shift, scale, gate) for attention followed by
        (shift, scale, gate) for the MLP.
        # NOTE(review): assumes the chunks are element-wise compatible with
        # x (e.g. per-token conditioning) — confirm against callers.
        """
        modulation = self.adaLN_modulation(c)
        (shift_msa, scale_msa, gate_msa,
         shift_mlp, scale_mlp, gate_mlp) = modulation.chunk(6, dim=-1)

        # Attention branch: normalise -> modulate -> attend -> gated residual.
        attn_in = modulate2(self.norm1(x), shift_msa, scale_msa)
        x = x + gate_msa * self.attn(attn_in)

        # MLP branch, computed on the already-updated x.
        mlp_in = modulate2(self.norm2(x), shift_mlp, scale_mlp)
        x = x + gate_mlp * self.mlp(mlp_in)
        return x


class FinalLayer2(FinalLayer):
    """
    Final DiT layer (the decoder_pred of MAE, with adaLN): modulated final
    norm followed by the linear projection, using ``modulate2`` for the
    modulation step.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        # Pure pass-through: norm_final, linear and adaLN_modulation used in
        # forward() are all provided by the parent class.
        super().__init__(hidden_size, patch_size, out_channels)

    def forward(self, x, c):
        """
        Normalise ``x``, modulate it with the (shift, scale) pair obtained by
        splitting ``self.adaLN_modulation(c)`` in two along the last
        dimension, and return the result of ``self.linear``.
        """
        shift_scale = self.adaLN_modulation(c)
        shift, scale = shift_scale.chunk(2, dim=-1)
        normed = self.norm_final(x)
        return self.linear(modulate2(normed, shift, scale))


class DiT2(DiT):
    """
    A conditional ViT built on top of DiT.

    The token stream fed to the blocks is the positional embedding alone
    (repeated over the batch); the input ``c`` only acts as conditioning for
    every block. The ``x_embedder``, ``t_embedder`` and ``final_layer``
    attributes left by the parent constructor are deleted, so ``forward``
    returns block features rather than a projected / unpatchified output.
    """

    def __init__(self,
                 input_size=32,
                 patch_size=2,
                 in_channels=4,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 mlp_ratio=4,
                 class_dropout_prob=0.1,
                 num_classes=1000,
                 learn_sigma=True,
                 mixing_logit_init=-3,
                 mixed_prediction=True,
                 context_dim=False,
                 roll_out=False,
                 plane_n=3,
                 return_all_layers=False,
                 vit_blk=...):
        """
        All arguments up to ``roll_out`` are forwarded positionally to the
        ``DiT`` constructor; see that class for their meaning.

        plane_n: group count ``n`` used to split the token axis as
            ``(n l)`` in ``forward`` when ``self.roll_out`` is truthy.
        return_all_layers: when true, ``forward`` returns a list with one
            entry per block instead of only the last block's output.
        vit_blk: accepted for signature compatibility but ignored; the
            blocks are always built with ``DiTBlock2`` and the final layer
            with ``FinalLayer2``.
        """
        super().__init__(input_size,
                         patch_size,
                         in_channels,
                         hidden_size,
                         depth,
                         num_heads,
                         mlp_ratio,
                         class_dropout_prob,
                         num_classes,
                         learn_sigma,
                         mixing_logit_init,
                         mixed_prediction,
                         context_dim,
                         roll_out,
                         vit_blk=DiTBlock2,
                         final_layer_blk=FinalLayer2)

        # forward() never uses the patch/timestep embedders or the final
        # layer, so drop them and release any cached CUDA memory.
        del self.x_embedder
        del self.t_embedder
        del self.final_layer
        torch.cuda.empty_cache()
        self.clip_text_proj = None
        self.plane_n = plane_n
        self.return_all_layers = return_all_layers

    def forward(self, c, *args, **kwargs):
        """
        Run the block stack on the positional embedding, conditioned on ``c``.

        c: conditioning tensor. Its leading dimension is used as the batch
            size. When ``self.roll_out`` is set it is rearranged with the
            pattern ``'b (n l) c -> (b n) l c'`` (``n = self.plane_n``), so it
            must be 3-D with a second axis divisible by ``plane_n``.
            # NOTE(review): presumably per-token conditioning with the same
            # token count as pos_embed — confirm against callers.
        *args, **kwargs: accepted and ignored.

        With ``roll_out``, even-indexed blocks run on the ``(b n) l c``
        layout (attention within each plane) and odd-indexed blocks on the
        ``b (n l) c`` layout (attention across all planes). The returned
        ``x`` stays in whichever layout the last block used.

        Returns the output of the last block, or, if ``return_all_layers``
        is set, a list with every block's output. In roll-out mode all list
        entries are in the ``(b n) l c`` layout.
        """
        x = self.pos_embed.repeat(c.shape[0], 1, 1)

        all_layers = [] if self.return_all_layers else None

        # The per-plane view of the conditioning does not change between
        # blocks, so build it once instead of once per even-indexed block.
        c_per_plane = None
        if self.roll_out:
            c_per_plane = rearrange(c,
                                    'b (n l) c -> (b n) l c ',
                                    n=self.plane_n)

        for blk_idx, block in enumerate(self.blocks):
            if not self.roll_out:
                x = block(x, c)
                # Previously nothing was collected on this path, so
                # return_all_layers=True yielded an empty list.
                if all_layers is not None:
                    all_layers.append(x)
                continue

            if blk_idx % 2 == 0:  # within-plane self attention
                x = rearrange(x, 'b (n l) c -> (b n) l c ', n=self.plane_n)
                x = block(x, c_per_plane)
                if all_layers is not None:
                    all_layers.append(x)
            else:  # global attention across all planes
                x = rearrange(x, '(b n) l c -> b (n l) c ', n=self.plane_n)
                x = block(x, c)
                if all_layers is not None:
                    # Store with the planes merged into the batch dim so all
                    # collected entries share one layout.
                    all_layers.append(
                        rearrange(x,
                                  'b (n l) c -> (b n) l c',
                                  n=self.plane_n))

        # No final layer / unpatchify step: those modules were removed in
        # __init__, so the raw block features are returned.
        if all_layers is not None:
            return all_layers
        return x


# class DiT2_DPT(DiT2):
#     def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4, class_dropout_prob=0.1, num_classes=1000, learn_sigma=True, mixing_logit_init=-3, mixed_prediction=True, context_dim=False, roll_out=False, plane_n=3, vit_blk=...):
#         super().__init__(input_size, patch_size, in_channels, hidden_size, depth, num_heads, mlp_ratio, class_dropout_prob, num_classes, learn_sigma, mixing_logit_init, mixed_prediction, context_dim, roll_out, plane_n, vit_blk)
#         self.return_all_layers = True

#################################################################################
#                                   DiT2 Configs                                  #
#################################################################################


def DiT2_XL_2(**kwargs):
    """Build a DiT2 with depth=28, hidden_size=1152, patch_size=2, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=28, hidden_size=1152, patch_size=2, num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_XL_2_half(**kwargs):
    """Build a DiT2 with half the XL depth (28 // 2), hidden_size=1152,
    patch_size=2, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=28 // 2,
                  hidden_size=1152,
                  patch_size=2,
                  num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_XL_4(**kwargs):
    """Build a DiT2 with depth=28, hidden_size=1152, patch_size=4, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=28, hidden_size=1152, patch_size=4, num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_XL_8(**kwargs):
    """Build a DiT2 with depth=28, hidden_size=1152, patch_size=8, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=28, hidden_size=1152, patch_size=8, num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_L_2(**kwargs):
    """Build a DiT2 with depth=24, hidden_size=1024, patch_size=2, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=24, hidden_size=1024, patch_size=2, num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_L_2_half(**kwargs):
    """Build a DiT2 with half the L depth (24 // 2), hidden_size=1024,
    patch_size=2, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=24 // 2,
                  hidden_size=1024,
                  patch_size=2,
                  num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_L_4(**kwargs):
    """Build a DiT2 with depth=24, hidden_size=1024, patch_size=4, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=24, hidden_size=1024, patch_size=4, num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_L_8(**kwargs):
    """Build a DiT2 with depth=24, hidden_size=1024, patch_size=8, num_heads=16.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=24, hidden_size=1024, patch_size=8, num_heads=16)
    return DiT2(**preset, **kwargs)


def DiT2_B_2(**kwargs):
    """Build a DiT2 with depth=12, hidden_size=768, patch_size=2, num_heads=12.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=768, patch_size=2, num_heads=12)
    return DiT2(**preset, **kwargs)


def DiT2_B_4(**kwargs):
    """Build a DiT2 with depth=12, hidden_size=768, patch_size=4, num_heads=12.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=768, patch_size=4, num_heads=12)
    return DiT2(**preset, **kwargs)


def DiT2_B_8(**kwargs):
    """Build a DiT2 with depth=12, hidden_size=768, patch_size=8, num_heads=12.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=768, patch_size=8, num_heads=12)
    return DiT2(**preset, **kwargs)


def DiT2_B_16(**kwargs):  # ours cfg
    """Build a DiT2 with depth=12, hidden_size=768, patch_size=16, num_heads=12.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=768, patch_size=16, num_heads=12)
    return DiT2(**preset, **kwargs)


def DiT2_S_2(**kwargs):
    """Build a DiT2 with depth=12, hidden_size=384, patch_size=2, num_heads=6.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=384, patch_size=2, num_heads=6)
    return DiT2(**preset, **kwargs)


def DiT2_S_4(**kwargs):
    """Build a DiT2 with depth=12, hidden_size=384, patch_size=4, num_heads=6.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=384, patch_size=4, num_heads=6)
    return DiT2(**preset, **kwargs)


def DiT2_S_8(**kwargs):
    """Build a DiT2 with depth=12, hidden_size=384, patch_size=8, num_heads=6.

    Any extra keyword arguments are forwarded to ``DiT2`` unchanged.
    """
    preset = dict(depth=12, hidden_size=384, patch_size=8, num_heads=6)
    return DiT2(**preset, **kwargs)


# Registry mapping a configuration name ('DiT2-<size>/<patch_size>', with an
# optional '/half' suffix for the halved-depth variants) to the factory
# function that builds the corresponding DiT2 model.
DiT2_models = {
    'DiT2-XL/2': DiT2_XL_2,
    'DiT2-XL/2/half': DiT2_XL_2_half,
    'DiT2-XL/4': DiT2_XL_4,
    'DiT2-XL/8': DiT2_XL_8,
    'DiT2-L/2': DiT2_L_2,
    'DiT2-L/2/half': DiT2_L_2_half,
    'DiT2-L/4': DiT2_L_4,
    'DiT2-L/8': DiT2_L_8,
    'DiT2-B/2': DiT2_B_2,
    'DiT2-B/4': DiT2_B_4,
    'DiT2-B/8': DiT2_B_8,
    'DiT2-B/16': DiT2_B_16,
    'DiT2-S/2': DiT2_S_2,
    'DiT2-S/4': DiT2_S_4,
    'DiT2-S/8': DiT2_S_8,
}