import argparse
import inspect

from pdb import set_trace as st

import torch as th

from cldm.cldm import ControlledUnetModel, ControlNet
from dit.dit_models_xformers import DiT_models

from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps
from .unet import SuperResModel, UNetModel, EncoderUNetModel

if th.cuda.is_available():
    from xformers.triton import FusedLayerNorm as LayerNorm

NUM_CLASSES = 1000


def diffusion_defaults():
    """
    Defaults for image and classifier training.
    """
    return dict(
        learn_sigma=False,
        diffusion_steps=1000,
        noise_schedule="linear",
        # (sic) "standarization" spelling is kept to match the kwarg name
        # expected downstream (e.g. by SpacedDiffusion).
        standarization_xt=False,
        timestep_respacing="",
        use_kl=False,
        predict_xstart=False,
        predict_v=False,
        rescale_timesteps=False,
        rescale_learned_sigmas=False,
        mixed_prediction=False,
    )


def classifier_defaults():
    """
    Defaults for classifier models.
    """
    return dict(
        image_size=64,
        classifier_use_fp16=False,
        classifier_width=128,
        classifier_depth=2,
        classifier_attention_resolutions="32,16,8",
        classifier_use_scale_shift_norm=True,
        classifier_resblock_updown=True,
        classifier_pool="attention",
    )


def control_net_defaults():
    res = dict(
        only_mid_control=False,
        control_key='img',
        normalize_clip_encoding=False,
        scale_clip_encoding=1.0,
        cfg_dropout_prob=0.0,
    )
    return res


def continuous_diffusion_defaults():
    res = dict(
        sde_time_eps=1e-2,
        sde_beta_start=0.1,
        sde_beta_end=20.0,
        sde_sde_type='vpsde',
        sde_sigma2_0=0.0,
        iw_sample_p='drop_sigma2t_iw',
        iw_sample_q='ll_iw',
        iw_subvp_like_vp_sde=False,
        train_vae=True,
        pred_type='eps',
        p_rendering_loss=False,
        unfix_logit=False,
        loss_type='eps',
        loss_weight='simple',
        diffusion_ce_anneal=True,
        enable_mixing_normal=True,
    )
    return res


def model_and_diffusion_defaults():
    """
    Defaults for image training.
    """
    res = dict(
        diffusion_input_size=224,
        num_channels=128,
        num_res_blocks=2,
        num_heads=4,
        num_heads_upsample=-1,
        num_head_channels=-1,
        attention_resolutions="16,8",
        channel_mult="",
        dropout=0.0,
        class_cond=False,
        use_checkpoint=False,
        use_scale_shift_norm=True,
        resblock_updown=False,
        use_fp16=False,
        use_new_attention_order=False,
        denoise_in_channels=3,
        denoise_out_channels=3,
        create_controlnet=False,
        create_dit=False,
        create_unet_with_hint=False,
        dit_model_arch='DiT-L/2',
        use_spatial_transformer=False,
        transformer_depth=1,
        context_dim=-1,
        roll_out=False,
        n_embed=None,
        legacy=True,
        mixing_logit_init=-6,
        hint_channels=3,
    )
    res.update(diffusion_defaults())
    return res


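# A minimal usage sketch (hypothetical override values): update the defaults
# above before unpacking them into create_model_and_diffusion(); keys that are
# not overridden keep the values returned by model_and_diffusion_defaults().
#
#   opts = model_and_diffusion_defaults()
#   opts.update(diffusion_input_size=256, use_fp16=True)
#   model, diffusion = create_model_and_diffusion(**opts)

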
def classifier_and_diffusion_defaults():
    res = classifier_defaults()
    res.update(diffusion_defaults())
    return res


def create_model_and_diffusion(
    diffusion_input_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    channel_mult,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    predict_v,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
    use_new_attention_order,
    denoise_in_channels,
    denoise_out_channels,
    standarization_xt,
    mixed_prediction,
    create_controlnet,
    use_spatial_transformer,
    transformer_depth,
    context_dim,
    n_embed,
    legacy,
    mixing_logit_init,
    create_dit,
    create_unet_with_hint,
    dit_model_arch,
    roll_out,
    hint_channels,
):
    model = create_model(
        diffusion_input_size,
        num_channels,
        num_res_blocks,
        channel_mult=channel_mult,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
        use_new_attention_order=use_new_attention_order,
        denoise_in_channels=denoise_in_channels,
        denoise_out_channels=denoise_out_channels,
        mixed_prediction=mixed_prediction,
        create_controlnet=create_controlnet,
        use_spatial_transformer=use_spatial_transformer,
        transformer_depth=transformer_depth,
        context_dim=context_dim,
        n_embed=n_embed,
        legacy=legacy,
        mixing_logit_init=mixing_logit_init,
        create_dit=create_dit,
        create_unet_with_hint=create_unet_with_hint,
        dit_model_arch=dit_model_arch,
        roll_out=roll_out,
        hint_channels=hint_channels,
    )
    diffusion = create_gaussian_diffusion(
        diffusion_steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        predict_v=predict_v,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
        standarization_xt=standarization_xt,
    )
    return model, diffusion


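# A minimal sketch of the ControlNet path (hypothetical flag values). When
# create_controlnet=True, create_model() returns a (ControlledUnetModel,
# ControlNet) pair, so the first value returned by create_model_and_diffusion()
# is that tuple:
#
#   opts = model_and_diffusion_defaults()
#   opts.update(create_controlnet=True)
#   (unet, controlnet), diffusion = create_model_and_diffusion(**opts)

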
def create_model(
    image_size,
    num_channels,
    num_res_blocks,
    channel_mult="",
    learn_sigma=False,
    class_cond=False,
    use_checkpoint=False,
    attention_resolutions="16",
    num_heads=1,
    num_head_channels=-1,
    num_heads_upsample=-1,
    use_scale_shift_norm=False,
    dropout=0,
    resblock_updown=False,
    use_fp16=False,
    use_new_attention_order=False,
    denoise_in_channels=-1,
    denoise_out_channels=3,
    mixed_prediction=False,
    create_controlnet=False,
    create_dit=False,
    create_unet_with_hint=False,
    dit_model_arch='DiT-L/2',
    hint_channels=3,
    use_spatial_transformer=False,
    transformer_depth=1,
    context_dim=None,
    n_embed=None,
    legacy=True,
    mixing_logit_init=-6,
    roll_out=False,
):
    if channel_mult == "":
        if image_size in (512, 448, 320):
            channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
        elif image_size == 224 and denoise_in_channels == 144:
            channel_mult = (1, 1, 2, 3, 4, 4)
        elif image_size == 224:
            channel_mult = (1, 1, 2, 2, 4, 4)
        elif image_size == 256:
            channel_mult = (1, 1, 2, 2, 4, 4)
        elif image_size == 128:
            channel_mult = (1, 1, 2, 3, 4)
        elif image_size == 64:
            channel_mult = (1, 2, 3, 4)
        elif image_size == 32:
            channel_mult = (1, 2, 4, 4)
        elif image_size == 16:
            channel_mult = (1, 2, 3, 4)
        else:
            raise ValueError(f"unsupported image size: {image_size}")
    else:
        channel_mult = tuple(
            int(ch_mult) for ch_mult in channel_mult.split(","))

    attention_ds = []
    for res in attention_resolutions.split(","):
        attention_ds.append(image_size // int(res))
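    # Example: with image_size=224 and attention_resolutions="16,8", the loop
    # above yields attention_ds == [14, 28], i.e. the downsampling factors at
    # which attention is applied.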

    if create_controlnet:
        controlledUnetModel = ControlledUnetModel(
            image_size=image_size,
            in_channels=denoise_in_channels,
            model_channels=num_channels,
            out_channels=(denoise_out_channels
                          if not learn_sigma else denoise_out_channels * 2),
            num_res_blocks=num_res_blocks,
            attention_resolutions=tuple(attention_ds),
            dropout=dropout,
            channel_mult=channel_mult,
            num_classes=(NUM_CLASSES if class_cond else None),
            use_checkpoint=use_checkpoint,
            use_fp16=use_fp16,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            num_heads_upsample=num_heads_upsample,
            use_scale_shift_norm=use_scale_shift_norm,
            resblock_updown=resblock_updown,
            use_new_attention_order=use_new_attention_order,
            mixed_prediction=mixed_prediction,
            use_spatial_transformer=use_spatial_transformer,
            transformer_depth=transformer_depth,
            context_dim=context_dim,
            n_embed=n_embed,
            legacy=legacy,
            mixing_logit_init=mixing_logit_init,
            roll_out=roll_out,
        )

        controlNet = ControlNet(
            image_size=image_size,
            in_channels=denoise_in_channels,
            model_channels=num_channels,
            hint_channels=hint_channels,
            num_res_blocks=num_res_blocks,
            attention_resolutions=tuple(attention_ds),
            dropout=dropout,
            channel_mult=channel_mult,
            use_checkpoint=use_checkpoint,
            use_fp16=use_fp16,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            num_heads_upsample=num_heads_upsample,
            use_scale_shift_norm=use_scale_shift_norm,
            resblock_updown=resblock_updown,
            use_new_attention_order=use_new_attention_order,
            roll_out=roll_out,
        )

        return controlledUnetModel, controlNet

    elif create_dit:
        return DiT_models[dit_model_arch](
            input_size=image_size,
            num_classes=0,
            learn_sigma=learn_sigma,
            in_channels=denoise_in_channels,
            context_dim=context_dim,
            roll_out=roll_out)
    else:
        unet_cls = UNetModel
        return unet_cls(
            image_size=image_size,
            in_channels=denoise_in_channels,
            model_channels=num_channels,
            out_channels=(denoise_out_channels
                          if not learn_sigma else denoise_out_channels * 2),
            num_res_blocks=num_res_blocks,
            attention_resolutions=tuple(attention_ds),
            dropout=dropout,
            channel_mult=channel_mult,
            num_classes=(NUM_CLASSES if class_cond else None),
            use_checkpoint=use_checkpoint,
            use_fp16=use_fp16,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            num_heads_upsample=num_heads_upsample,
            use_scale_shift_norm=use_scale_shift_norm,
            resblock_updown=resblock_updown,
            use_new_attention_order=use_new_attention_order,
            mixed_prediction=mixed_prediction,
            use_spatial_transformer=use_spatial_transformer,
            transformer_depth=transformer_depth,
            context_dim=context_dim,
            n_embed=n_embed,
            legacy=legacy,
            mixing_logit_init=mixing_logit_init,
            roll_out=roll_out,
            hint_channels=hint_channels,
        )


def create_classifier_and_diffusion(
    image_size,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
    learn_sigma,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
):
    classifier = create_classifier(
        image_size,
        classifier_use_fp16,
        classifier_width,
        classifier_depth,
        classifier_attention_resolutions,
        classifier_use_scale_shift_norm,
        classifier_resblock_updown,
        classifier_pool,
    )
    diffusion = create_gaussian_diffusion(
        diffusion_steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return classifier, diffusion


def create_classifier(
    image_size,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
):
    if image_size == 512:
        channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
    elif image_size == 256:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif image_size == 128:
        channel_mult = (1, 1, 2, 3, 4)
    elif image_size == 64:
        channel_mult = (1, 2, 3, 4)
    else:
        raise ValueError(f"unsupported image size: {image_size}")

    attention_ds = []
    for res in classifier_attention_resolutions.split(","):
        attention_ds.append(image_size // int(res))

    return EncoderUNetModel(
        image_size=image_size,
        in_channels=3,
        model_channels=classifier_width,
        out_channels=NUM_CLASSES,
        num_res_blocks=classifier_depth,
        attention_resolutions=tuple(attention_ds),
        channel_mult=channel_mult,
        use_fp16=classifier_use_fp16,
        num_head_channels=64,
        use_scale_shift_norm=classifier_use_scale_shift_norm,
        resblock_updown=classifier_resblock_updown,
        pool=classifier_pool,
    )


def sr_model_and_diffusion_defaults():
    res = model_and_diffusion_defaults()
    res["large_size"] = 256
    res["small_size"] = 64
    arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0]
    for k in res.copy().keys():
        if k not in arg_names:
            del res[k]
    return res


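# Example: keys from model_and_diffusion_defaults() such as create_controlnet
# or dit_model_arch are not parameters of sr_create_model_and_diffusion(), so
# the introspection filter above drops them from the returned dict.

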
def sr_create_model_and_diffusion(
    large_size,
    small_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
):
    model = sr_create_model(
        large_size,
        small_size,
        num_channels,
        num_res_blocks,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )
    diffusion = create_gaussian_diffusion(
        diffusion_steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion


def sr_create_model(
    large_size,
    small_size,
    num_channels,
    num_res_blocks,
    learn_sigma,
    class_cond,
    use_checkpoint,
    attention_resolutions,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    use_scale_shift_norm,
    dropout,
    resblock_updown,
    use_fp16,
):
    _ = small_size

    if large_size == 512:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif large_size == 256:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif large_size == 64:
        channel_mult = (1, 2, 3, 4)
    else:
        raise ValueError(f"unsupported large size: {large_size}")

    attention_ds = []
    for res in attention_resolutions.split(","):
        attention_ds.append(large_size // int(res))

    return SuperResModel(
        image_size=large_size,
        in_channels=3,
        model_channels=num_channels,
        out_channels=(3 if not learn_sigma else 6),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )


def create_gaussian_diffusion(
    *,
    diffusion_steps=1000,
    learn_sigma=False,
    sigma_small=False,
    noise_schedule="linear",
    use_kl=False,
    predict_xstart=False,
    predict_v=False,
    rescale_timesteps=False,
    rescale_learned_sigmas=False,
    timestep_respacing="",
    standarization_xt=False,
):
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE
    if not timestep_respacing:
        # No respacing requested: sample on the full set of diffusion steps.
        timestep_respacing = [diffusion_steps]

    if predict_xstart:
        model_mean_type = gd.ModelMeanType.START_X
    elif predict_v:
        model_mean_type = gd.ModelMeanType.V
    else:
        model_mean_type = gd.ModelMeanType.EPSILON

    return SpacedDiffusion(
        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
        betas=betas,
        model_mean_type=model_mean_type,
        model_var_type=((
            gd.ModelVarType.FIXED_LARGE
            if not sigma_small else gd.ModelVarType.FIXED_SMALL)
            if not learn_sigma else gd.ModelVarType.LEARNED_RANGE),
        loss_type=loss_type,
        rescale_timesteps=rescale_timesteps,
        standarization_xt=standarization_xt,
    )


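# A minimal usage sketch (hypothetical settings). The respacing string follows
# the space_timesteps() convention from .respace, e.g. "250" to subsample 250
# steps or "ddim25" for 25 DDIM-style steps (assuming the guided-diffusion
# conventions implemented there):
#
#   diffusion = create_gaussian_diffusion(
#       diffusion_steps=1000,
#       noise_schedule="linear",
#       timestep_respacing="ddim25",
#   )

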
def add_dict_to_argparser(parser, default_dict):
    for k, v in default_dict.items():
        v_type = type(v)
        if v is None:
            v_type = str
        elif isinstance(v, bool):
            v_type = str2bool
        parser.add_argument(f"--{k}", default=v, type=v_type)


def args_to_dict(args, keys):
    return {k: getattr(args, k) for k in keys}


def str2bool(v):
    """
    https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("boolean value expected")
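

if __name__ == "__main__":
    # A minimal self-check (hypothetical CLI values) of the argparse helpers:
    # every default becomes a --flag, booleans are parsed via str2bool, and
    # args_to_dict() recovers the keyword dict that can be unpacked into
    # create_model_and_diffusion(). Because this module uses relative imports,
    # run it with `python -m <package>.<this module>` from the package root.
    demo_parser = argparse.ArgumentParser()
    defaults = model_and_diffusion_defaults()
    add_dict_to_argparser(demo_parser, defaults)
    demo_args = demo_parser.parse_args(
        ["--diffusion_steps", "250", "--use_fp16", "true"])
    recovered = args_to_dict(demo_args, defaults.keys())
    assert recovered["diffusion_steps"] == 250
    assert recovered["use_fp16"] is True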