Update unet/conditional_unet_model.py

19527dd verified about 2 months ago

41 kB

	from typing import List, Optional, Tuple, Union

	import torch
	from dataclasses import dataclass
	from typing import Optional, Tuple, Union

	import torch
	import torch.nn as nn

	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.utils import BaseOutput
	from diffusers.models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
	from diffusers.models.modeling_utils import ModelMixin
	from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block

	@dataclass
	class UNet2DOutput(BaseOutput):
	    """
	    Output container returned by the UNet models defined in this file.

	    Args:
	        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	            The hidden states output from the last layer of the model.
	    """

	    sample: torch.FloatTensor


	class UNet2DModel(ModelMixin, ConfigMixin):
	    r"""
	    A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.

	    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
	    for all models (such as downloading or saving).

	    Parameters:
	        sample_size (`int` or `Tuple[int, int]`, optional, defaults to `None`):
	            Height and width of input/output sample. Dimensions must be a multiple of
	            `2 ** (len(block_out_channels) - 1)`.
	        in_channels (`int`, optional, defaults to 3): Number of channels in the input sample.
	        out_channels (`int`, optional, defaults to 3): Number of channels in the output.
	        center_input_sample (`bool`, optional, defaults to `False`):
	            Whether to map the input sample through `2 * sample - 1` before processing.
	        time_embedding_type (`str`, optional, defaults to `"positional"`):
	            Type of time embedding to use. One of `"fourier"`, `"positional"` or `"learned"`.
	        freq_shift (`int`, optional, defaults to 0):
	            Frequency shift forwarded to the `"positional"` time projection.
	        flip_sin_to_cos (`bool`, optional, defaults to `True`):
	            Whether to flip sin to cos; forwarded to the `"positional"` time projection.
	        down_block_types (`Tuple[str]`, optional, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
	            Tuple of downsample block types.
	        up_block_types (`Tuple[str]`, optional, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
	            Tuple of upsample block types.
	        block_out_channels (`Tuple[int]`, optional, defaults to `(224, 448, 672, 896)`):
	            Tuple of block output channels.
	        layers_per_block (`int`, optional, defaults to `2`): The number of layers per block.
	        mid_block_scale_factor (`float`, optional, defaults to `1`): The scale factor for the mid block.
	        downsample_padding (`int`, optional, defaults to `1`): The padding for the downsample convolution.
	        downsample_type (`str`, optional, defaults to `conv`):
	            The downsample type for downsampling layers. Choose between "conv" and "resnet"
	        upsample_type (`str`, optional, defaults to `conv`):
	            The upsample type for upsampling layers. Choose between "conv" and "resnet"
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        act_fn (`str`, optional, defaults to `"silu"`): The activation function to use.
	        attention_head_dim (`int`, optional, defaults to `8`):
	            The attention head dimension. If `None`, each block uses its own output channel count instead.
	        norm_num_groups (`int`, optional, defaults to `32`): The number of groups for normalization.
	        attn_norm_num_groups (`int`, optional, defaults to `None`):
	            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
	            given number of groups. If left as `None`, the group norm layer will only be created if
	            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
	        norm_eps (`float`, optional, defaults to `1e-5`): The epsilon for normalization.
	        resnet_time_scale_shift (`str`, optional, defaults to `"default"`): Time scale shift config
	            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
	        add_attention (`bool`, optional, defaults to `True`): Forwarded to the mid block.
	        class_embed_type (`str`, optional, defaults to `None`):
	            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from
	            `None`, `"timestep"`, or `"identity"`.
	        num_class_embeds (`int`, optional, defaults to `None`):
	            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing
	            class conditioning with `class_embed_type` equal to `None`.
	        num_train_timesteps (`int`, optional, defaults to `None`):
	            Size of the timestep embedding table; required when `time_embedding_type` is `"learned"`.
	        set_W_to_weight (`bool`, optional, defaults to `True`):
	            Forwarded to `GaussianFourierProjection` when `time_embedding_type` is `"fourier"`.

	    Raises:
	        ValueError: If the block-type/channel tuples have mismatching lengths, if `time_embedding_type` is not
	            supported, or if `"learned"` is requested without `num_train_timesteps`.
	    """

	    @register_to_config
	    def __init__(
	        self,
	        sample_size: Optional[Union[int, Tuple[int, int]]] = None,
	        in_channels: int = 3,
	        out_channels: int = 3,
	        center_input_sample: bool = False,
	        time_embedding_type: str = "positional",
	        freq_shift: int = 0,
	        flip_sin_to_cos: bool = True,
	        down_block_types: Tuple[str, ...] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
	        up_block_types: Tuple[str, ...] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
	        block_out_channels: Tuple[int, ...] = (224, 448, 672, 896),
	        layers_per_block: int = 2,
	        mid_block_scale_factor: float = 1,
	        downsample_padding: int = 1,
	        downsample_type: str = "conv",
	        upsample_type: str = "conv",
	        dropout: float = 0.0,
	        act_fn: str = "silu",
	        attention_head_dim: Optional[int] = 8,
	        norm_num_groups: int = 32,
	        attn_norm_num_groups: Optional[int] = None,
	        norm_eps: float = 1e-5,
	        resnet_time_scale_shift: str = "default",
	        add_attention: bool = True,
	        class_embed_type: Optional[str] = None,
	        num_class_embeds: Optional[int] = None,
	        num_train_timesteps: Optional[int] = None,
	        set_W_to_weight: Optional[bool] = True
	    ):
	        super().__init__()

	        self.sample_size = sample_size
	        time_embed_dim = block_out_channels[0] * 4

	        # Check inputs
	        if len(down_block_types) != len(up_block_types):
	            raise ValueError(
	                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
	            )

	        if len(block_out_channels) != len(down_block_types):
	            raise ValueError(
	                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
	            )

	        # input
	        self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))

	        # time
	        if time_embedding_type == "fourier":
	            self.time_proj = GaussianFourierProjection(
	                embedding_size=block_out_channels[0], scale=16, set_W_to_weight=set_W_to_weight
	            )
	            timestep_input_dim = 2 * block_out_channels[0]
	        elif time_embedding_type == "positional":
	            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
	            timestep_input_dim = block_out_channels[0]
	        elif time_embedding_type == "learned":
	            if num_train_timesteps is None:
	                raise ValueError(
	                    '`num_train_timesteps` must be provided when `time_embedding_type` is "learned".'
	                )
	            self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
	            timestep_input_dim = block_out_channels[0]
	        else:
	            # Without this branch `timestep_input_dim` would be unbound below.
	            raise ValueError(
	                f"Unsupported `time_embedding_type`: {time_embedding_type!r}. "
	                'Choose from "fourier", "positional" or "learned".'
	            )

	        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

	        # class embedding
	        if class_embed_type is None and num_class_embeds is not None:
	            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
	        elif class_embed_type == "timestep":
	            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
	        elif class_embed_type == "identity":
	            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
	        else:
	            self.class_embedding = None

	        self.down_blocks = nn.ModuleList([])
	        self.mid_block = None
	        self.up_blocks = nn.ModuleList([])

	        # down
	        output_channel = block_out_channels[0]
	        for i, down_block_type in enumerate(down_block_types):
	            input_channel = output_channel
	            output_channel = block_out_channels[i]
	            is_final_block = i == len(block_out_channels) - 1

	            down_block = get_down_block(
	                down_block_type,
	                num_layers=layers_per_block,
	                in_channels=input_channel,
	                out_channels=output_channel,
	                temb_channels=time_embed_dim,
	                add_downsample=not is_final_block,
	                resnet_eps=norm_eps,
	                resnet_act_fn=act_fn,
	                resnet_groups=norm_num_groups,
	                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
	                downsample_padding=downsample_padding,
	                resnet_time_scale_shift=resnet_time_scale_shift,
	                downsample_type=downsample_type,
	                dropout=dropout,
	            )
	            self.down_blocks.append(down_block)

	        # mid
	        self.mid_block = UNetMidBlock2D(
	            in_channels=block_out_channels[-1],
	            temb_channels=time_embed_dim,
	            dropout=dropout,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            output_scale_factor=mid_block_scale_factor,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
	            resnet_groups=norm_num_groups,
	            attn_groups=attn_norm_num_groups,
	            add_attention=add_attention,
	        )

	        # up
	        reversed_block_out_channels = list(reversed(block_out_channels))
	        output_channel = reversed_block_out_channels[0]
	        for i, up_block_type in enumerate(up_block_types):
	            prev_output_channel = output_channel
	            output_channel = reversed_block_out_channels[i]
	            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

	            is_final_block = i == len(block_out_channels) - 1

	            up_block = get_up_block(
	                up_block_type,
	                # one more layer than the down path: the up path consumes one extra residual per level
	                num_layers=layers_per_block + 1,
	                in_channels=input_channel,
	                out_channels=output_channel,
	                prev_output_channel=prev_output_channel,
	                temb_channels=time_embed_dim,
	                add_upsample=not is_final_block,
	                resnet_eps=norm_eps,
	                resnet_act_fn=act_fn,
	                resnet_groups=norm_num_groups,
	                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
	                resnet_time_scale_shift=resnet_time_scale_shift,
	                upsample_type=upsample_type,
	                dropout=dropout,
	            )
	            self.up_blocks.append(up_block)

	        # out
	        num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
	        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
	        self.conv_act = nn.SiLU()
	        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)

	    def forward(
	        self,
	        sample: torch.FloatTensor,
	        timestep: Union[torch.Tensor, float, int],
	        class_labels: Optional[torch.Tensor] = None,
	        return_dict: bool = True,
	    ) -> Union[UNet2DOutput, Tuple]:
	        r"""
	        The [`UNet2DModel`] forward method.

	        Args:
	            sample (`torch.FloatTensor`):
	                The noisy input tensor with the following shape `(batch, channel, height, width)`.
	            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
	            class_labels (`torch.FloatTensor`, optional, defaults to `None`):
	                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.

	        Returns:
	            [`~models.unet_2d.UNet2DOutput`] or `tuple`:
	                If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
	                returned where the first element is the sample tensor.

	        Raises:
	            ValueError: If the model has a class embedding but `class_labels` is missing, or if `class_labels` is
	                given to a model without a class embedding.
	        """
	        # 0. center input if necessary
	        if self.config.center_input_sample:
	            sample = 2 * sample - 1.0

	        # 1. time
	        timesteps = timestep
	        if not torch.is_tensor(timesteps):
	            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
	        elif len(timesteps.shape) == 0:
	            timesteps = timesteps[None].to(sample.device)

	        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	        timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)

	        t_emb = self.time_proj(timesteps)

	        # timesteps does not contain any weights and will always return f32 tensors
	        # but time_embedding might actually be running in fp16. so we need to cast here.
	        # there might be better ways to encapsulate this.
	        t_emb = t_emb.to(dtype=self.dtype)
	        emb = self.time_embedding(t_emb)

	        if self.class_embedding is not None:
	            if class_labels is None:
	                raise ValueError("class_labels should be provided when doing class conditioning")

	            if self.config.class_embed_type == "timestep":
	                class_labels = self.time_proj(class_labels)

	            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
	            emb = emb + class_emb
	        elif class_labels is not None:
	            raise ValueError("class_embedding needs to be initialized in order to use class conditioning")

	        # 2. pre-process
	        skip_sample = sample
	        sample = self.conv_in(sample)

	        # 3. down
	        down_block_res_samples = (sample,)
	        for downsample_block in self.down_blocks:
	            if hasattr(downsample_block, "skip_conv"):
	                sample, res_samples, skip_sample = downsample_block(
	                    hidden_states=sample, temb=emb, skip_sample=skip_sample
	                )
	            else:
	                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

	            down_block_res_samples += res_samples

	        # 4. mid
	        sample = self.mid_block(sample, emb)

	        # 5. up
	        skip_sample = None
	        for upsample_block in self.up_blocks:
	            # pop as many residuals off the end of the stack as this block has resnets
	            num_resnets = len(upsample_block.resnets)
	            res_samples = down_block_res_samples[-num_resnets:]
	            down_block_res_samples = down_block_res_samples[:-num_resnets]

	            if hasattr(upsample_block, "skip_conv"):
	                sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
	            else:
	                sample = upsample_block(sample, res_samples, emb)

	        # 6. post-process
	        sample = self.conv_norm_out(sample)
	        sample = self.conv_act(sample)
	        sample = self.conv_out(sample)

	        if skip_sample is not None:
	            sample += skip_sample

	        if self.config.time_embedding_type == "fourier":
	            # reshape timesteps to (batch, 1, 1, ...) so the division broadcasts over the sample
	            timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
	            sample = sample / timesteps

	        if not return_dict:
	            return (sample,)

	        return UNet2DOutput(sample=sample)

	# Number of distinct label values per conditioning factor. The Shapes3D model
	# classes below use `NUM_CLASSES_* + 1` as the default size of each factor's
	# embedding table.
	# NOTE(review): the extra "+ 1" slot presumably reserves one index for an
	# "unlabelled"/null class - confirm against the training code.
	# NOTE(review): the counts appear to match the factor sizes of the 3D Shapes
	# dataset (assumed from the class names) - verify.
	NUM_CLASSES_FLOOR_HUE = 10
	NUM_CLASSES_OBJECT_HUE = 10
	NUM_CLASSES_ORIENTATION = 15
	NUM_CLASSES_SCALE = 8
	NUM_CLASSES_SHAPE = 4
	NUM_CLASSES_WALL_HUE = 10
	class ClassConditionedUnetForShapes3D(ModelMixin, ConfigMixin):
	    r"""
	    Wrapper around [`UNet2DModel`] that conditions on six integer labels by channel concatenation.

	    Each label column of `class_labels` is looked up in its own `nn.Embedding` whose embedding width equals its
	    number of classes. Every embedding vector is broadcast over the spatial dimensions of the input and
	    concatenated to it along the channel axis before the wrapped UNet is called.

	    Note:
	        `in_channels` is forwarded to the wrapped UNet unchanged, while `forward` feeds it the input channels
	        plus the sum of all six `num_classes_*` values. `in_channels` therefore has to already include those
	        extra conditioning channels for `forward` to work.

	    Parameters:
	        num_classes_floor_hue, num_classes_object_hue, num_classes_orientation, num_classes_scale,
	        num_classes_shape, num_classes_wall_hue (`int`):
	            Size (and embedding width) of the embedding table for the corresponding label column. Defaults to
	            the matching `NUM_CLASSES_*` constant plus one.

	        All remaining parameters are forwarded unchanged to [`UNet2DModel`].
	    """

	    @register_to_config
	    def __init__(self,
	                 num_classes_floor_hue=NUM_CLASSES_FLOOR_HUE + 1,
	                 num_classes_object_hue=NUM_CLASSES_OBJECT_HUE + 1,
	                 num_classes_orientation=NUM_CLASSES_ORIENTATION + 1,
	                 num_classes_scale=NUM_CLASSES_SCALE + 1,
	                 num_classes_shape=NUM_CLASSES_SHAPE + 1,
	                 num_classes_wall_hue=NUM_CLASSES_WALL_HUE + 1,
	                 sample_size: Optional[Union[int, Tuple[int, int]]] = None,
	                 in_channels: int = 3,
	                 out_channels: int = 3,
	                 center_input_sample: bool = False,
	                 time_embedding_type: str = "positional",
	                 freq_shift: int = 0,
	                 flip_sin_to_cos: bool = True,
	                 down_block_types: Tuple[str, ...] = (
	                     "DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
	                 up_block_types: Tuple[str, ...] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
	                 block_out_channels: Tuple[int, ...] = (224, 448, 672, 896),
	                 layers_per_block: int = 2,
	                 mid_block_scale_factor: float = 1,
	                 downsample_padding: int = 1,
	                 downsample_type: str = "conv",
	                 upsample_type: str = "conv",
	                 dropout: float = 0.0,
	                 act_fn: str = "silu",
	                 attention_head_dim: Optional[int] = 8,
	                 norm_num_groups: int = 32,
	                 attn_norm_num_groups: Optional[int] = None,
	                 norm_eps: float = 1e-5,
	                 resnet_time_scale_shift: str = "default",
	                 add_attention: bool = True,
	                 class_embed_type: Optional[str] = None,
	                 num_class_embeds: Optional[int] = None,
	                 num_train_timesteps: Optional[int] = None,
	                 set_W_to_weight: Optional[bool] = True
	                 ):
	        super().__init__()
	        # One embedding table per label column; the embedding width equals the table size.
	        self.class_floor_hue = nn.Embedding(num_classes_floor_hue, num_classes_floor_hue)
	        self.class_object_hue = nn.Embedding(num_classes_object_hue, num_classes_object_hue)
	        self.class_orientation = nn.Embedding(num_classes_orientation, num_classes_orientation)
	        self.class_scale = nn.Embedding(num_classes_scale, num_classes_scale)
	        self.class_shape = nn.Embedding(num_classes_shape, num_classes_shape)
	        self.class_wall_hue = nn.Embedding(num_classes_wall_hue, num_classes_wall_hue)
	        self.model = UNet2DModel(
	            sample_size=sample_size,
	            in_channels=in_channels,
	            out_channels=out_channels,
	            center_input_sample=center_input_sample,
	            time_embedding_type=time_embedding_type,
	            freq_shift=freq_shift,
	            flip_sin_to_cos=flip_sin_to_cos,
	            down_block_types=down_block_types,
	            up_block_types=up_block_types,
	            block_out_channels=block_out_channels,
	            layers_per_block=layers_per_block,
	            mid_block_scale_factor=mid_block_scale_factor,
	            downsample_padding=downsample_padding,
	            downsample_type=downsample_type,
	            upsample_type=upsample_type,
	            dropout=dropout,
	            act_fn=act_fn,
	            attention_head_dim=attention_head_dim,
	            norm_num_groups=norm_num_groups,
	            attn_norm_num_groups=attn_norm_num_groups,
	            norm_eps=norm_eps,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            add_attention=add_attention,
	            class_embed_type=class_embed_type,
	            num_class_embeds=num_class_embeds,
	            num_train_timesteps=num_train_timesteps,
	            set_W_to_weight=set_W_to_weight
	        )

	    def forward(self, x, t, class_labels):
	        """
	        Concatenate the broadcast label embeddings to `x` and run the wrapped UNet.

	        Args:
	            x: 4-D input tensor; its last two dimensions are treated as the spatial dimensions.
	            t: Timestep(s), passed through to the wrapped UNet.
	            class_labels: Integer tensor indexed as `class_labels[:, k]` for `k` in `0..5`, in the order
	                floor hue, object hue, orientation, scale, shape, wall hue.

	        Returns:
	            Whatever the wrapped [`UNet2DModel`] returns for the concatenated input (called with its default
	            `return_dict`).
	        """
	        _, _, height, width = x.shape

	        # Column k of `class_labels` is embedded by the k-th module below.
	        embedders = (
	            self.class_floor_hue,
	            self.class_object_hue,
	            self.class_orientation,
	            self.class_scale,
	            self.class_shape,
	            self.class_wall_hue,
	        )

	        feature_maps = [x]
	        for column, embedder in enumerate(embedders):
	            class_cond = embedder(class_labels[:, column])  # (batch, embed_dim)
	            # Tile each embedding vector over every spatial location (no copy: expand returns a view).
	            feature_maps.append(class_cond[:, :, None, None].expand(-1, -1, height, width))

	        net_input = torch.cat(feature_maps, dim=1)
	        return self.model(net_input, t)


	class MultiLabelConditionalUNet2DModelForShapes3D(ModelMixin, ConfigMixin):
	    r"""
	    A 2D UNet model that takes a noisy sample, a timestep and six class labels, and returns a sample shaped
	    output.

	    Conditioning works by embedding each of the six label columns (floor hue, object hue, orientation, scale,
	    shape, wall hue) separately and summing all six embeddings with the timestep embedding.

	    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
	    for all models (such as downloading or saving).

	    Parameters:
	        sample_size (`int` or `Tuple[int, int]`, optional, defaults to `None`):
	            Height and width of input/output sample. Dimensions must be a multiple of
	            `2 ** (len(block_out_channels) - 1)`.
	        in_channels (`int`, optional, defaults to 3): Number of channels in the input sample.
	        out_channels (`int`, optional, defaults to 3): Number of channels in the output.
	        center_input_sample (`bool`, optional, defaults to `False`):
	            Whether to map the input sample through `2 * sample - 1` before processing.
	        time_embedding_type (`str`, optional, defaults to `"positional"`):
	            Type of time embedding to use. One of `"fourier"`, `"positional"` or `"learned"`.
	        freq_shift (`int`, optional, defaults to 0):
	            Frequency shift forwarded to the `"positional"` time projection.
	        flip_sin_to_cos (`bool`, optional, defaults to `True`):
	            Whether to flip sin to cos; forwarded to the `"positional"` time projection.
	        down_block_types (`Tuple[str]`, optional, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
	            Tuple of downsample block types.
	        up_block_types (`Tuple[str]`, optional, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
	            Tuple of upsample block types.
	        block_out_channels (`Tuple[int]`, optional, defaults to `(224, 448, 672, 896)`):
	            Tuple of block output channels.
	        layers_per_block (`int`, optional, defaults to `2`): The number of layers per block.
	        mid_block_scale_factor (`float`, optional, defaults to `1`): The scale factor for the mid block.
	        downsample_padding (`int`, optional, defaults to `1`): The padding for the downsample convolution.
	        downsample_type (`str`, optional, defaults to `conv`):
	            The downsample type for downsampling layers. Choose between "conv" and "resnet"
	        upsample_type (`str`, optional, defaults to `conv`):
	            The upsample type for upsampling layers. Choose between "conv" and "resnet"
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        act_fn (`str`, optional, defaults to `"silu"`): The activation function to use.
	        attention_head_dim (`int`, optional, defaults to `8`):
	            The attention head dimension. If `None`, each block uses its own output channel count instead.
	        norm_num_groups (`int`, optional, defaults to `32`): The number of groups for normalization.
	        attn_norm_num_groups (`int`, optional, defaults to `None`):
	            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
	            given number of groups. If left as `None`, the group norm layer will only be created if
	            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
	        norm_eps (`float`, optional, defaults to `1e-5`): The epsilon for normalization.
	        resnet_time_scale_shift (`str`, optional, defaults to `"default"`): Time scale shift config
	            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
	        add_attention (`bool`, optional, defaults to `True`): Forwarded to the mid block.
	        class_embed_type (`str`, optional, defaults to `None`):
	            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from
	            `None`, `"timestep"`, or `"identity"`.
	        num_class_embeds_floor_hue, num_class_embeds_object_hue, num_class_embeds_orientation,
	        num_class_embeds_scale, num_class_embeds_shape, num_class_embeds_wall_hue (`int`):
	            Number of rows of the learnable embedding table of the corresponding label column, used when
	            `class_embed_type` is `None`. Default to the matching `NUM_CLASSES_*` constant plus one. Setting
	            `num_class_embeds_floor_hue` to `None` (with `class_embed_type=None`) disables class conditioning.
	        num_train_timesteps (`int`, optional, defaults to `None`):
	            Size of the timestep embedding table; required when `time_embedding_type` is `"learned"`.
	        set_W_to_weight (`bool`, optional, defaults to `True`):
	            Forwarded to `GaussianFourierProjection` when `time_embedding_type` is `"fourier"`.

	    Raises:
	        ValueError: If the block-type/channel tuples have mismatching lengths, if `time_embedding_type` is not
	            supported, or if `"learned"` is requested without `num_train_timesteps`.
	    """

	    # Label factors in the column order expected in `class_labels`; each one owns the
	    # attribute `class_embedding_<factor>`.
	    _CLASS_FACTORS = ("floor_hue", "object_hue", "orientation", "scale", "shape", "wall_hue")

	    @register_to_config
	    def __init__(
	        self,
	        sample_size: Optional[Union[int, Tuple[int, int]]] = None,
	        in_channels: int = 3,
	        out_channels: int = 3,
	        center_input_sample: bool = False,
	        time_embedding_type: str = "positional",
	        freq_shift: int = 0,
	        flip_sin_to_cos: bool = True,
	        down_block_types: Tuple[str, ...] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
	        up_block_types: Tuple[str, ...] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
	        block_out_channels: Tuple[int, ...] = (224, 448, 672, 896),
	        layers_per_block: int = 2,
	        mid_block_scale_factor: float = 1,
	        downsample_padding: int = 1,
	        downsample_type: str = "conv",
	        upsample_type: str = "conv",
	        dropout: float = 0.0,
	        act_fn: str = "silu",
	        attention_head_dim: Optional[int] = 8,
	        norm_num_groups: int = 32,
	        attn_norm_num_groups: Optional[int] = None,
	        norm_eps: float = 1e-5,
	        resnet_time_scale_shift: str = "default",
	        add_attention: bool = True,
	        class_embed_type: Optional[str] = None,
	        num_class_embeds_floor_hue=NUM_CLASSES_FLOOR_HUE + 1,
	        num_class_embeds_object_hue=NUM_CLASSES_OBJECT_HUE + 1,
	        num_class_embeds_orientation=NUM_CLASSES_ORIENTATION + 1,
	        num_class_embeds_scale=NUM_CLASSES_SCALE + 1,
	        num_class_embeds_shape=NUM_CLASSES_SHAPE + 1,
	        num_class_embeds_wall_hue=NUM_CLASSES_WALL_HUE + 1,
	        num_train_timesteps: Optional[int] = None,
	        set_W_to_weight: Optional[bool] = True
	    ):
	        super().__init__()

	        self.sample_size = sample_size
	        time_embed_dim = block_out_channels[0] * 4

	        # Check inputs
	        if len(down_block_types) != len(up_block_types):
	            raise ValueError(
	                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
	            )

	        if len(block_out_channels) != len(down_block_types):
	            raise ValueError(
	                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
	            )

	        # input
	        self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))

	        # time
	        if time_embedding_type == "fourier":
	            self.time_proj = GaussianFourierProjection(
	                embedding_size=block_out_channels[0], scale=16, set_W_to_weight=set_W_to_weight
	            )
	            timestep_input_dim = 2 * block_out_channels[0]
	        elif time_embedding_type == "positional":
	            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
	            timestep_input_dim = block_out_channels[0]
	        elif time_embedding_type == "learned":
	            if num_train_timesteps is None:
	                raise ValueError(
	                    '`num_train_timesteps` must be provided when `time_embedding_type` is "learned".'
	                )
	            self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
	            timestep_input_dim = block_out_channels[0]
	        else:
	            # Without this branch `timestep_input_dim` would be unbound below.
	            raise ValueError(
	                f"Unsupported `time_embedding_type`: {time_embedding_type!r}. "
	                'Choose from "fourier", "positional" or "learned".'
	            )

	        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

	        # class embedding: one module per label factor, created in `_CLASS_FACTORS` order
	        table_sizes = (
	            num_class_embeds_floor_hue,
	            num_class_embeds_object_hue,
	            num_class_embeds_orientation,
	            num_class_embeds_scale,
	            num_class_embeds_shape,
	            num_class_embeds_wall_hue,
	        )
	        if class_embed_type is None and num_class_embeds_floor_hue is not None:
	            class_embeddings = [nn.Embedding(size, time_embed_dim) for size in table_sizes]
	        elif class_embed_type == "timestep":
	            class_embeddings = [TimestepEmbedding(timestep_input_dim, time_embed_dim) for _ in table_sizes]
	        elif class_embed_type == "identity":
	            class_embeddings = [nn.Identity(time_embed_dim, time_embed_dim) for _ in table_sizes]
	        else:
	            # No class conditioning: define every attribute (as None) so later lookups never fail.
	            class_embeddings = [None] * len(table_sizes)

	        for factor, class_embedding in zip(self._CLASS_FACTORS, class_embeddings):
	            setattr(self, f"class_embedding_{factor}", class_embedding)

	        self.down_blocks = nn.ModuleList([])
	        self.mid_block = None
	        self.up_blocks = nn.ModuleList([])

	        # down
	        output_channel = block_out_channels[0]
	        for i, down_block_type in enumerate(down_block_types):
	            input_channel = output_channel
	            output_channel = block_out_channels[i]
	            is_final_block = i == len(block_out_channels) - 1

	            down_block = get_down_block(
	                down_block_type,
	                num_layers=layers_per_block,
	                in_channels=input_channel,
	                out_channels=output_channel,
	                temb_channels=time_embed_dim,
	                add_downsample=not is_final_block,
	                resnet_eps=norm_eps,
	                resnet_act_fn=act_fn,
	                resnet_groups=norm_num_groups,
	                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
	                downsample_padding=downsample_padding,
	                resnet_time_scale_shift=resnet_time_scale_shift,
	                downsample_type=downsample_type,
	                dropout=dropout,
	            )
	            self.down_blocks.append(down_block)

	        # mid
	        self.mid_block = UNetMidBlock2D(
	            in_channels=block_out_channels[-1],
	            temb_channels=time_embed_dim,
	            dropout=dropout,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            output_scale_factor=mid_block_scale_factor,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
	            resnet_groups=norm_num_groups,
	            attn_groups=attn_norm_num_groups,
	            add_attention=add_attention,
	        )

	        # up
	        reversed_block_out_channels = list(reversed(block_out_channels))
	        output_channel = reversed_block_out_channels[0]
	        for i, up_block_type in enumerate(up_block_types):
	            prev_output_channel = output_channel
	            output_channel = reversed_block_out_channels[i]
	            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

	            is_final_block = i == len(block_out_channels) - 1

	            up_block = get_up_block(
	                up_block_type,
	                # one more layer than the down path: the up path consumes one extra residual per level
	                num_layers=layers_per_block + 1,
	                in_channels=input_channel,
	                out_channels=output_channel,
	                prev_output_channel=prev_output_channel,
	                temb_channels=time_embed_dim,
	                add_upsample=not is_final_block,
	                resnet_eps=norm_eps,
	                resnet_act_fn=act_fn,
	                resnet_groups=norm_num_groups,
	                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
	                resnet_time_scale_shift=resnet_time_scale_shift,
	                upsample_type=upsample_type,
	                dropout=dropout,
	            )
	            self.up_blocks.append(up_block)

	        # out
	        num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
	        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
	        self.conv_act = nn.SiLU()
	        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)

	    def forward(
	        self,
	        sample: torch.FloatTensor,
	        timestep: Union[torch.Tensor, float, int],
	        class_labels: Optional[torch.Tensor] = None,
	        return_dict: bool = True,
	    ) -> Union[UNet2DOutput, Tuple]:
	        r"""
	        The [`MultiLabelConditionalUNet2DModelForShapes3D`] forward method.

	        Args:
	            sample (`torch.FloatTensor`):
	                The noisy input tensor with the following shape `(batch, channel, height, width)`.
	            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
	            class_labels (`torch.Tensor`, optional, defaults to `None`):
	                Class labels for conditioning, indexed as `class_labels[:, k]` with columns ordered floor hue,
	                object hue, orientation, scale, shape, wall hue. The six embeddings are summed with the timestep
	                embedding. Every label value, including 0, is embedded by the regular embedding module.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.

	        Returns:
	            [`~models.unet_2d.UNet2DOutput`] or `tuple`:
	                If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
	                returned where the first element is the sample tensor.

	        Raises:
	            ValueError: If the model has class embeddings but `class_labels` is missing, or if `class_labels` is
	                given to a model without class embeddings.
	        """
	        # 0. center input if necessary
	        if self.config.center_input_sample:
	            sample = 2 * sample - 1.0

	        # 1. time
	        timesteps = timestep
	        if not torch.is_tensor(timesteps):
	            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
	        elif len(timesteps.shape) == 0:
	            timesteps = timesteps[None].to(sample.device)

	        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	        timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)

	        t_emb = self.time_proj(timesteps)

	        # timesteps does not contain any weights and will always return f32 tensors
	        # but time_embedding might actually be running in fp16. so we need to cast here.
	        # there might be better ways to encapsulate this.
	        t_emb = t_emb.to(dtype=self.dtype)
	        emb = self.time_embedding(t_emb)

	        # The floor-hue embedding doubles as the "class conditioning enabled" flag.
	        if self.class_embedding_floor_hue is not None:
	            if class_labels is None:
	                raise ValueError("class_labels should be provided when doing class conditioning")

	            for column, factor in enumerate(self._CLASS_FACTORS):
	                class_embedding = getattr(self, f"class_embedding_{factor}")
	                if class_embedding is None:
	                    continue

	                factor_labels = class_labels[:, column]
	                if self.config.class_embed_type == "timestep":
	                    factor_labels = self.time_proj(factor_labels)

	                emb = emb + class_embedding(factor_labels).to(dtype=emb.dtype)
	        elif class_labels is not None:
	            raise ValueError("class_embedding needs to be initialized in order to use class conditioning")

	        # 2. pre-process
	        skip_sample = sample
	        sample = self.conv_in(sample)

	        # 3. down
	        down_block_res_samples = (sample,)
	        for downsample_block in self.down_blocks:
	            if hasattr(downsample_block, "skip_conv"):
	                sample, res_samples, skip_sample = downsample_block(
	                    hidden_states=sample, temb=emb, skip_sample=skip_sample
	                )
	            else:
	                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

	            down_block_res_samples += res_samples

	        # 4. mid
	        sample = self.mid_block(sample, emb)

	        # 5. up
	        skip_sample = None
	        for upsample_block in self.up_blocks:
	            # pop as many residuals off the end of the stack as this block has resnets
	            num_resnets = len(upsample_block.resnets)
	            res_samples = down_block_res_samples[-num_resnets:]
	            down_block_res_samples = down_block_res_samples[:-num_resnets]

	            if hasattr(upsample_block, "skip_conv"):
	                sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
	            else:
	                sample = upsample_block(sample, res_samples, emb)

	        # 6. post-process
	        sample = self.conv_norm_out(sample)
	        sample = self.conv_act(sample)
	        sample = self.conv_out(sample)

	        if skip_sample is not None:
	            sample += skip_sample

	        if self.config.time_embedding_type == "fourier":
	            # reshape timesteps to (batch, 1, 1, ...) so the division broadcasts over the sample
	            timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
	            sample = sample / timesteps

	        if not return_dict:
	            return (sample,)

	        return UNet2DOutput(sample=sample)