{ "_class_name": "CausalVAEModel", "_diffusers_version": "0.27.2", "attn_resolutions": [], "decoder_attention": "AttnBlock3DFix", "decoder_conv_in": "CausalConv3d", "decoder_conv_out": "CausalConv3d", "decoder_mid_resnet": "ResnetBlock3D", "decoder_resnet_blocks": [ "ResnetBlock3D", "ResnetBlock3D", "ResnetBlock3D", "ResnetBlock3D" ], "decoder_spatial_upsample": [ "", "SpatialUpsample2x", "Spatial2xTime2x3DUpsample", "Spatial2xTime2x3DUpsample" ], "decoder_spatial_upsample_unup": [ "", "", "", "" ], "decoder_temporal_upsample": [ "", "", "", "" ], "double_z": true, "dropout": 0.0, "embed_dim": 4, "encoder_attention": "AttnBlock3DFix", "encoder_conv_in": "Conv2d", "encoder_conv_out": "CausalConv3d", "encoder_mid_resnet": "ResnetBlock3D", "encoder_resnet_blocks": [ "ResnetBlock2D", "ResnetBlock2D", "ResnetBlock3D", "ResnetBlock3D" ], "encoder_spatial_downsample": [ "Downsample", "Spatial2xTime2x3DDownsample", "Spatial2xTime2x3DDownsample", "" ], "encoder_spatial_downsample_undown": [ "", "", "", "" ], "encoder_temporal_downsample": [ "", "", "", "" ], "hidden_size": 128, "hidden_size_mult": [ 1, 2, 4, 4 ], "in_channels": 3, "loss_params": { "disc_start": 1, "disc_weight": 0.5, "kl_weight": 1e-06, "logvar_init": 0.0 }, "loss_type": "opensora.models.ae.videobase.losses.LPIPSWithDiscriminator", "lr": 1e-05, "num_res_blocks": 2, "out_channels": 3, "q_conv": "CausalConv3d", "resolution": 256, "z_channels": 4 }