Commit: apply styling

Files changed:
- app.py +27 -15
- conversion_utils/__init__.py +1 -1
- conversion_utils/text_encoder.py +10 -8
- conversion_utils/unet.py +326 -120
- conversion_utils/utils.py +10 -6
- convert.py +20 -18
- hub_utils/__init__.py +2 -2
- hub_utils/readme.py +2 -2
- hub_utils/repo.py +10 -3
app.py CHANGED
@@ -1,7 +1,7 @@
-import gradio as gr
-from convert import run_conversion
-from hub_utils import save_model_card, push_to_hub
+import gradio as gr
+
+from convert import run_conversion
+from hub_utils import push_to_hub, save_model_card
 
 PRETRAINED_CKPT = "CompVis/stable-diffusion-v1-4"
 DESCRIPTION = """
@@ -20,25 +20,37 @@ This Space lets you convert KerasCV Stable Diffusion weights to a format compatible
 Check [here](https://github.com/huggingface/diffusers/blob/31be42209ddfdb69d9640a777b32e9b5c6259bf0/examples/dreambooth/train_dreambooth_lora.py#L975) for an example on how you can change the scheduler of an already initialized pipeline.
 """
 
+
 def run(hf_token, text_encoder_weights, unet_weights, repo_prefix):
     if text_encoder_weights == "":
-        text_encoder_weights = None
+        text_encoder_weights = None
     if unet_weights == "":
-        unet_weights = None
+        unet_weights = None
     pipeline = run_conversion(text_encoder_weights, unet_weights)
     output_path = "kerascv_sd_diffusers_pipeline"
     pipeline.save_pretrained(output_path)
-    save_model_card(base_model=PRETRAINED_CKPT, repo_folder=output_path, weight_paths=[text_encoder_weights, unet_weights], repo_prefix=repo_prefix)
+    save_model_card(
+        base_model=PRETRAINED_CKPT,
+        repo_folder=output_path,
+        weight_paths=[text_encoder_weights, unet_weights],
+        repo_prefix=repo_prefix,
+    )
     push_str = push_to_hub(hf_token, output_path, repo_prefix)
     return push_str
 
-demo = gr.Interface(
-    title="KerasCV Stable Diffusion to Diffusers Stable Diffusion Pipelines 🧨🤗",
-    description=DESCRIPTION,
-    allow_flagging="never",
-    inputs=[gr.Text(max_lines=1, label="your_hf_token"), gr.Text(max_lines=1, label="text_encoder_weights"), gr.Text(max_lines=1, label="unet_weights"), gr.Text(max_lines=1, label="output_repo_prefix")],
-    outputs=[gr.Markdown(label="output")],
-    fn=run,
-)
 
-demo.launch()
+demo = gr.Interface(
+    title="KerasCV Stable Diffusion to Diffusers Stable Diffusion Pipelines 🧨🤗",
+    description=DESCRIPTION,
+    allow_flagging="never",
+    inputs=[
+        gr.Text(max_lines=1, label="your_hf_token"),
+        gr.Text(max_lines=1, label="text_encoder_weights"),
+        gr.Text(max_lines=1, label="unet_weights"),
+        gr.Text(max_lines=1, label="output_repo_prefix"),
+    ],
+    outputs=[gr.Markdown(label="output")],
+    fn=run,
+)
+
+demo.launch()
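Aside: the handler wired into gr.Interface above can also be exercised without the UI. A minimal sketch, with placeholder values that appear nowhere in this commit:

# Hypothetical direct call to the Space's handler; the token and URL are placeholders.
push_str = run(
    hf_token="hf_xxx",  # a write-scoped Hugging Face token
    text_encoder_weights="",  # empty string means: keep the pre-trained text encoder
    unet_weights="https://example.com/finetuned_unet.h5",
    repo_prefix="my-finetune",
)
print(push_str)  # the markdown string rendered in the gr.Markdown output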
conversion_utils/__init__.py CHANGED
@@ -1,3 +1,3 @@
 from .text_encoder import populate_text_encoder
 from .unet import populate_unet
-from .utils import run_assertion
+from .utils import run_assertion
conversion_utils/text_encoder.py CHANGED
@@ -1,16 +1,23 @@
-from keras_cv.models import stable_diffusion
+from typing import Dict
+
 import tensorflow as tf
 import torch
-from typing import Dict
+from keras_cv.models import stable_diffusion
 
 MAX_SEQ_LENGTH = 77
 
+
 def populate_text_encoder(tf_text_encoder: tf.keras.Model) -> Dict[str, torch.Tensor]:
     """Populates the state dict from the provided TensorFlow model
     (applicable only for the text encoder)."""
     text_state_dict = dict()
     num_encoder_layers = 0
 
+    # Position ids.
+    text_state_dict["text_model.embeddings.position_ids"] = torch.tensor(
+        list(range(MAX_SEQ_LENGTH))
+    ).unsqueeze(0)
+
     for layer in tf_text_encoder.layers:
         # Embeddings.
         if isinstance(layer, stable_diffusion.text_encoder.CLIPEmbedding):
@@ -102,9 +109,4 @@ def populate_text_encoder(tf_text_encoder: tf.keras.Model) -> Dict[str, torch.Tensor]:
             layer.get_weights()[1]
         )
 
-    text_state_dict["text_model.embeddings.position_ids"] = torch.tensor(
-        list(range(MAX_SEQ_LENGTH))
-    ).unsqueeze(0)
-
-    return text_state_dict
+    return text_state_dict
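Aside: the block that moved to the top of populate_text_encoder builds CLIP's position ids. A standalone sanity check of its shape (a sketch, not part of the commit):

import torch

MAX_SEQ_LENGTH = 77
position_ids = torch.tensor(list(range(MAX_SEQ_LENGTH))).unsqueeze(0)
print(position_ids.shape)   # torch.Size([1, 77])
print(position_ids[0, :5])  # tensor([0, 1, 2, 3, 4])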
conversion_utils/unet.py CHANGED
@@ -1,10 +1,14 @@
-import tensorflow as tf
-import torch
-from typing import Dict
 from itertools import product
+from typing import Dict
+
+import tensorflow as tf
+import torch
 from keras_cv.models import stable_diffusion
 
-def port_transformer_block(transformer_block: tf.keras.Model, up_down: int, block_id: int, attention_id: int) -> Dict[str, torch.Tensor]:
+
+def port_transformer_block(
+    transformer_block: tf.keras.Model, up_down: int, block_id: int, attention_id: int
+) -> Dict[str, torch.Tensor]:
     """Populates a Transformer block."""
     transformer_dict = dict()
     if block_id is not None:
@@ -15,36 +19,58 @@ def port_transformer_block(transformer_block: tf.keras.Model, up_down: int, block_id: int, attention_id: int) -> Dict[str, torch.Tensor]:
     # Norms.
     for i in range(1, 4):
         if i == 1:
-            norm = transformer_block.norm1
+            norm = transformer_block.norm1
         elif i == 2:
             norm = transformer_block.norm2
         elif i == 3:
             norm = transformer_block.norm3
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.norm{i}.weight"] = torch.from_numpy(norm.get_weights()[0])
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.norm{i}.bias"] = torch.from_numpy(norm.get_weights()[1])
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.norm{i}.weight"
+        ] = torch.from_numpy(norm.get_weights()[0])
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.norm{i}.bias"
+        ] = torch.from_numpy(norm.get_weights()[1])
 
     # Attentions.
     for i in range(1, 3):
         if i == 1:
             attn = transformer_block.attn1
         else:
             attn = transformer_block.attn2
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_q.weight"] = torch.from_numpy(attn.to_q.get_weights()[0].transpose())
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_k.weight"] = torch.from_numpy(attn.to_k.get_weights()[0].transpose())
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_v.weight"] = torch.from_numpy(attn.to_v.get_weights()[0].transpose())
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_out.0.weight"] = torch.from_numpy(attn.out_proj.get_weights()[0].transpose())
-        transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_out.0.bias"] = torch.from_numpy(attn.out_proj.get_weights()[1])
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_q.weight"
+        ] = torch.from_numpy(attn.to_q.get_weights()[0].transpose())
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_k.weight"
+        ] = torch.from_numpy(attn.to_k.get_weights()[0].transpose())
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_v.weight"
+        ] = torch.from_numpy(attn.to_v.get_weights()[0].transpose())
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_out.0.weight"
+        ] = torch.from_numpy(attn.out_proj.get_weights()[0].transpose())
+        transformer_dict[
+            f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_out.0.bias"
+        ] = torch.from_numpy(attn.out_proj.get_weights()[1])
 
+    # Dense.
     for i in range(0, 3, 2):
         if i == 0:
             layer = transformer_block.geglu.dense
-            transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.proj.weight"] = torch.from_numpy(layer.get_weights()[0].transpose())
-            transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.proj.bias"] = torch.from_numpy(layer.get_weights()[1])
+            transformer_dict[
+                f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.proj.weight"
+            ] = torch.from_numpy(layer.get_weights()[0].transpose())
+            transformer_dict[
+                f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.proj.bias"
+            ] = torch.from_numpy(layer.get_weights()[1])
         else:
             layer = transformer_block.dense
-            transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.weight"] = torch.from_numpy(layer.get_weights()[0].transpose())
-            transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.bias"] = torch.from_numpy(layer.get_weights()[1])
+            transformer_dict[
+                f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.weight"
+            ] = torch.from_numpy(layer.get_weights()[0].transpose())
+            transformer_dict[
+                f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.bias"
+            ] = torch.from_numpy(layer.get_weights()[1])
 
     return transformer_dict
 
@@ -54,7 +80,7 @@ def populate_unet(tf_unet: tf.keras.Model) -> Dict[str, torch.Tensor]:
     (applicable only for the UNet)."""
     unet_state_dict = dict()
 
-    timstep_emb = 1
+    timstep_emb = 1
     padded_conv = 1
     up_block = 0
 
@@ -67,37 +93,66 @@
     for layer in tf_unet.layers:
         # Timstep embedding.
         if isinstance(layer, tf.keras.layers.Dense):
-            unet_state_dict[f"time_embedding.linear_{timstep_emb}.weight"] = torch.from_numpy(layer.get_weights()[0].transpose())
-            unet_state_dict[f"time_embedding.linear_{timstep_emb}.bias"] = torch.from_numpy(layer.get_weights()[1])
+            unet_state_dict[
+                f"time_embedding.linear_{timstep_emb}.weight"
+            ] = torch.from_numpy(layer.get_weights()[0].transpose())
+            unet_state_dict[
+                f"time_embedding.linear_{timstep_emb}.bias"
+            ] = torch.from_numpy(layer.get_weights()[1])
             timstep_emb += 1
 
         # Padded convs (downsamplers).
-        elif isinstance(layer, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+        elif isinstance(
+            layer, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D
+        ):
             if padded_conv == 1:
                 # Transposition axes taken from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_pytorch_utils.py#L104
-                unet_state_dict["conv_in.weight"] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict["conv_in.bias"] = torch.from_numpy(layer.get_weights()[1])
+                unet_state_dict["conv_in.weight"] = torch.from_numpy(
+                    layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict["conv_in.bias"] = torch.from_numpy(
+                    layer.get_weights()[1]
+                )
             elif padded_conv in [2, 3, 4]:
-                unet_state_dict[f"down_blocks.{padded_conv-2}.downsamplers.0.conv.weight"] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"down_blocks.{padded_conv-2}.downsamplers.0.conv.bias"] = torch.from_numpy(layer.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{padded_conv-2}.downsamplers.0.conv.weight"
+                ] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"down_blocks.{padded_conv-2}.downsamplers.0.conv.bias"
+                ] = torch.from_numpy(layer.get_weights()[1])
             elif padded_conv == 5:
-                unet_state_dict["conv_out.weight"] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict["conv_out.bias"] = torch.from_numpy(layer.get_weights()[1])
+                unet_state_dict["conv_out.weight"] = torch.from_numpy(
+                    layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict["conv_out.bias"] = torch.from_numpy(
+                    layer.get_weights()[1]
+                )
+
             padded_conv += 1
 
         # Upsamplers.
         elif isinstance(layer, stable_diffusion.diffusion_model.Upsample):
             conv = layer.conv
-            unet_state_dict[f"up_blocks.{up_block}.upsamplers.0.conv.weight"] = torch.from_numpy(conv.get_weights()[0].transpose(3, 2, 0, 1))
-            unet_state_dict[f"up_blocks.{up_block}.upsamplers.0.conv.bias"] = torch.from_numpy(conv.get_weights()[1])
+            unet_state_dict[
+                f"up_blocks.{up_block}.upsamplers.0.conv.weight"
+            ] = torch.from_numpy(conv.get_weights()[0].transpose(3, 2, 0, 1))
+            unet_state_dict[
+                f"up_blocks.{up_block}.upsamplers.0.conv.bias"
+            ] = torch.from_numpy(conv.get_weights()[1])
             up_block += 1
 
         # Output norms.
-        elif isinstance(layer, stable_diffusion.__internal__.layers.group_normalization.GroupNormalization):
-            unet_state_dict["conv_norm_out.weight"] = torch.from_numpy(layer.get_weights()[0])
-            unet_state_dict["conv_norm_out.bias"] = torch.from_numpy(layer.get_weights()[1])
+        elif isinstance(
+            layer,
+            stable_diffusion.__internal__.layers.group_normalization.GroupNormalization,
+        ):
+            unet_state_dict["conv_norm_out.weight"] = torch.from_numpy(
+                layer.get_weights()[0]
+            )
+            unet_state_dict["conv_norm_out.bias"] = torch.from_numpy(
+                layer.get_weights()[1]
+            )
+
         # All ResBlocks.
         elif isinstance(layer, stable_diffusion.diffusion_model.ResBlock):
             layer_name = layer.name
@@ -105,8 +160,8 @@ def populate_unet(tf_unet: tf.keras.Model) -> Dict[str, torch.Tensor]:
 
             # Down.
             if len(parts) == 2 or int(parts[-1]) < 8:
-                entry_flow = layer.entry_flow
-                embedding_flow = layer.embedding_flow
+                entry_flow = layer.entry_flow
+                embedding_flow = layer.embedding_flow
                 exit_flow = layer.exit_flow
 
                 down_block_id = 0 if len(parts) == 2 else int(parts[-1]) // 2
@@ -114,72 +169,138 @@
 
                 # Conv blocks.
                 first_conv_layer = entry_flow[-1]
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv1.weight"] = torch.from_numpy(first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv1.bias"] = torch.from_numpy(first_conv_layer.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv1.weight"
+                ] = torch.from_numpy(
+                    first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv1.bias"
+                ] = torch.from_numpy(first_conv_layer.get_weights()[1])
                 second_conv_layer = exit_flow[-1]
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv2.weight"] = torch.from_numpy(second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv2.bias"] = torch.from_numpy(second_conv_layer.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv2.weight"
+                ] = torch.from_numpy(
+                    second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv2.bias"
+                ] = torch.from_numpy(second_conv_layer.get_weights()[1])
+
+                # Residual blocks.
                 if hasattr(layer, "residual_projection"):
-                    if isinstance(layer.residual_projection, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+                    if isinstance(
+                        layer.residual_projection,
+                        stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D,
+                    ):
                         residual = layer.residual_projection
-                        unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv_shortcut.weight"] = torch.from_numpy(residual.get_weights()[0].transpose(3, 2, 0, 1))
-                        unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv_shortcut.bias"] = torch.from_numpy(residual.get_weights()[1])
+                        unet_state_dict[
+                            f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv_shortcut.weight"
+                        ] = torch.from_numpy(
+                            residual.get_weights()[0].transpose(3, 2, 0, 1)
+                        )
+                        unet_state_dict[
+                            f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv_shortcut.bias"
+                        ] = torch.from_numpy(residual.get_weights()[1])
 
                 # Timestep embedding.
                 embedding_proj = embedding_flow[-1]
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.time_emb_proj.weight"] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.time_emb_proj.bias"] = torch.from_numpy(embedding_proj.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.time_emb_proj.weight"
+                ] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.time_emb_proj.bias"
+                ] = torch.from_numpy(embedding_proj.get_weights()[1])
+
                 # Norms.
                 first_group_norm = entry_flow[0]
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm1.weight"] = torch.from_numpy(first_group_norm.get_weights()[0])
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm1.bias"] = torch.from_numpy(first_group_norm.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm1.weight"
+                ] = torch.from_numpy(first_group_norm.get_weights()[0])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm1.bias"
+                ] = torch.from_numpy(first_group_norm.get_weights()[1])
                 second_group_norm = exit_flow[0]
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm2.weight"] = torch.from_numpy(second_group_norm.get_weights()[0])
-                unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm2.bias"] = torch.from_numpy(second_group_norm.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm2.weight"
+                ] = torch.from_numpy(second_group_norm.get_weights()[0])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm2.bias"
+                ] = torch.from_numpy(second_group_norm.get_weights()[1])
 
             # Middle.
             elif int(parts[-1]) == 8 or int(parts[-1]) == 9:
-                entry_flow = layer.entry_flow
-                embedding_flow = layer.embedding_flow
+                entry_flow = layer.entry_flow
+                embedding_flow = layer.embedding_flow
                 exit_flow = layer.exit_flow
 
                 mid_resnet_id = int(parts[-1]) % 2
 
                 # Conv blocks.
                 first_conv_layer = entry_flow[-1]
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv1.weight"] = torch.from_numpy(first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv1.bias"] = torch.from_numpy(first_conv_layer.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.conv1.weight"
+                ] = torch.from_numpy(
+                    first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.conv1.bias"
+                ] = torch.from_numpy(first_conv_layer.get_weights()[1])
                 second_conv_layer = exit_flow[-1]
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv2.weight"] = torch.from_numpy(second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv2.bias"] = torch.from_numpy(second_conv_layer.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.conv2.weight"
+                ] = torch.from_numpy(
+                    second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.conv2.bias"
+                ] = torch.from_numpy(second_conv_layer.get_weights()[1])
+
+                # Residual blocks.
                 if hasattr(layer, "residual_projection"):
-                    if isinstance(layer.residual_projection, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+                    if isinstance(
+                        layer.residual_projection,
+                        stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D,
+                    ):
                         residual = layer.residual_projection
-                        unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv_shortcut.weight"] = torch.from_numpy(residual.get_weights()[0].transpose(3, 2, 0, 1))
-                        unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv_shortcut.bias"] = torch.from_numpy(residual.get_weights()[1])
+                        unet_state_dict[
+                            f"mid_block.resnets.{mid_resnet_id}.conv_shortcut.weight"
+                        ] = torch.from_numpy(
+                            residual.get_weights()[0].transpose(3, 2, 0, 1)
+                        )
+                        unet_state_dict[
+                            f"mid_block.resnets.{mid_resnet_id}.conv_shortcut.bias"
+                        ] = torch.from_numpy(residual.get_weights()[1])
 
                 # Timestep embedding.
                 embedding_proj = embedding_flow[-1]
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.time_emb_proj.weight"] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.time_emb_proj.bias"] = torch.from_numpy(embedding_proj.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.time_emb_proj.weight"
+                ] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.time_emb_proj.bias"
+                ] = torch.from_numpy(embedding_proj.get_weights()[1])
 
                 # Norms.
                 first_group_norm = entry_flow[0]
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm1.weight"] = torch.from_numpy(first_group_norm.get_weights()[0])
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm1.bias"] = torch.from_numpy(first_group_norm.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.norm1.weight"
+                ] = torch.from_numpy(first_group_norm.get_weights()[0])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.norm1.bias"
+                ] = torch.from_numpy(first_group_norm.get_weights()[1])
                 second_group_norm = exit_flow[0]
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm2.weight"] = torch.from_numpy(second_group_norm.get_weights()[0])
-                unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm2.bias"] = torch.from_numpy(second_group_norm.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.norm2.weight"
+                ] = torch.from_numpy(second_group_norm.get_weights()[0])
+                unet_state_dict[
+                    f"mid_block.resnets.{mid_resnet_id}.norm2.bias"
+                ] = torch.from_numpy(second_group_norm.get_weights()[1])
 
-            # Up.
+            # Up.
             elif int(parts[-1]) > 9 and up_res_block_flag < len(up_res_blocks):
-                entry_flow = layer.entry_flow
-                embedding_flow = layer.embedding_flow
+                entry_flow = layer.entry_flow
+                embedding_flow = layer.embedding_flow
                 exit_flow = layer.exit_flow
 
                 up_res_block = up_res_blocks[up_res_block_flag]
@@ -188,32 +309,65 @@
 
                 # Conv blocks.
                 first_conv_layer = entry_flow[-1]
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv1.weight"] = torch.from_numpy(first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv1.bias"] = torch.from_numpy(first_conv_layer.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv1.weight"
+                ] = torch.from_numpy(
+                    first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv1.bias"
+                ] = torch.from_numpy(first_conv_layer.get_weights()[1])
                 second_conv_layer = exit_flow[-1]
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv2.weight"] = torch.from_numpy(second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv2.bias"] = torch.from_numpy(second_conv_layer.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv2.weight"
+                ] = torch.from_numpy(
+                    second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1)
+                )
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv2.bias"
+                ] = torch.from_numpy(second_conv_layer.get_weights()[1])
+
+                # Residual blocks.
                 if hasattr(layer, "residual_projection"):
-                    if isinstance(layer.residual_projection, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+                    if isinstance(
+                        layer.residual_projection,
+                        stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D,
+                    ):
                         residual = layer.residual_projection
-                        unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv_shortcut.weight"] = torch.from_numpy(residual.get_weights()[0].transpose(3, 2, 0, 1))
-                        unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv_shortcut.bias"] = torch.from_numpy(residual.get_weights()[1])
+                        unet_state_dict[
+                            f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv_shortcut.weight"
+                        ] = torch.from_numpy(
+                            residual.get_weights()[0].transpose(3, 2, 0, 1)
+                        )
+                        unet_state_dict[
+                            f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv_shortcut.bias"
+                        ] = torch.from_numpy(residual.get_weights()[1])
 
                 # Timestep embedding.
                 embedding_proj = embedding_flow[-1]
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.time_emb_proj.weight"] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.time_emb_proj.bias"] = torch.from_numpy(embedding_proj.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.time_emb_proj.weight"
+                ] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.time_emb_proj.bias"
+                ] = torch.from_numpy(embedding_proj.get_weights()[1])
+
                 # Norms.
                 first_group_norm = entry_flow[0]
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm1.weight"] = torch.from_numpy(first_group_norm.get_weights()[0])
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm1.bias"] = torch.from_numpy(first_group_norm.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm1.weight"
+                ] = torch.from_numpy(first_group_norm.get_weights()[0])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm1.bias"
+                ] = torch.from_numpy(first_group_norm.get_weights()[1])
                 second_group_norm = exit_flow[0]
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm2.weight"] = torch.from_numpy(second_group_norm.get_weights()[0])
-                unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm2.bias"] = torch.from_numpy(second_group_norm.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm2.weight"
+                ] = torch.from_numpy(second_group_norm.get_weights()[0])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm2.bias"
+                ] = torch.from_numpy(second_group_norm.get_weights()[1])
+
                 up_res_block_flag += 1
 
         # All SpatialTransformer blocks.
@@ -225,67 +379,119 @@
             if len(parts) == 2 or int(parts[-1]) < 6:
                 down_block_id = 0 if len(parts) == 2 else int(parts[-1]) // 2
                 down_attention_id = 0 if len(parts) == 2 else int(parts[-1]) % 2
 
                 # Convs.
                 proj1 = layer.proj1
-                unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_in.weight"] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_in.bias"] = torch.from_numpy(proj1.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_in.weight"
+                ] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_in.bias"
+                ] = torch.from_numpy(proj1.get_weights()[1])
                 proj2 = layer.proj2
-                unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_out.weight"] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_out.bias"] = torch.from_numpy(proj2.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_out.weight"
+                ] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_out.bias"
+                ] = torch.from_numpy(proj2.get_weights()[1])
 
                 # Transformer blocks.
                 transformer_block = layer.transformer_block
-                unet_state_dict.update(port_transformer_block(transformer_block, "down", down_block_id, down_attention_id))
+                unet_state_dict.update(
+                    port_transformer_block(
+                        transformer_block, "down", down_block_id, down_attention_id
+                    )
+                )
 
                 # Norms.
                 norm = layer.norm
-                unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.norm.weight"] = torch.from_numpy(norm.get_weights()[0])
-                unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.norm.bias"] = torch.from_numpy(norm.get_weights()[1])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.attentions.{down_attention_id}.norm.weight"
+                ] = torch.from_numpy(norm.get_weights()[0])
+                unet_state_dict[
+                    f"down_blocks.{down_block_id}.attentions.{down_attention_id}.norm.bias"
+                ] = torch.from_numpy(norm.get_weights()[1])
 
             # Middle.
             elif int(parts[-1]) == 6:
                 mid_attention_id = int(parts[-1]) % 2
                 # Convs.
                 proj1 = layer.proj1
-                unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_in.weight"] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_in.bias"] = torch.from_numpy(proj1.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.attentions.{mid_attention_id}.proj_in.weight"
+                ] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"mid_block.attentions.{mid_attention_id}.proj_in.bias"
+                ] = torch.from_numpy(proj1.get_weights()[1])
                 proj2 = layer.proj2
-                unet_state_dict[f"mid_block.attentions.{mid_resnet_id}.proj_out.weight"] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_out.bias"] = torch.from_numpy(proj2.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.attentions.{mid_resnet_id}.proj_out.weight"
+                ] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"mid_block.attentions.{mid_attention_id}.proj_out.bias"
+                ] = torch.from_numpy(proj2.get_weights()[1])
 
                 # Transformer blocks.
                 transformer_block = layer.transformer_block
-                unet_state_dict.update(port_transformer_block(transformer_block, "mid", None, mid_attention_id))
+                unet_state_dict.update(
+                    port_transformer_block(
+                        transformer_block, "mid", None, mid_attention_id
+                    )
+                )
 
                 # Norms.
                 norm = layer.norm
-                unet_state_dict[f"mid_block.attentions.{mid_attention_id}.norm.weight"] = torch.from_numpy(norm.get_weights()[0])
-                unet_state_dict[f"mid_block.attentions.{mid_attention_id}.norm.bias"] = torch.from_numpy(norm.get_weights()[1])
+                unet_state_dict[
+                    f"mid_block.attentions.{mid_attention_id}.norm.weight"
+                ] = torch.from_numpy(norm.get_weights()[0])
+                unet_state_dict[
+                    f"mid_block.attentions.{mid_attention_id}.norm.bias"
+                ] = torch.from_numpy(norm.get_weights()[1])
 
             # Up.
-            elif int(parts[-1]) > 6 and up_spatial_transformer_flag < len(up_spatial_transformer_blocks):
-                up_spatial_transformer_block = up_spatial_transformer_blocks[up_spatial_transformer_flag]
+            elif int(parts[-1]) > 6 and up_spatial_transformer_flag < len(
+                up_spatial_transformer_blocks
+            ):
+                up_spatial_transformer_block = up_spatial_transformer_blocks[
+                    up_spatial_transformer_flag
+                ]
                 up_block_id = up_spatial_transformer_block[0]
                 up_attention_id = up_spatial_transformer_block[1]
 
                 # Convs.
                 proj1 = layer.proj1
-                unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_in.weight"] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_in.bias"] = torch.from_numpy(proj1.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_in.weight"
+                ] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_in.bias"
+                ] = torch.from_numpy(proj1.get_weights()[1])
                 proj2 = layer.proj2
-                unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_out.weight"] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
-                unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_out.bias"] = torch.from_numpy(proj2.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_out.weight"
+                ] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_out.bias"
+                ] = torch.from_numpy(proj2.get_weights()[1])
 
                 # Transformer blocks.
                 transformer_block = layer.transformer_block
-                unet_state_dict.update(port_transformer_block(transformer_block, "up", up_block_id, up_attention_id))
+                unet_state_dict.update(
+                    port_transformer_block(
+                        transformer_block, "up", up_block_id, up_attention_id
+                    )
+                )
 
                 # Norms.
                 norm = layer.norm
-                unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.norm.weight"] = torch.from_numpy(norm.get_weights()[0])
-                unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.norm.bias"] = torch.from_numpy(norm.get_weights()[1])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.attentions.{up_attention_id}.norm.weight"
+                ] = torch.from_numpy(norm.get_weights()[0])
+                unet_state_dict[
+                    f"up_blocks.{up_block_id}.attentions.{up_attention_id}.norm.bias"
+                ] = torch.from_numpy(norm.get_weights()[1])
+
                 up_spatial_transformer_flag += 1
 
-    return unet_state_dict
+    return unet_state_dict
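Aside: every convolution kernel ported above goes through .transpose(3, 2, 0, 1) and every dense kernel through a bare .transpose(). A standalone sketch of why (the shapes are illustrative, not taken from this commit):

import numpy as np
import torch

# Keras stores Conv2D kernels as (H, W, C_in, C_out); torch.nn.Conv2d expects
# (C_out, C_in, H, W), hence transpose(3, 2, 0, 1).
tf_conv_kernel = np.random.randn(3, 3, 320, 640).astype("float32")
pt_conv_kernel = torch.from_numpy(tf_conv_kernel.transpose(3, 2, 0, 1))
print(pt_conv_kernel.shape)  # torch.Size([640, 320, 3, 3])

# Keras Dense kernels are (in_features, out_features); torch.nn.Linear stores
# (out_features, in_features), hence the bare .transpose().
tf_dense_kernel = np.random.randn(320, 1280).astype("float32")
pt_dense_kernel = torch.from_numpy(tf_dense_kernel.transpose())
print(pt_dense_kernel.shape)  # torch.Size([1280, 320])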
conversion_utils/utils.py CHANGED
@@ -1,15 +1,19 @@
+from typing import Dict
+
 import numpy as np
-import torch
-from typing import Dict
+import torch
 
 
-def run_assertion(orig_pt_state_dict: Dict[str, torch.Tensor], pt_state_dict_from_tf: Dict[str, torch.Tensor]):
+def run_assertion(
+    orig_pt_state_dict: Dict[str, torch.Tensor],
+    pt_state_dict_from_tf: Dict[str, torch.Tensor],
+):
     for k in orig_pt_state_dict:
         try:
             np.testing.assert_allclose(
-                orig_pt_state_dict[k].numpy(),
-                pt_state_dict_from_tf[k].numpy()
+                orig_pt_state_dict[k].numpy(), pt_state_dict_from_tf[k].numpy()
             )
         except:
-            raise ValueError("There are problems in the parameter population process. Cannot proceed :(")
+            raise ValueError(
+                "There are problems in the parameter population process. Cannot proceed :("
+            )
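Aside: run_assertion leans on np.testing.assert_allclose with NumPy's default tolerances (rtol=1e-7, atol=0), so the two state dicts must agree almost bit-for-bit. A small illustration (not part of the commit):

import numpy as np

a = np.array([1.0, 2.0, 3.0], dtype="float64")
np.testing.assert_allclose(a, a + 1e-9)  # negligible drift: passes

try:
    np.testing.assert_allclose(a, a + 1e-2)  # real mismatch: raises
except AssertionError:
    print("mismatch caught")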
convert.py CHANGED
@@ -1,26 +1,25 @@
-from conversion_utils import populate_text_encoder, populate_unet, run_assertion
-
-from diffusers import (
-    AutoencoderKL,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from transformers import CLIPTextModel
 import keras_cv
 import tensorflow as tf
+from diffusers import (AutoencoderKL, StableDiffusionPipeline,
+                       UNet2DConditionModel)
+from diffusers.pipelines.stable_diffusion.safety_checker import \
+    StableDiffusionSafetyChecker
+from transformers import CLIPTextModel
+
+from conversion_utils import (populate_text_encoder, populate_unet,
+                              run_assertion)
 
 PRETRAINED_CKPT = "CompVis/stable-diffusion-v1-4"
 REVISION = None
 NON_EMA_REVISION = None
 IMG_HEIGHT = IMG_WIDTH = 512
 
+
 def initialize_pt_models():
     """Initializes the separate models of Stable Diffusion from diffusers and downloads
     their pre-trained weights."""
     pt_text_encoder = CLIPTextModel.from_pretrained(
-        PRETRAINED_CKPT, subfolder="text_encoder", revision=REVISION
+        PRETRAINED_CKPT, subfolder="text_encoder", revision=REVISION
     )
     pt_vae = AutoencoderKL.from_pretrained(
         PRETRAINED_CKPT, subfolder="vae", revision=REVISION
@@ -34,14 +33,17 @@
 
     return pt_text_encoder, pt_vae, pt_unet, pt_safety_checker
 
+
 def initialize_tf_models():
     """Initializes the separate models of Stable Diffusion from KerasCV and downloads
     their pre-trained weights."""
-    tf_sd_model = keras_cv.models.StableDiffusion(img_height=IMG_HEIGHT, img_width=IMG_WIDTH)
-    _ = tf_sd_model.text_to_image("Cartoon")  # To download the weights.
+    tf_sd_model = keras_cv.models.StableDiffusion(
+        img_height=IMG_HEIGHT, img_width=IMG_WIDTH
+    )
+    _ = tf_sd_model.text_to_image("Cartoon")  # To download the weights.
 
-    tf_text_encoder = tf_sd_model.text_encoder
-    tf_vae = tf_sd_model.image_encoder
+    tf_text_encoder = tf_sd_model.text_encoder
+    tf_vae = tf_sd_model.image_encoder
     tf_unet = tf_sd_model.diffusion_model
     return tf_sd_model, tf_text_encoder, tf_vae, tf_unet
@@ -50,7 +52,7 @@ def run_conversion(text_encoder_weights: str = None, unet_weights: str = None):
     pt_text_encoder, pt_vae, pt_unet, pt_safety_checker = initialize_pt_models()
     tf_sd_model, tf_text_encoder, tf_vae, tf_unet = initialize_tf_models()
     print("Pre-trained model weights downloaded.")
-
+
     if text_encoder_weights is not None:
         print("Loading fine-tuned text encoder weights.")
         text_encoder_weights_path = tf.keras.utils.get_file(text_encoder_weights)
@@ -72,7 +74,9 @@ def run_conversion(text_encoder_weights: str = None, unet_weights: str = None):
     unet_state_dict_from_pt = pt_text_encoder.state_dict()
     run_assertion(unet_state_dict_from_pt, unet_state_dict_from_tf)
 
-    print("Assertions successful, populating the converted parameters into the diffusers models...")
+    print(
+        "Assertions successful, populating the converted parameters into the diffusers models..."
+    )
     pt_text_encoder.load_state_dict(text_encoder_state_dict_from_tf)
     pt_unet.load_state_dict(unet_state_dict_from_tf)
@@ -86,5 +90,3 @@ def run_conversion(text_encoder_weights: str = None, unet_weights: str = None):
         revision=None,
     )
     return pipeline
-
-
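Aside: run_conversion can also be driven outside the Space. A minimal sketch (the URLs are placeholders; tf.keras.utils.get_file expects real downloadable files):

# Hypothetical end-to-end use; the weight URLs are not from this commit.
pipeline = run_conversion(
    text_encoder_weights="https://example.com/finetuned_text_encoder.h5",
    unet_weights="https://example.com/finetuned_unet.h5",
)
pipeline.save_pretrained("kerascv_sd_diffusers_pipeline")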
hub_utils/__init__.py CHANGED
@@ -1,2 +1,2 @@
-from .readme import save_model_card
-from .repo import push_to_hub
+from .readme import save_model_card
+from .repo import push_to_hub
hub_utils/readme.py CHANGED
@@ -23,7 +23,7 @@ The pipeline contained in this repository was created using [this Space](https://...)
     """
 
     if weight_paths is not None:
-        model_card += "Following weight paths (KerasCV) were used: {weight_paths}"
+        model_card += "Following weight paths (KerasCV) were used: {weight_paths}"
 
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
-        f.write(yaml + model_card)
+        f.write(yaml + model_card)
hub_utils/repo.py CHANGED
@@ -1,5 +1,6 @@
 from huggingface_hub import HfApi, create_repo
 
+
 def push_to_hub(hf_token: str, push_dir: str, repo_prefix: None) -> str:
     try:
         if hf_token == "":
@@ -7,9 +8,15 @@ def push_to_hub(hf_token: str, push_dir: str, repo_prefix: None) -> str:
         else:
             hf_api = HfApi(token=hf_token)
             user = hf_api.whoami()["name"]
-            repo_id = f"{user}/{push_dir}" if repo_prefix == "" else f"{user}/{repo_prefix}-{push_dir}"
+            repo_id = (
+                f"{user}/{push_dir}"
+                if repo_prefix == ""
+                else f"{user}/{repo_prefix}-{push_dir}"
+            )
             _ = create_repo(repo_id=repo_id, token=hf_token)
-            url = hf_api.upload_folder(folder_path=push_dir, repo_id=repo_id, exist_ok=True)
+            url = hf_api.upload_folder(
+                folder_path=push_dir, repo_id=repo_id, exist_ok=True
+            )
             return f"Model successfully pushed: [{url}]({url})"
     except Exception as e:
-        return f"{e}"
+        return f"{e}"
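Aside: the parenthesized conditional above resolves the target repo name. With illustrative values (not from this commit):

user, push_dir = "alice", "kerascv_sd_diffusers_pipeline"  # hypothetical
for repo_prefix in ["", "pokemon"]:
    repo_id = (
        f"{user}/{push_dir}"
        if repo_prefix == ""
        else f"{user}/{repo_prefix}-{push_dir}"
    )
    print(repo_id)
# alice/kerascv_sd_diffusers_pipeline
# alice/pokemon-kerascv_sd_diffusers_pipeline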