yibolu committed
Commit 6eca12e
1 Parent(s): a38262d

update pipeline and demos

Files changed (36)
  1. README.md +19 -11
  2. controlnet_img2img_demo.py +6 -4
  3. controlnet_txt2img_demo.py +11 -5
  4. controlnet_txt2img_sdxl_demo.py +70 -0
  5. img2img_demo.py +5 -2
  6. lyrasd_model/__init__.py +5 -1
  7. lyrasd_model/lora_util.py +238 -6
  8. lyrasd_model/lyrasd_controlnet_img2img_pipeline.py +92 -110
  9. lyrasd_model/lyrasd_controlnet_txt2img_pipeline.py +40 -82
  10. lyrasd_model/lyrasd_img2img_pipeline.py +90 -95
  11. lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so +0 -3
  12. lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so +0 -3
  13. lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so +2 -2
  14. lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so +2 -2
  15. lyrasd_model/lyrasd_pipeline_base.py +214 -0
  16. lyrasd_model/lyrasd_txt2img_inpaint_pipeline.py +826 -0
  17. lyrasd_model/lyrasd_txt2img_pipeline.py +172 -85
  18. lyrasd_model/lyrasd_vae_model.py +363 -0
  19. lyrasd_model/lyrasdxl_controlnet_txt2img_pipeline.py +346 -0
  20. lyrasd_model/lyrasdxl_pipeline_base.py +275 -0
  21. lyrasd_model/lyrasdxl_txt2img_inpaint_pipeline.py +535 -0
  22. lyrasd_model/lyrasdxl_txt2img_pipeline.py +267 -0
  23. lyrasd_model/{lyrasd_lib/placeholder.txt → module/__init__.py} +0 -0
  24. lyrasd_model/module/lyra_tool.py +5 -0
  25. lyrasd_model/module/lyrasd_ip_adapter.py +289 -0
  26. lyrasd_model/module/resampler.py +121 -0
  27. lyrasd_model/module/tools.py +148 -0
  28. models/README.md +14 -5
  29. outputs/res_controlnet_img2img_0.png +2 -2
  30. outputs/{res_controlnet_sdxl_txt2img.png → res_controlnet_sdxl_txt2img_0.png} +2 -2
  31. outputs/res_controlnet_txt2img_0.png +2 -2
  32. outputs/res_img2img_0.png +2 -2
  33. outputs/res_txt2img_lora_0.png +2 -2
  34. outputs/{res_sdxl_txt2img_lora_0.png → res_txt2img_xl_lora_0.png} +2 -2
  35. txt2img_demo.py +13 -10
  36. txt2img_sdxl_demo.py +55 -0
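
Taken together, the README and demo changes in this commit move every pipeline to a new loading flow: the TorchScript extension is loaded once with `torch.classes.load_library`, pipelines are constructed without arguments, and weights are attached afterwards with `reload_pipe` (ControlNets via `load_controlnet_model_v2`). A minimal sketch of that flow, assuming the library and model paths used in the demos below:

```python
import torch
from lyrasd_model import LyraSdTxt2ImgPipeline

# paths taken from the demo scripts below; adjust to your CUDA version / GPU arch
lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
model_path = "./models/rev-animated"

torch.classes.load_library(lib_path)   # load the C++/CUDA ops once per process

model = LyraSdTxt2ImgPipeline()        # pipelines no longer take (model_path, lib_path)
model.reload_pipe(model_path)          # attach the converted model weights
```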
README.md CHANGED
@@ -79,12 +79,16 @@ from lyrasd_model import LyraSdTxt2ImgPipeline
 # 4. scheduler config
 
 # LyraSD's compiled C++ dynamic library, which contains the C++/CUDA compute implementation
-lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so"
-model_path = "./models/lyrasd_rev_animated"
+lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
+model_path = "./models/rev-animated"
 lora_path = "./models/xiaorenshu.safetensors"
 
+torch.classes.load_library(lib_path)
+
 # build the Txt2Img pipeline
-model = LyraSdTxt2ImgPipeline(model_path, lib_path)
+model = LyraSdTxt2ImgPipeline()
+
+model.reload_pipe(model_path)
 
 # load lora
 # lora model path, name, lora strength
@@ -94,7 +98,7 @@ model.load_lora_v2(lora_path, "xiaorenshu", 0.4)
 prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
 negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
 height, width = 512, 512
-steps = 30
+steps = 20
 guidance_scale = 7
 generator = torch.Generator().manual_seed(123)
 num_images = 1
@@ -128,12 +132,16 @@ from lyrasd_model import LyraSdXLTxt2ImgPipeline
 # 4. scheduler config
 
 # LyraSD's compiled C++ dynamic library, which contains the C++/CUDA compute implementation
-lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so"
-model_path = "./models/lyrasd_helloworldSDXL20Fp16"
+lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
+model_path = "./models/helloworldSDXL20Fp16"
 lora_path = "./models/dissolve_sdxl.safetensors"
 
+torch.classes.load_library(lib_path)
+
 # build the Txt2Img pipeline
-model = LyraSdXLTxt2ImgPipeline(model_path, lib_path)
+model = LyraSdXLTxt2ImgPipeline()
+
+model.reload_pipe(model_path)
 
 # load lora
 # lora model path, name, lora strength
@@ -143,7 +151,7 @@ model.load_lora_v2(lora_path, "dissolve_sdxl", 0.4)
 prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
 negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
 height, width = 512, 512
-steps = 30
+steps = 20
 guidance_scale = 7
 generator = torch.Generator().manual_seed(123)
 num_images = 1
@@ -181,7 +189,7 @@ model.unload_lora_v2("dissolve_sdxl", True)
 ![text2img_demo](./outputs/res_sdxl_txt2img_0.png)
 
 #### SDXL Text2Img with Lora
-![text2img_demo](./outputs/res_sdxl_txt2img_lora_0.png)
+![text2img_demo](./outputs/res_txt2img_xl_lora_0.png)
 
 
 <!-- ### Img2Img
@@ -201,7 +209,7 @@ model.unload_lora_v2("dissolve_sdxl", True)
 ![text2img_demo](./outputs/res_controlnet_txt2img_0.png)
 
 #### SDXL ControlNet Text2Img Output
-![text2img_demo](./outputs/res_controlnet_sdxl_txt2img.png)
+![text2img_demo](./outputs/res_controlnet_sdxl_txt2img_0.png)
 
 
 ## Docker Environment Recommendation
@@ -218,7 +226,7 @@ python txt2img_demo.py
 
 ## Citation
 ``` bibtex
-@Misc{lyraSD_2023,
+@Misc{lyraSD_2024,
 author = {Kangjian Wu, Zhengtao Wang, Yibo Lu, Haoxiong Su, Sa Xiao, Bin Wu},
 title = {lyraSD: Accelerating Stable Diffusion with best flexibility},
 howpublished = {\url{https://huggingface.co/TMElyralab/lyraSD}},
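
Beyond the citation bump, the hunk contexts in this README diff also show the LoRA API the examples rely on (`load_lora_v2` / `unload_lora_v2`). Continuing the README example above, a minimal load/unload round-trip; the meaning of the final boolean is an assumption based on the `clean_cache` flag of the older `unload_lora`:

```python
# lora model path, name, lora strength (arguments as used in the README demo)
model.load_lora_v2("./models/xiaorenshu.safetensors", "xiaorenshu", 0.4)

# ... run inference as in the demo above ...

# remove the LoRA again; the boolean presumably also drops the cached LoRA weights
model.unload_lora_v2("xiaorenshu", True)
```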
controlnet_img2img_demo.py CHANGED
@@ -14,14 +14,16 @@ from lyrasd_model import LyraSdControlnetImg2ImgPipeline
 
 # LyraSD's compiled C++ dynamic library, which contains the C++/CUDA compute implementation
 lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
-model_path = "./models/lyrasd_rev_animated"
-canny_controlnet_path = "./models/lyrasd_canny"
+model_path = "./models/rev-animated"
+canny_controlnet_path = "./models/canny"
+torch.classes.load_library(lib_path)
 
 # build the Img2Img pipeline
-model = LyraSdControlnetImg2ImgPipeline(model_path, lib_path)
+model = LyraSdControlnetImg2ImgPipeline()
+model.reload_pipe(model_path)
 
 # load ControlNet models (at most 3 can be loaded at once)
-model.load_controlnet_model("canny", canny_controlnet_path, "fp32")
+model.load_controlnet_model_v2("canny", canny_controlnet_path)
 
 control_img = Image.open("control_bird_canny.png")
 
controlnet_txt2img_demo.py CHANGED
@@ -12,16 +12,22 @@ from lyrasd_model import LyraSdControlnetTxt2ImgPipeline
 # 5. scheduler config
 
 # LyraSD's compiled C++ dynamic library, which contains the C++/CUDA compute implementation
-lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
-model_path = "./models/lyrasd_rev_animated"
-canny_controlnet_path = "./models/lyrasd_canny"
+lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
+model_path = "./models/rev-animated"
+canny_controlnet_path = "./models/canny"
+
+torch.classes.load_library(lib_path)
+
 # build the Txt2Img pipeline
-pipe = LyraSdControlnetTxt2ImgPipeline(model_path, lib_path)
+pipe = LyraSdControlnetTxt2ImgPipeline()
+
+pipe.reload_pipe(model_path)
 
 # load ControlNet models (at most 3 can be loaded at once)
 start = time.perf_counter()
-pipe.load_controlnet_model("canny", canny_controlnet_path, "fp32")
+pipe.load_controlnet_model_v2("canny", canny_controlnet_path)
 print(f"controlnet load cost: {time.perf_counter() - start}")
+
 # use get_loaded_controlnet to get the list of ControlNets that are currently loaded
 print(pipe.get_loaded_controlnet())
 
controlnet_txt2img_sdxl_demo.py ADDED
@@ -0,0 +1,70 @@
+import torch
+import time
+from PIL import Image
+import numpy as np
+from lyrasd_model import LyraSdXLControlnetTxt2ImgPipeline
+import GPUtil
+
+# path to the model files; the directory should contain:
+# 1. the clip model
+# 2. the converted, optimized unet model
+# 3. the converted, optimized controlnet model
+# 4. the vae model
+# 5. the scheduler config
+lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
+model_path = "./models/helloworldSDXL20Fp16"
+torch.classes.load_library(lib_path)
+
+# build the Txt2Img pipeline
+pipe = LyraSdXLControlnetTxt2ImgPipeline()
+
+start = time.perf_counter()
+pipe.reload_pipe(model_path)
+print(f"pipeline load cost: {time.perf_counter() - start}")
+
+# load ControlNet models (at most 3 can be loaded at once)
+start = time.perf_counter()
+pipe.load_controlnet_model_v2("canny", "./models/controlnet-canny-sdxl-1.0")
+print(f"controlnet load cost: {time.perf_counter() - start}")
+
+# use get_loaded_controlnet to get the list of ControlNets that are currently loaded
+print(pipe.get_loaded_controlnet())
+
+# a loaded ControlNet can be removed again with unload_controlnet_model
+# pipe.unload_controlnet_model("canny")
+
+control_img = Image.open("control_bird_canny.png")
+
+# prepare the inputs and hyperparameters
+prompt = "a bird"
+negative_prompt = ""
+height, width = 1024, 1024
+steps = 20
+guidance_scale = 7.5
+generator = torch.Generator().manual_seed(123)
+num_images = 1
+guess_mode = False
+
+# up to 3 ControlNets can be loaded at once for a multi-ControlNet effect; the parameter lists must have matching lengths:
+# the ControlNet image list must be as long as controlnet_scale and controlnet_names, while each inner list must match the batch size,
+# so that corresponding indices line up
+controlnet_images = [[control_img]]
+controlnet_scale = [0.5]
+controlnet_names = ['canny']
+
+# run inference; the results are returned as ready-made PIL.Image objects
+for batch in [1]:
+    print(f"cur batch: {batch}")
+    for _ in range(3):
+        start = time.perf_counter()
+        images = pipe(prompt=prompt, height=height, width=width, num_inference_steps=steps,
+                      guidance_scale=guidance_scale, negative_prompt=negative_prompt, num_images_per_prompt=batch,
+                      generator=generator, controlnet_images=controlnet_images,
+                      controlnet_scale=controlnet_scale, controlnet_names=controlnet_names,
+                      guess_mode=guess_mode
+                      )
+        print("cur cost: ", time.perf_counter() - start)
+        GPUtil.showUtilization(all=True)
+# save the generated images
+for i, image in enumerate(images):
+    image.save(f"./outputs/res_controlnet_sdxl_txt2img_{i}.png")
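
The nesting convention used above generalizes to multiple ControlNets: the outer lists are aligned by index across `controlnet_names`, `controlnet_scale` and `controlnet_images`, and each inner image list matches the batch size. A hypothetical two-ControlNet setup continuing the demo above (the "depth" name, path and control image are placeholders, not part of this commit):

```python
# assumed second ControlNet; any converted SDXL ControlNet directory would do
pipe.load_controlnet_model_v2("depth", "./models/controlnet-depth-sdxl-1.0")

depth_img = Image.open("control_bird_depth.png")    # placeholder control image

controlnet_names = ["canny", "depth"]               # index i of every list refers to the same ControlNet
controlnet_scale = [0.5, 0.7]
controlnet_images = [[control_img], [depth_img]]    # one inner list per ControlNet, one image per batch item
```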
img2img_demo.py CHANGED
@@ -14,10 +14,13 @@ from lyrasd_model import LyraSDImg2ImgPipeline
 
 # LyraSD's compiled C++ dynamic library, which contains the C++/CUDA compute implementation
 lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
-model_path = "./models/lyrasd_rev_animated"
+model_path = "./models/rev-animated"
+
+torch.classes.load_library(lib_path)
 
 # build the Img2Img pipeline
-model = LyraSDImg2ImgPipeline(model_path, lib_path)
+model = LyraSDImg2ImgPipeline()
+model.reload_pipe(model_path)
 
 # prepare the inputs and hyperparameters
 prompt = "a cat, cartoon style"
lyrasd_model/__init__.py CHANGED
@@ -1,5 +1,9 @@
 from . import lyrasd_img2img_pipeline, lyrasd_txt2img_pipeline, lyrasd_controlnet_txt2img_pipeline, lyrasd_controlnet_img2img_pipeline
 from .lyrasd_txt2img_pipeline import LyraSdTxt2ImgPipeline
 from .lyrasd_img2img_pipeline import LyraSDImg2ImgPipeline
+from .lyrasd_txt2img_inpaint_pipeline import LyraSdTxt2ImgInpaintPipeline
 from .lyrasd_controlnet_txt2img_pipeline import LyraSdControlnetTxt2ImgPipeline
-from .lyrasd_controlnet_img2img_pipeline import LyraSdControlnetImg2ImgPipeline
+from .lyrasd_controlnet_img2img_pipeline import LyraSdControlnetImg2ImgPipeline
+from .lyrasdxl_txt2img_pipeline import LyraSdXLTxt2ImgPipeline
+from .lyrasdxl_controlnet_txt2img_pipeline import LyraSdXLControlnetTxt2ImgPipeline
+from .lyrasdxl_txt2img_inpaint_pipeline import LyraSdXLTxt2ImgInpaintPipeline
lyrasd_model/lora_util.py CHANGED
@@ -1,7 +1,18 @@
1
  import os
 
 
2
  import torch
3
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
4
  import numpy as np
 
 
 
 
 
 
 
 
 
 
5
 
6
  def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=1.0, lora_file_format="fp32", device="cuda:0"):
7
  if lora_file_format == "fp32":
@@ -14,9 +25,10 @@ def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=
14
  unload_dict = []
15
  # directly update weight in diffusers model
16
  for file in all_files:
17
-
18
  if 'text' in file.name:
19
- layer_infos = file.name.split('.')[0].split('text_model_')[-1].split('_')
 
20
  curr_layer = clip_model.text_model
21
  else:
22
  continue
@@ -39,9 +51,71 @@ def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=
39
  temp_name += '_'+layer_infos.pop(0)
40
  else:
41
  temp_name = layer_infos.pop(0)
42
- data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
43
  if len(curr_layer.weight.data) == 4:
44
- adding_weight = alpha * data.permute(0,3,1,2)
45
  else:
46
  adding_weight = alpha * data
47
  curr_layer.weight.data += adding_weight
@@ -51,4 +125,162 @@ def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=
51
  "added_weight": adding_weight
52
  }
53
  unload_dict.append(curr_layer_unload_data)
54
- return unload_dict
1
  import os
2
+ import re
3
+ import time
4
  import torch
 
5
  import numpy as np
6
+ from safetensors.torch import load_file
7
+ from diffusers.loaders import LoraLoaderMixin
8
+ from diffusers.loaders.lora_conversion_utils import _maybe_map_sgm_blocks_to_diffusers, _convert_kohya_lora_to_diffusers
9
+ from types import SimpleNamespace
10
+ import logging.handlers
11
+ LORA_PREFIX_UNET = "lora_unet"
12
+ LORA_PREFIX_TEXT_ENCODER = "lora_te"
13
+ LORA_UNET_LAYERS = ['lora_unet_down_blocks_0_attentions_0', 'lora_unet_down_blocks_0_attentions_1', 'lora_unet_down_blocks_1_attentions_0', 'lora_unet_down_blocks_1_attentions_1', 'lora_unet_down_blocks_2_attentions_0', 'lora_unet_down_blocks_2_attentions_1', 'lora_unet_mid_block_attentions_0', 'lora_unet_up_blocks_1_attentions_0',
14
+ 'lora_unet_up_blocks_1_attentions_1', 'lora_unet_up_blocks_1_attentions_2', 'lora_unet_up_blocks_2_attentions_0', 'lora_unet_up_blocks_2_attentions_1', 'lora_unet_up_blocks_2_attentions_2', 'lora_unet_up_blocks_3_attentions_0', 'lora_unet_up_blocks_3_attentions_1', 'lora_unet_up_blocks_3_attentions_2']
15
+
16
 
17
  def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=1.0, lora_file_format="fp32", device="cuda:0"):
18
  if lora_file_format == "fp32":
 
25
  unload_dict = []
26
  # directly update weight in diffusers model
27
  for file in all_files:
28
+
29
  if 'text' in file.name:
30
+ layer_infos = file.name.split('.')[0].split(
31
+ 'text_model_')[-1].split('_')
32
  curr_layer = clip_model.text_model
33
  else:
34
  continue
 
51
  temp_name += '_'+layer_infos.pop(0)
52
  else:
53
  temp_name = layer_infos.pop(0)
54
+ data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(
55
+ clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
56
+ if len(curr_layer.weight.data) == 4:
57
+ adding_weight = alpha * data.permute(0, 3, 1, 2)
58
+ else:
59
+ adding_weight = alpha * data
60
+ curr_layer.weight.data += adding_weight
61
+
62
+ curr_layer_unload_data = {
63
+ "layer": curr_layer,
64
+ "added_weight": adding_weight
65
+ }
66
+ unload_dict.append(curr_layer_unload_data)
67
+ return unload_dict
68
+
69
+
70
+ def add_xltext_lora_layer(clip_model, clip_model_2, lora_model_path, alpha=1.0, lora_file_format="fp32", device="cuda:0"):
71
+ if lora_file_format == "fp32":
72
+ model_dtype = np.float32
73
+ elif lora_file_format == "fp16":
74
+ model_dtype = np.float16
75
+ else:
76
+ raise Exception(f"unsupported model dtype: {lora_file_format}")
77
+ all_files = os.scandir(lora_model_path)
78
+ unload_dict = []
79
+ # directly update weight in diffusers model
80
+ for file in all_files:
81
+
82
+ if 'text' in file.name:
83
+ layer_infos = file.name.split('.')[0].split(
84
+ 'text_model_')[-1].split('_')
85
+ if "text_encoder_2" in file.name:
86
+ curr_layer = clip_model_2.text_model
87
+ elif "text_encoder" in file.name:
88
+ curr_layer = clip_model.text_model
89
+ else:
90
+ raise ValueError(
91
+ "Cannot identify clip model, need text_encoder or text_encoder_2 in filename, found: ", file.name)
92
+ else:
93
+ continue
94
+
95
+ # find the target layer
96
+ # find the target layer
97
+ temp_name = layer_infos.pop(0)
98
+ while len(layer_infos) > -1:
99
+ try:
100
+ curr_layer = curr_layer.__getattr__(temp_name)
101
+ if len(layer_infos) > 0:
102
+ temp_name = layer_infos.pop(0)
103
+ # if temp_name == "self":
104
+ # temp_name += "_" + layer_infos.pop(0)
105
+ # elif temp_name != "mlp" and len(layer_infos) == 1:
106
+ # temp_name += "_" + layer_infos.pop(0)
107
+ elif len(layer_infos) == 0:
108
+ break
109
+ except Exception:
110
+ if len(temp_name) > 0:
111
+ temp_name += '_'+layer_infos.pop(0)
112
+ else:
113
+ temp_name = layer_infos.pop(0)
114
+
115
+ data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(
116
+ clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
117
  if len(curr_layer.weight.data) == 4:
118
+ adding_weight = alpha * data.permute(0, 3, 1, 2)
119
  else:
120
  adding_weight = alpha * data
121
  curr_layer.weight.data += adding_weight
 
125
  "added_weight": adding_weight
126
  }
127
  unload_dict.append(curr_layer_unload_data)
128
+ return unload_dict
129
+
130
+ def lora_trans(state_dict):
131
+ loraload = LoraLoaderMixin()
132
+ unet_config = SimpleNamespace(**{'layers_per_block': 2})
133
+ state_dicts = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
134
+ state_dicts_trans, state_dicts_alpha = _convert_kohya_lora_to_diffusers(
135
+ state_dicts)
136
+ keys = list(state_dicts_trans.keys())
137
+ for k in keys:
138
+ key = k.replace('processor.', '')
139
+ for x in ['.lora_linear_layer.', '_lora.', '.lora.']:
140
+ key = key.replace(x, '.lora_')
141
+ if key.find('text_encoder') >= 0:
142
+ for x in ['q', 'k', 'v', 'out']:
143
+ key = key.replace(f'.to_{x}.', f'.{x}_proj.')
144
+ key = key.replace('to_out.', 'to_out.0.')
145
+ if key != k:
146
+ state_dicts_trans[key] = state_dicts_trans.pop(k)
147
+ alpha = torch.Tensor(list(set(list(state_dicts_alpha.values()))))
148
+ state_dicts_trans.update({'lora.alpha': alpha})
149
+
150
+ return state_dicts_trans
151
+
152
+
153
+ def load_state_dict(filename, need_trans=True):
154
+ state_dict = load_file(os.path.abspath(filename), device="cpu")
155
+ if need_trans:
156
+ state_dict = lora_trans(state_dict)
157
+ return state_dict
158
+
159
+
160
+ def move_state_dict_to_cuda(state_dict):
161
+ ret_state_dict = {}
162
+ for item in state_dict:
163
+ ret_state_dict[item] = state_dict[item].cuda()
164
+ return ret_state_dict
165
+
166
+
167
+ def add_lora_to_opt_model(state_dict, unet, clip_model, clip_model_2, alpha=1.0, need_trans=False):
168
+ # directly update weight in diffusers model
169
+ state_dict = move_state_dict_to_cuda(state_dict)
170
+
171
+ alpha_ks = list(filter(lambda x: x.find('.alpha') >= 0, state_dict))
172
+ lora_alpha = state_dict[alpha_ks[0]].item() if len(alpha_ks) > 0 else -1
173
+
174
+ visited = set()
175
+ for key in state_dict:
176
+ # print(key)
177
+ # it is suggested to print out the key, it usually will be something like below
178
+ # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
179
+
180
+ # as we have set the alpha beforehand, so just skip
181
+ if '.alpha' in key or key in visited:
182
+ continue
183
+
184
+ if "text" in key:
185
+ curr_layer = clip_model_2 if key.find(
186
+ 'text_encoder_2') >= 0 else clip_model
187
+
188
+ # if is_sdxl:
189
+ layer_infos = key.split('.')[1:]
190
+
191
+ for x in layer_infos:
192
+ try:
193
+ curr_layer = curr_layer.__getattr__(x)
194
+ except Exception:
195
+ break
196
+
197
+ # update weight
198
+ pair_keys = [key.replace("lora_down", "lora_up"),
199
+ key.replace("lora_up", "lora_down")]
200
+ weight_up, weight_down = state_dict[pair_keys[0]
201
+ ], state_dict[pair_keys[1]]
202
+
203
+ weight_scale = lora_alpha/weight_up.shape[1] if lora_alpha != -1 else 1.0
204
+
205
+ if len(weight_up.shape) == 4:
206
+ weight_up = weight_up.squeeze([2, 3])
207
+ weight_down = weight_down.squeeze([2, 3])
208
+ if len(weight_down.shape) == 4:
209
+ adding_weight = torch.einsum(
210
+ 'a b, b c h w -> a c h w', weight_up, weight_down)
211
+ else:
212
+ adding_weight = torch.mm(
213
+ weight_up, weight_down).unsqueeze(2).unsqueeze(3)
214
+ else:
215
+ adding_weight = torch.mm(weight_up, weight_down)
216
+ adding_weight = alpha * weight_scale * adding_weight
217
+
218
+ curr_layer.weight.data += adding_weight.to(torch.float16)
219
+ # update visited list
220
+ for item in pair_keys:
221
+ visited.add(item)
222
+
223
+ elif "unet" in key:
224
+ layer_infos = key
225
+ layer_infos = layer_infos.replace(".lora_up.weight", "")
226
+ layer_infos = layer_infos.replace(".lora_down.weight", "")
227
+
228
+ layer_infos = layer_infos[5:]
229
+ layer_names = layer_infos.split(".")
230
+
231
+ layers = []
232
+ i = 0
233
+ while i < len(layer_names):
234
+
235
+ if len(layers) >= 4:
236
+ layers[-1] += "_" + layer_names[i]
237
+ elif i + 1 < len(layer_names) and layer_names[i+1].isdigit():
238
+ layers.append(layer_names[i] + "_" + layer_names[i+1])
239
+ i += 1
240
+ elif len(layers) > 0 and "samplers" in layers[-1]:
241
+ layers[-1] += "_" + layer_names[i]
242
+ else:
243
+ layers.append(layer_names[i])
244
+ i += 1
245
+ layer_infos = ".".join(layers)
246
+
247
+ pair_keys = [key.replace("lora_down", "lora_up"),
248
+ key.replace("lora_up", "lora_down")]
249
+
250
+ # update weight
251
+ if len(state_dict[pair_keys[0]].shape) == 4:
252
+ weight_up = state_dict[pair_keys[0]].squeeze(
253
+ 3).squeeze(2).to(torch.float32)
254
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
255
+ weight_scale = lora_alpha/weight_up.shape[1] if lora_alpha != -1 else 1.0
256
+
257
+ weight_up, weight_down = state_dict[pair_keys[0]
258
+ ], state_dict[pair_keys[1]]
259
+ weight_up = weight_up.squeeze([2, 3]).to(torch.float32)
260
+ weight_down = weight_down.squeeze([2, 3]).to(torch.float32)
261
+ if len(weight_down.shape) == 4:
262
+ curr_layer_weight = weight_scale * \
263
+ torch.einsum('a b, b c h w -> a c h w',
264
+ weight_up, weight_down)
265
+ else:
266
+ curr_layer_weight = weight_scale * \
267
+ torch.mm(weight_up, weight_down).unsqueeze(
268
+ 2).unsqueeze(3)
269
+
270
+ curr_layer_weight = curr_layer_weight.permute(0, 2, 3, 1)
271
+
272
+ else:
273
+ weight_up = state_dict[pair_keys[0]].to(torch.float32)
274
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
275
+ weight_scale = lora_alpha/weight_up.shape[1] if lora_alpha != -1 else 1.0
276
+
277
+ curr_layer_weight = weight_scale * \
278
+ torch.mm(weight_up, weight_down)
279
+ #
280
+
281
+ curr_layer_weight = curr_layer_weight.to(torch.float16)
282
+
283
+ unet.load_lora_by_name(layers, curr_layer_weight, alpha)
284
+
285
+ for item in pair_keys:
286
+ visited.add(item)
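
The rewritten lora_util.py above adds helpers that convert kohya-style .safetensors LoRAs to the diffusers key layout (`lora_trans` / `load_state_dict`) and merge the scaled low-rank updates directly into the optimized UNet and CLIP weights (`add_lora_to_opt_model`, plus `add_xltext_lora_layer` for the two SDXL text encoders). How they chain together is only implied by the signatures in the diff; a sketch under that assumption, with the attribute names on `pipe` assumed rather than documented:

```python
from lyrasd_model.lora_util import load_state_dict, add_lora_to_opt_model

lora_file = "./models/dissolve_sdxl.safetensors"           # LoRA file from the README demo
state_dict = load_state_dict(lora_file, need_trans=True)   # load and convert kohya keys to diffusers layout

# merge the LoRA into an already-loaded SDXL pipeline
# (pipe.unet / pipe.text_encoder / pipe.text_encoder_2 are assumed attribute names)
add_lora_to_opt_model(state_dict, pipe.unet, pipe.text_encoder, pipe.text_encoder_2, alpha=0.4)
```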
lyrasd_model/lyrasd_controlnet_img2img_pipeline.py CHANGED
@@ -1,21 +1,18 @@
1
  import torch
2
  from typing import Any, Callable, Dict, List, Optional, Union
3
- from diffusers.schedulers import KarrasDiffusionSchedulers
4
  from diffusers.loaders import TextualInversionLoaderMixin
5
- from diffusers.models import AutoencoderKL
6
- from diffusers.utils import randn_tensor, logging
7
- from diffusers.schedulers import EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
8
  from diffusers.utils import PIL_INTERPOLATION
9
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
10
  import os
11
  import numpy as np
12
  import warnings
13
- from .lora_util import add_text_lora_layer
14
- import gc
15
 
16
  from PIL import Image
17
  import PIL
18
 
 
 
19
  import inspect
20
 
21
  import time
@@ -31,7 +28,8 @@ def numpy_to_pil(images):
31
  images = (images * 255).round().astype("uint8")
32
  if images.shape[-1] == 1:
33
  # special case for grayscale (single channel) images
34
- pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
 
35
  else:
36
  pil_images = [Image.fromarray(image) for image in images]
37
 
@@ -53,7 +51,8 @@ def preprocess(image):
53
  w, h = image[0].size
54
  w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
55
 
56
- image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
 
57
  image = np.concatenate(image, axis=0)
58
  image = np.array(image).astype(np.float32) / 255.0
59
  image = image.transpose(0, 3, 1, 2)
@@ -63,69 +62,11 @@ def preprocess(image):
63
  image = torch.cat(image, dim=0)
64
  return image
65
 
66
- class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
67
- def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
68
- self.device = device
69
- self.dtype = dtype
70
-
71
- torch.classes.load_library(lib_so_path)
72
-
73
- self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
74
- self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
75
- self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
76
-
77
- self.unet_in_channels = 4
78
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
79
- self.vae.enable_tiling()
80
- self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
81
- 3, # max num of controlnets
82
- "fp16" # inference dtype (can only use fp16 for now)
83
- )
84
-
85
- unet_path = os.path.join(model_path, "unet_bins/")
86
-
87
- self.reload_unet_model(unet_path, model_dtype)
88
-
89
- self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
90
-
91
- def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
92
- if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
93
- controlnet_path = controlnet_path + "/"
94
- self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
95
-
96
- def unload_controlnet_model(self, model_name):
97
- self.unet.unload_controlnet_model(model_name, True)
98
-
99
- def get_loaded_controlnet(self):
100
- return self.unet.get_loaded_controlnet()
101
-
102
- def reload_unet_model(self, unet_path, unet_file_format='fp32'):
103
- if len(unet_path) > 0 and unet_path[-1] != "/":
104
- unet_path = unet_path + "/"
105
- return self.unet.reload_unet_model(unet_path, unet_file_format)
106
-
107
- def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
108
- if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
109
- lora_model_path = lora_model_path + "/"
110
- lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
111
- self.loaded_lora[lora_name] = lora
112
- self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
113
 
114
- def unload_lora(self, lora_name, clean_cache=False):
115
- for layer_data in self.loaded_lora[lora_name]:
116
- layer = layer_data['layer']
117
- added_weight = layer_data['added_weight']
118
- layer.weight.data -= added_weight
119
- self.unet.unload_lora(lora_name, clean_cache)
120
- del self.loaded_lora[lora_name]
121
- gc.collect()
122
- torch.cuda.empty_cache()
123
-
124
- def clean_lora_cache(self):
125
- self.unet.clean_lora_cache()
126
-
127
- def get_loaded_lora(self):
128
- return self.unet.get_loaded_lora()
129
 
130
  def _encode_prompt(
131
  self,
@@ -181,13 +122,14 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
181
  return_tensors="pt",
182
  )
183
  text_input_ids = text_inputs.input_ids
184
- untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
 
185
 
186
  if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
187
  text_input_ids, untruncated_ids
188
  ):
189
  removed_text = self.tokenizer.batch_decode(
190
- untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
191
  )
192
  logger.warning(
193
  "The following part of your input was truncated because CLIP can only handle sequences up to"
@@ -205,12 +147,14 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
205
  )
206
  prompt_embeds = prompt_embeds[0]
207
 
208
- prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
 
209
 
210
  bs_embed, seq_len, _ = prompt_embeds.shape
211
  # duplicate text embeddings for each generation per prompt, using mps friendly method
212
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
213
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
 
214
 
215
  # get unconditional embeddings for classifier free guidance
216
  if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -235,7 +179,8 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
235
 
236
  # textual inversion: procecss multi-vector tokens if necessary
237
  if isinstance(self, TextualInversionLoaderMixin):
238
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
 
239
 
240
  max_length = prompt_embeds.shape[1]
241
  uncond_input = self.tokenizer(
@@ -261,10 +206,13 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
261
  # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
262
  seq_len = negative_prompt_embeds.shape[1]
263
 
264
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
 
265
 
266
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
267
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
 
268
 
269
  # For classifier free guidance, we need to do two forward passes.
270
  # Here we concatenate the unconditional and text embeddings into a single batch
@@ -272,7 +220,6 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
272
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
273
 
274
  return prompt_embeds
275
-
276
 
277
  def decode_latents(self, latents):
278
  latents = 1 / self.vae.config.scaling_factor * latents
@@ -282,6 +229,17 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
282
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
283
  return image
284
 
 
 
 
 
 
 
 
 
 
 
 
285
  def check_inputs(
286
  self,
287
  prompt,
@@ -291,8 +249,9 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
291
  prompt_embeds=None,
292
  negative_prompt_embeds=None,
293
  ):
294
- if height % 64 != 0 or width % 64 != 0: # 初版暂时只支持 64 的倍数的 height 和 width
295
- raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
 
296
 
297
  if prompt is not None and prompt_embeds is not None:
298
  raise ValueError(
@@ -304,7 +263,8 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
304
  "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
305
  )
306
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
307
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
308
 
309
  if negative_prompt is not None and negative_prompt_embeds is not None:
310
  raise ValueError(
@@ -342,13 +302,14 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
342
 
343
  elif isinstance(generator, list):
344
  init_latents = [
345
- self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
346
  ]
347
  init_latents = torch.cat(init_latents, dim=0)
348
  else:
349
- init_latents = self.vae.encode(image).latent_dist.sample(generator)
 
350
 
351
- init_latents = self.vae.config.scaling_factor * init_latents
352
 
353
  if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
354
  # expand init_latents for batch_size
@@ -358,9 +319,9 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
358
  " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
359
  " your script to pass as many initial images as text prompts to suppress this warning."
360
  )
361
- deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
362
  additional_image_per_prompt = batch_size // init_latents.shape[0]
363
- init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
 
364
  elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
365
  raise ValueError(
366
  f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
@@ -369,7 +330,8 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
369
  init_latents = torch.cat([init_latents], dim=0)
370
 
371
  shape = init_latents.shape
372
- noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
 
373
 
374
  # get latents
375
  init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -398,7 +360,8 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
398
 
399
  for image_ in image:
400
  image_ = image_.convert("RGB")
401
- image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
 
402
  image_ = np.array(image_)
403
  image_ = image_[None, :]
404
  images.append(image_)
@@ -434,27 +397,29 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
434
  # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
435
  # and should be between [0, 1]
436
 
437
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
 
438
  extra_step_kwargs = {}
439
  if accepts_eta:
440
  extra_step_kwargs["eta"] = eta
441
 
442
  # check if the scheduler accepts generator
443
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
 
444
  if accepts_generator:
445
  extra_step_kwargs["generator"] = generator
446
  return extra_step_kwargs
447
 
448
  def get_timesteps(self, num_inference_steps, strength, device):
449
  # get the original timestep using init_timestep
450
- init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
 
451
 
452
  t_start = max(num_inference_steps - init_timestep, 0)
453
  timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
454
 
455
  return timesteps, num_inference_steps - t_start
456
 
457
-
458
  @torch.no_grad()
459
  def __call__(
460
  self,
@@ -477,9 +442,10 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
477
  controlnet_images: Optional[List[PIL.Image.Image]] = None,
478
  controlnet_scale: Optional[List[float]] = None,
479
  controlnet_names: Optional[List[str]] = None,
480
- guess_mode = False,
481
  eta: float = 0.0,
482
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
 
483
  latents: Optional[torch.FloatTensor] = None,
484
  prompt_embeds: Optional[torch.FloatTensor] = None,
485
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
@@ -549,7 +515,6 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
549
  # corresponds to doing no classifier free guidance.
550
  do_classifier_free_guidance = guidance_scale > 1.0
551
 
552
-
553
  # 3. Encode input prompt
554
  start = time.perf_counter()
555
  prompt_embeds = self._encode_prompt(
@@ -583,17 +548,21 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
583
  scales = [1.0, ] * 13
584
  if guess_mode:
585
  scales = torch.logspace(-1, 0, 13).tolist()
586
-
587
  for scale in controlnet_scale:
588
  scales_ = [d * scale for d in scales]
589
  control_scales.append(scales_)
590
 
591
- image = preprocess(image)
592
-
 
 
593
  # 5. set timesteps
594
  self.scheduler.set_timesteps(num_inference_steps, device=device)
595
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
596
- latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
 
597
 
598
  # 6. Prepare latent variables
599
  latents = self.prepare_latents(
@@ -604,33 +573,46 @@ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
604
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
605
 
606
  # 8. Denoising loop
607
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
 
608
 
609
  start_unet = time.perf_counter()
610
  for i, t in enumerate(timesteps):
611
  # expand the latents if we are doing classifier free guidance
612
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
613
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
614
- latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
 
 
 
615
 
616
  # 后边三个 None 是给到controlnet 的参数,暂时给到 None 当 placeholder
617
- noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
 
618
 
619
  noise_pred = noise_pred.permute(0, 3, 1, 2)
620
  # perform guidance
621
 
622
  if do_classifier_free_guidance:
623
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
624
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
625
 
626
  # compute the previous noisy sample x_t -> x_t-1
627
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
 
628
 
629
  torch.cuda.synchronize()
630
 
 
 
 
631
  start = time.perf_counter()
632
- image = self.decode_latents(latents)
 
633
  torch.cuda.synchronize()
 
 
634
  image = numpy_to_pil(image)
635
 
636
  return image
 
1
  import torch
2
  from typing import Any, Callable, Dict, List, Optional, Union
 
3
  from diffusers.loaders import TextualInversionLoaderMixin
4
+ from diffusers.utils.torch_utils import logging, randn_tensor
 
 
5
  from diffusers.utils import PIL_INTERPOLATION
6
+
7
  import os
8
  import numpy as np
9
  import warnings
 
 
10
 
11
  from PIL import Image
12
  import PIL
13
 
14
+ from .lyrasd_pipeline_base import LyraSDXLPipelineBase
15
+
16
  import inspect
17
 
18
  import time
 
28
  images = (images * 255).round().astype("uint8")
29
  if images.shape[-1] == 1:
30
  # special case for grayscale (single channel) images
31
+ pil_images = [Image.fromarray(image.squeeze(), mode="L")
32
+ for image in images]
33
  else:
34
  pil_images = [Image.fromarray(image) for image in images]
35
 
 
51
  w, h = image[0].size
52
  w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
53
 
54
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[
55
+ None, :] for i in image]
56
  image = np.concatenate(image, axis=0)
57
  image = np.array(image).astype(np.float32) / 255.0
58
  image = image.transpose(0, 3, 1, 2)
 
62
  image = torch.cat(image, dim=0)
63
  return image
64
 
 
 
 
 
 
65
 
66
+ class LyraSdControlnetImg2ImgPipeline(LyraSDXLPipelineBase):
67
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.18215) -> None:
68
+ super().__init__(device, dtype, vae_scale_factor=vae_scale_factor,
69
+ vae_scaling_factor=vae_scaling_factor)
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def _encode_prompt(
72
  self,
 
122
  return_tensors="pt",
123
  )
124
  text_input_ids = text_inputs.input_ids
125
+ untruncated_ids = self.tokenizer(
126
+ prompt, padding="longest", return_tensors="pt").input_ids
127
 
128
  if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
129
  text_input_ids, untruncated_ids
130
  ):
131
  removed_text = self.tokenizer.batch_decode(
132
+ untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
133
  )
134
  logger.warning(
135
  "The following part of your input was truncated because CLIP can only handle sequences up to"
 
147
  )
148
  prompt_embeds = prompt_embeds[0]
149
 
150
+ prompt_embeds = prompt_embeds.to(
151
+ dtype=self.text_encoder.dtype, device=device)
152
 
153
  bs_embed, seq_len, _ = prompt_embeds.shape
154
  # duplicate text embeddings for each generation per prompt, using mps friendly method
155
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
156
+ prompt_embeds = prompt_embeds.view(
157
+ bs_embed * num_images_per_prompt, seq_len, -1)
158
 
159
  # get unconditional embeddings for classifier free guidance
160
  if do_classifier_free_guidance and negative_prompt_embeds is None:
 
179
 
180
  # textual inversion: procecss multi-vector tokens if necessary
181
  if isinstance(self, TextualInversionLoaderMixin):
182
+ uncond_tokens = self.maybe_convert_prompt(
183
+ uncond_tokens, self.tokenizer)
184
 
185
  max_length = prompt_embeds.shape[1]
186
  uncond_input = self.tokenizer(
 
206
  # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
207
  seq_len = negative_prompt_embeds.shape[1]
208
 
209
+ negative_prompt_embeds = negative_prompt_embeds.to(
210
+ dtype=self.text_encoder.dtype, device=device)
211
 
212
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
213
+ 1, num_images_per_prompt, 1)
214
+ negative_prompt_embeds = negative_prompt_embeds.view(
215
+ batch_size * num_images_per_prompt, seq_len, -1)
216
 
217
  # For classifier free guidance, we need to do two forward passes.
218
  # Here we concatenate the unconditional and text embeddings into a single batch
 
220
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
221
 
222
  return prompt_embeds
 
223
 
224
  def decode_latents(self, latents):
225
  latents = 1 / self.vae.config.scaling_factor * latents
 
229
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
230
  return image
231
 
232
+ def lyra_decode_latents(self, latents):
233
+ print("lyra_decode_latents")
234
+ latents = 1 / self.vae_scaling_factor * latents
235
+ image = self.vae.decode(latents)
236
+ image = image.permute(0, 2, 3, 1)
237
+
238
+ image = (image / 2 + 0.5).clamp(0, 1)
239
+ image = image.cpu().float().numpy()
240
+
241
+ return image
242
+
243
  def check_inputs(
244
  self,
245
  prompt,
 
249
  prompt_embeds=None,
250
  negative_prompt_embeds=None,
251
  ):
252
+ if height % 64 != 0 or width % 64 != 0: # 初版暂时只支持 64 的倍数的 height 和 width
253
+ raise ValueError(
254
+ f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
255
 
256
  if prompt is not None and prompt_embeds is not None:
257
  raise ValueError(
 
263
  "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
264
  )
265
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
266
+ raise ValueError(
267
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
268
 
269
  if negative_prompt is not None and negative_prompt_embeds is not None:
270
  raise ValueError(
 
302
 
303
  elif isinstance(generator, list):
304
  init_latents = [
305
+ self.vae.encode(image[i: i + 1]).sample(generator[i]) for i in range(batch_size)
306
  ]
307
  init_latents = torch.cat(init_latents, dim=0)
308
  else:
309
+ init_latents = self.vae.encode(
310
+ image).sample(generator)
311
 
312
+ init_latents = self.vae.scaling_factor * init_latents
313
 
314
  if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
315
  # expand init_latents for batch_size
 
319
  " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
320
  " your script to pass as many initial images as text prompts to suppress this warning."
321
  )
 
322
  additional_image_per_prompt = batch_size // init_latents.shape[0]
323
+ init_latents = torch.cat(
324
+ [init_latents] * additional_image_per_prompt, dim=0)
325
  elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
326
  raise ValueError(
327
  f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
 
330
  init_latents = torch.cat([init_latents], dim=0)
331
 
332
  shape = init_latents.shape
333
+ noise = randn_tensor(shape, generator=generator,
334
+ device=device, dtype=dtype)
335
 
336
  # get latents
337
  init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
 
360
 
361
  for image_ in image:
362
  image_ = image_.convert("RGB")
363
+ image_ = image_.resize(
364
+ (width, height), resample=PIL_INTERPOLATION["lanczos"])
365
  image_ = np.array(image_)
366
  image_ = image_[None, :]
367
  images.append(image_)
 
397
  # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
398
  # and should be between [0, 1]
399
 
400
+ accepts_eta = "eta" in set(inspect.signature(
401
+ self.scheduler.step).parameters.keys())
402
  extra_step_kwargs = {}
403
  if accepts_eta:
404
  extra_step_kwargs["eta"] = eta
405
 
406
  # check if the scheduler accepts generator
407
+ accepts_generator = "generator" in set(
408
+ inspect.signature(self.scheduler.step).parameters.keys())
409
  if accepts_generator:
410
  extra_step_kwargs["generator"] = generator
411
  return extra_step_kwargs
412
 
413
  def get_timesteps(self, num_inference_steps, strength, device):
414
  # get the original timestep using init_timestep
415
+ init_timestep = min(
416
+ int(num_inference_steps * strength), num_inference_steps)
417
 
418
  t_start = max(num_inference_steps - init_timestep, 0)
419
  timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
420
 
421
  return timesteps, num_inference_steps - t_start
422
 
 
423
  @torch.no_grad()
424
  def __call__(
425
  self,
 
442
  controlnet_images: Optional[List[PIL.Image.Image]] = None,
443
  controlnet_scale: Optional[List[float]] = None,
444
  controlnet_names: Optional[List[str]] = None,
445
+ guess_mode=False,
446
  eta: float = 0.0,
447
+ generator: Optional[Union[torch.Generator,
448
+ List[torch.Generator]]] = None,
449
  latents: Optional[torch.FloatTensor] = None,
450
  prompt_embeds: Optional[torch.FloatTensor] = None,
451
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
 
515
  # corresponds to doing no classifier free guidance.
516
  do_classifier_free_guidance = guidance_scale > 1.0
517
 
 
518
  # 3. Encode input prompt
519
  start = time.perf_counter()
520
  prompt_embeds = self._encode_prompt(
 
548
  scales = [1.0, ] * 13
549
  if guess_mode:
550
  scales = torch.logspace(-1, 0, 13).tolist()
551
+
552
  for scale in controlnet_scale:
553
  scales_ = [d * scale for d in scales]
554
  control_scales.append(scales_)
555
 
556
+ print(f"clip cost: {(time.perf_counter() - start)* 1000}")
557
+
558
+ image = self.image_processor.preprocess(image)
559
+
560
  # 5. set timesteps
561
  self.scheduler.set_timesteps(num_inference_steps, device=device)
562
+ timesteps, num_inference_steps = self.get_timesteps(
563
+ num_inference_steps, strength, device)
564
+ latent_timestep = timesteps[:1].repeat(
565
+ batch_size * num_images_per_prompt)
566
 
567
  # 6. Prepare latent variables
568
  latents = self.prepare_latents(
 
573
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
574
 
575
  # 8. Denoising loop
576
+ num_warmup_steps = len(timesteps) - \
577
+ num_inference_steps * self.scheduler.order
578
 
579
  start_unet = time.perf_counter()
580
  for i, t in enumerate(timesteps):
581
  # expand the latents if we are doing classifier free guidance
582
+ latent_model_input = torch.cat(
583
+ [latents] * 2) if do_classifier_free_guidance else latents
584
+ latent_model_input = self.scheduler.scale_model_input(
585
+ latent_model_input, t)
586
+ latent_model_input = latent_model_input.permute(
587
+ 0, 2, 3, 1).contiguous()
588
 
589
  # 后边三个 None 是给到controlnet 的参数,暂时给到 None 当 placeholder
590
+ noise_pred = self.unet.forward(
591
+ latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
592
 
593
  noise_pred = noise_pred.permute(0, 3, 1, 2)
594
  # perform guidance
595
 
596
  if do_classifier_free_guidance:
597
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
598
+ noise_pred = noise_pred_uncond + guidance_scale * \
599
+ (noise_pred_text - noise_pred_uncond)
600
 
601
  # compute the previous noisy sample x_t -> x_t-1
602
+ latents = self.scheduler.step(
603
+ noise_pred, t, latents, **extra_step_kwargs).prev_sample
604
 
605
  torch.cuda.synchronize()
606
 
607
+ print(
608
+ f"unet x {num_inference_steps} cost: {(time.perf_counter() - start_unet) * 1000}")
609
+
610
  start = time.perf_counter()
611
+ # image = self.decode_latents(latents)
612
+ image = self.lyra_decode_latents(latents)
613
  torch.cuda.synchronize()
614
+ print(f"vae cost: {(time.perf_counter() - start)* 1000}")
615
+ print()
616
  image = numpy_to_pil(image)
617
 
618
  return image
lyrasd_model/lyrasd_controlnet_txt2img_pipeline.py CHANGED
@@ -1,12 +1,8 @@
1
  import torch
2
  from typing import Any, Callable, Dict, List, Optional, Union
3
- from diffusers.schedulers import KarrasDiffusionSchedulers
4
  from diffusers.loaders import TextualInversionLoaderMixin
5
- from diffusers.models import AutoencoderKL
6
- from diffusers.utils import randn_tensor, logging
7
- from diffusers.schedulers import EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
8
  from diffusers.utils import PIL_INTERPOLATION
9
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
10
  import os
11
  import numpy as np
12
  from .lora_util import add_text_lora_layer
@@ -17,6 +13,7 @@ import PIL
17
  import inspect
18
 
19
  import time
 
20
 
21
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
 
@@ -36,68 +33,11 @@ def numpy_to_pil(images):
36
  return pil_images
37
 
38
 
39
- class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
40
- def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
41
- self.device = device
42
- self.dtype = dtype
43
-
44
- torch.classes.load_library(lib_so_path)
45
-
46
- self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
47
- self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
48
- self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
49
- self.unet_in_channels = 4
50
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
51
- self.vae.enable_tiling()
52
- self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
53
- 3, # max num of controlnets
54
- "fp16" # inference dtype (can only use fp16 for now)
55
- )
56
-
57
- unet_path = os.path.join(model_path, "unet_bins/")
58
- self.reload_unet_model(unet_path, model_dtype)
59
-
60
- self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
61
-
62
- def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
63
- if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
64
- controlnet_path = controlnet_path + "/"
65
- self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
66
-
67
- def unload_controlnet_model(self, model_name):
68
- self.unet.unload_controlnet_model(model_name, True)
69
-
70
- def get_loaded_controlnet(self):
71
- return self.unet.get_loaded_controlnet()
72
-
73
- def reload_unet_model(self, unet_path, unet_file_format='fp32'):
74
- if len(unet_path) > 0 and unet_path[-1] != "/":
75
- unet_path = unet_path + "/"
76
- return self.unet.reload_unet_model(unet_path, unet_file_format)
77
-
78
- def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
79
- if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
80
- lora_model_path = lora_model_path + "/"
81
- lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
82
- self.loaded_lora[lora_name] = lora
83
- self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
84
-
85
- def unload_lora(self, lora_name, clean_cache=False):
86
- for layer_data in self.loaded_lora[lora_name]:
87
- layer = layer_data['layer']
88
- added_weight = layer_data['added_weight']
89
- layer.weight.data -= added_weight
90
- self.unet.unload_lora(lora_name, clean_cache)
91
- del self.loaded_lora[lora_name]
92
- gc.collect()
93
- torch.cuda.empty_cache()
94
-
95
- def clean_lora_cache(self):
96
- self.unet.clean_lora_cache()
97
-
98
- def get_loaded_lora(self):
99
- return self.unet.get_loaded_lora()
100
-
101
  def _encode_prompt(
102
  self,
103
  prompt,
@@ -253,6 +193,23 @@ class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
253
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
254
  return image
255
 
256
  def check_inputs(
257
  self,
258
  prompt,
@@ -342,21 +299,8 @@ class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
342
  elif isinstance(image[0], torch.Tensor):
343
  image = torch.cat(image, dim=0)
344
 
345
- image_batch_size = image.shape[0]
346
-
347
- if image_batch_size == 1:
348
- repeat_by = batch_size
349
- else:
350
- # image batch size is the same as prompt batch size
351
- repeat_by = num_images_per_prompt
352
-
353
- image = image.repeat_interleave(repeat_by, dim=0)
354
-
355
  image = image.to(device=device, dtype=dtype)
356
 
357
- if do_classifier_free_guidance and not guess_mode:
358
- image = torch.cat([image] * 2)
359
-
360
  return image
361
 
362
  def prepare_extra_step_kwargs(self, generator, eta):
@@ -376,6 +320,18 @@ class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
376
  extra_step_kwargs["generator"] = generator
377
  return extra_step_kwargs
378
 
379
  @torch.no_grad()
380
  def __call__(
381
  self,
@@ -527,7 +483,7 @@ class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
527
  latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
528
  latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
529
 
530
- # The trailing three Nones are the ControlNet arguments; None is passed as a placeholder for now
531
  noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
532
 
533
  noise_pred = noise_pred.permute(0, 3, 1, 2)
@@ -540,7 +496,9 @@ class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
540
  # compute the previous noisy sample x_t -> x_t-1
541
  latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
542
 
543
- image = self.decode_latents(latents)
 
 
544
  image = numpy_to_pil(image)
545
 
546
  return image
 
1
  import torch
2
  from typing import Any, Callable, Dict, List, Optional, Union
 
3
  from diffusers.loaders import TextualInversionLoaderMixin
4
+ from diffusers.utils import logging
+ from diffusers.utils.torch_utils import randn_tensor
 
 
5
  from diffusers.utils import PIL_INTERPOLATION
 
6
  import os
7
  import numpy as np
8
  from .lora_util import add_text_lora_layer
 
13
  import inspect
14
 
15
  import time
16
+ from .lyrasd_pipeline_base import LyraSDXLPipelineBase
17
 
18
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
19
 
 
33
  return pil_images
34
 
35
 
36
+ class LyraSdControlnetTxt2ImgPipeline(LyraSDXLPipelineBase):
37
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.18215) -> None:
38
+ super().__init__(device, dtype, vae_scale_factor=vae_scale_factor,
39
+ vae_scaling_factor=vae_scaling_factor)
40
+
41
  def _encode_prompt(
42
  self,
43
  prompt,
 
193
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
194
  return image
195
 
196
+ def lyra_decode_latents(self, latents):
197
+ print("lyra_decode_latents")
198
+ # np.save("", latents.)
199
+ # np.save(f"/workspace/vae_model/latent.npy", latents.detach().cpu().numpy())
200
+ latents = 1 / self.vae_scaling_factor * latents
201
+ latents = latents.permute(0, 2, 3, 1).contiguous()
202
+ image = self.vae.vae_decode(latents)
203
+
204
+ # print(image)
205
+ # GPUtil.showUtilization(all=True)
206
+
207
+ image = (image / 2 + 0.5).clamp(0, 1)
208
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
209
+ image = image.cpu().float().numpy()
210
+
211
+ return image
212
+
213
  def check_inputs(
214
  self,
215
  prompt,
 
299
  elif isinstance(image[0], torch.Tensor):
300
  image = torch.cat(image, dim=0)
301
 
302
  image = image.to(device=device, dtype=dtype)
303
 
 
 
 
304
  return image
305
 
306
  def prepare_extra_step_kwargs(self, generator, eta):
 
320
  extra_step_kwargs["generator"] = generator
321
  return extra_step_kwargs
322
 
323
+ def lyra_decode_latents(self, latents):
324
+ print("lyra_decode_latents")
325
+ latents = 1 / self.vae_scaling_factor * latents
326
+ image = self.vae.decode(latents)
327
+ image = image.permute(0, 2, 3, 1)
328
+ image = (image / 2 + 0.5).clamp(0, 1)
329
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
330
+ image = image.cpu().float().numpy()
331
+
332
+ return image
333
+
334
+
335
  @torch.no_grad()
336
  def __call__(
337
  self,
 
483
  latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
484
  latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
485
 
486
487
  noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
488
 
489
  noise_pred = noise_pred.permute(0, 3, 1, 2)
 
496
  # compute the previous noisy sample x_t -> x_t-1
497
  latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
498
 
499
+ # image = self.decode_latents(latents)
500
+ image = self.lyra_decode_latents(latents)
501
+
502
  image = numpy_to_pil(image)
503
 
504
  return image
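
With this change the ControlNet text-to-image pipeline no longer takes a model path or .so path in its constructor; weights are attached afterwards through the shared base class. A minimal usage sketch under those assumptions (the paths are placeholders, the class export from lyrasd_model follows the demo scripts, and the generation keywords are left to controlnet_txt2img_demo.py):

import torch
from lyrasd_model import LyraSdControlnetTxt2ImgPipeline

model_path = "./models/lyrasd_rev_animated"   # placeholder: converted SD model dir
canny_path = "./models/canny"                 # placeholder: converted ControlNet dir

pipe = LyraSdControlnetTxt2ImgPipeline()      # defaults: cuda, fp16, vae_scale_factor=8
pipe.reload_pipe(model_path)                  # tokenizer, text encoder, UNet, VAE, scheduler
pipe.load_controlnet_model_v2("canny", canny_path)
print(pipe.get_loaded_controlnet())           # should report the "canny" ControlNet

# The generation call (prompt, control image, control scales, guess mode) follows
# controlnet_txt2img_demo.py; see that script for the exact keyword names.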
lyrasd_model/lyrasd_img2img_pipeline.py CHANGED
@@ -8,13 +8,12 @@ import numpy as np
8
  import PIL
9
  import torch
10
  from diffusers.loaders import TextualInversionLoaderMixin
11
- from diffusers.models import AutoencoderKL
12
- from diffusers.schedulers import EulerAncestralDiscreteScheduler
13
- from diffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor
14
  from PIL import Image
15
- from transformers import CLIPTextModel, CLIPTokenizer
16
- from .lora_util import add_text_lora_layer
17
- import gc
18
 
19
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
20
 
@@ -28,7 +27,8 @@ def numpy_to_pil(images):
28
  images = (images * 255).round().astype("uint8")
29
  if images.shape[-1] == 1:
30
  # special case for grayscale (single channel) images
31
- pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
 
32
  else:
33
  pil_images = [Image.fromarray(image) for image in images]
34
 
@@ -50,7 +50,8 @@ def preprocess(image):
50
  w, h = image[0].size
51
  w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
52
 
53
- image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
 
54
  image = np.concatenate(image, axis=0)
55
  image = np.array(image).astype(np.float32) / 255.0
56
  image = image.transpose(0, 3, 1, 2)
@@ -61,60 +62,13 @@ def preprocess(image):
61
  return image
62
 
63
 
64
- class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
65
- def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
66
- self.device = device
67
- self.dtype = dtype
68
-
69
- torch.classes.load_library(lib_so_path)
70
-
71
- self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
72
- self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
73
- self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
74
- unet_path = os.path.join(model_path, "unet_bins/")
75
-
76
- self.unet_in_channels = 4
77
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
78
- self.vae.enable_tiling()
79
- self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
80
- 3, # max num of controlnets
81
- "fp16" # inference dtype (can only use fp16 for now)
82
- )
83
-
84
- self.reload_unet_model(unet_path, model_dtype)
85
-
86
- self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
87
-
88
- def reload_unet_model(self, unet_path, unet_file_format='fp32'):
89
- if len(unet_path) > 0 and unet_path[-1] != "/":
90
- unet_path = unet_path + "/"
91
- return self.unet.reload_unet_model(unet_path, unet_file_format)
92
-
93
- def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
94
- if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
95
- lora_model_path = lora_model_path + "/"
96
- lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
97
- self.loaded_lora[lora_name] = lora
98
- self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
99
-
100
- def unload_lora(self, lora_name, clean_cache=False):
101
- for layer_data in self.loaded_lora[lora_name]:
102
- layer = layer_data['layer']
103
- added_weight = layer_data['added_weight']
104
- layer.weight.data -= added_weight
105
- self.unet.unload_lora(lora_name, clean_cache)
106
- del self.loaded_lora[lora_name]
107
- gc.collect()
108
- torch.cuda.empty_cache()
109
-
110
- def clean_lora_cache(self):
111
- self.unet.clean_lora_cache()
112
-
113
- def get_loaded_lora(self):
114
- return self.unet.get_loaded_lora()
115
-
116
 
117
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
 
118
  def _encode_prompt(
119
  self,
120
  prompt,
@@ -170,7 +124,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
170
  return_tensors="pt",
171
  )
172
  text_input_ids = text_inputs.input_ids
173
- untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
 
174
 
175
  if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
176
  text_input_ids, untruncated_ids
@@ -201,12 +156,14 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
201
  else:
202
  prompt_embeds_dtype = prompt_embeds.dtype
203
 
204
- prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
 
205
 
206
  bs_embed, seq_len, _ = prompt_embeds.shape
207
  # duplicate text embeddings for each generation per prompt, using mps friendly method
208
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
209
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
 
210
 
211
  # get unconditional embeddings for classifier free guidance
212
  if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -231,7 +188,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
231
 
232
  # textual inversion: process multi-vector tokens if necessary
233
  if isinstance(self, TextualInversionLoaderMixin):
234
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
 
235
 
236
  max_length = prompt_embeds.shape[1]
237
  uncond_input = self.tokenizer(
@@ -257,10 +215,13 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
257
  # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
258
  seq_len = negative_prompt_embeds.shape[1]
259
 
260
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
 
261
 
262
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
263
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
 
264
 
265
  # For classifier free guidance, we need to do two forward passes.
266
  # Here we concatenate the unconditional and text embeddings into a single batch
@@ -286,13 +247,15 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
286
  # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
287
  # and should be between [0, 1]
288
 
289
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
 
290
  extra_step_kwargs = {}
291
  if accepts_eta:
292
  extra_step_kwargs["eta"] = eta
293
 
294
  # check if the scheduler accepts generator
295
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
 
296
  if accepts_generator:
297
  extra_step_kwargs["generator"] = generator
298
  return extra_step_kwargs
@@ -301,10 +264,12 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
301
  self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
302
  ):
303
  if strength < 0 or strength > 1:
304
- raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
305
 
306
  if (callback_steps is None) or (
307
- callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
 
308
  ):
309
  raise ValueError(
310
  f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
@@ -321,7 +286,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
321
  "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
322
  )
323
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
324
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
325
 
326
  if negative_prompt is not None and negative_prompt_embeds is not None:
327
  raise ValueError(
@@ -339,7 +305,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
339
 
340
  def get_timesteps(self, num_inference_steps, strength, device):
341
  # get the original timestep using init_timestep
342
- init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
 
343
 
344
  t_start = max(num_inference_steps - init_timestep, 0)
345
  timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
@@ -354,6 +321,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
354
 
355
  image = image.to(device=device, dtype=dtype)
356
 
 
 
357
  batch_size = batch_size * num_images_per_prompt
358
 
359
  if image.shape[1] == 4:
@@ -368,13 +337,13 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
368
 
369
  elif isinstance(generator, list):
370
  init_latents = [
371
- self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
372
  ]
373
  init_latents = torch.cat(init_latents, dim=0)
374
  else:
375
- init_latents = self.vae.encode(image).latent_dist.sample(generator)
376
 
377
- init_latents = self.vae.config.scaling_factor * init_latents
378
 
379
  if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
380
  # expand init_latents for batch_size
@@ -384,9 +353,11 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
384
  " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
385
  " your script to pass as many initial images as text prompts to suppress this warning."
386
  )
387
- deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
 
388
  additional_image_per_prompt = batch_size // init_latents.shape[0]
389
- init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
 
390
  elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
391
  raise ValueError(
392
  f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
@@ -395,7 +366,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
395
  init_latents = torch.cat([init_latents], dim=0)
396
 
397
  shape = init_latents.shape
398
- noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
 
399
 
400
  # get latents
401
  init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -403,6 +375,17 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
403
 
404
  return latents
405
 
406
  @torch.no_grad()
407
  def __call__(
408
  self,
@@ -421,10 +404,12 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
421
  negative_prompt: Optional[Union[str, List[str]]] = None,
422
  num_images_per_prompt: Optional[int] = 1,
423
  eta: Optional[float] = 0.0,
424
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
 
425
  prompt_embeds: Optional[torch.FloatTensor] = None,
426
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
427
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
 
428
  callback_steps: int = 1,
429
  ):
430
  r"""
@@ -482,7 +467,8 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
482
  "not-safe-for-work" (nsfw) content.
483
  """
484
  # 1. Check inputs. Raise error if not correct
485
- self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
 
486
 
487
  # 2. Define call parameters
488
  if prompt is not None and isinstance(prompt, str):
@@ -510,12 +496,14 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
510
  )
511
 
512
  # 4. Preprocess image
513
- image = preprocess(image)
514
 
515
  # 5. set timesteps
516
  self.scheduler.set_timesteps(num_inference_steps, device=device)
517
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
518
- latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
 
519
 
520
  # 6. Prepare latent variables
521
  latents = self.prepare_latents(
@@ -526,29 +514,36 @@ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
526
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
527
 
528
  # 8. Denoising loop
529
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
 
530
 
531
  for i, t in enumerate(timesteps):
532
  # expand the latents if we are doing classifier free guidance
533
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
534
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
535
- latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
 
 
 
536
 
537
  # predict the noise residual
538
- # The trailing four Nones are the ControlNet arguments; None is passed as a placeholder for now
539
- noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, None, None, None, None)
540
-
541
  noise_pred = noise_pred.permute(0, 3, 1, 2)
542
 
543
  # perform guidance
544
  if do_classifier_free_guidance:
545
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
546
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
547
 
548
  # compute the previous noisy sample x_t -> x_t-1
549
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
 
550
 
551
- image = self.decode_latents(latents)
 
552
  image = numpy_to_pil(image)
553
 
554
  return image
 
8
  import PIL
9
  import torch
10
  from diffusers.loaders import TextualInversionLoaderMixin
11
+ from diffusers.utils import PIL_INTERPOLATION, deprecate
12
+ from diffusers.utils import logging
+ from diffusers.utils.torch_utils import randn_tensor
 
13
  from PIL import Image
14
+
15
+ from .lyrasd_pipeline_base import LyraSDXLPipelineBase
16
+
17
 
18
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
19
 
 
27
  images = (images * 255).round().astype("uint8")
28
  if images.shape[-1] == 1:
29
  # special case for grayscale (single channel) images
30
+ pil_images = [Image.fromarray(image.squeeze(), mode="L")
31
+ for image in images]
32
  else:
33
  pil_images = [Image.fromarray(image) for image in images]
34
 
 
50
  w, h = image[0].size
51
  w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
52
 
53
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[
54
+ None, :] for i in image]
55
  image = np.concatenate(image, axis=0)
56
  image = np.array(image).astype(np.float32) / 255.0
57
  image = image.transpose(0, 3, 1, 2)
 
62
  return image
63
 
64
 
65
+ class LyraSDImg2ImgPipeline(LyraSDXLPipelineBase):
66
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.18215) -> None:
67
+ super().__init__(device, dtype, vae_scale_factor=vae_scale_factor,
68
+ vae_scaling_factor=vae_scaling_factor)
69
 
70
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
71
+
72
  def _encode_prompt(
73
  self,
74
  prompt,
 
124
  return_tensors="pt",
125
  )
126
  text_input_ids = text_inputs.input_ids
127
+ untruncated_ids = self.tokenizer(
128
+ prompt, padding="longest", return_tensors="pt").input_ids
129
 
130
  if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
131
  text_input_ids, untruncated_ids
 
156
  else:
157
  prompt_embeds_dtype = prompt_embeds.dtype
158
 
159
+ prompt_embeds = prompt_embeds.to(
160
+ dtype=prompt_embeds_dtype, device=device)
161
 
162
  bs_embed, seq_len, _ = prompt_embeds.shape
163
  # duplicate text embeddings for each generation per prompt, using mps friendly method
164
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
165
+ prompt_embeds = prompt_embeds.view(
166
+ bs_embed * num_images_per_prompt, seq_len, -1)
167
 
168
  # get unconditional embeddings for classifier free guidance
169
  if do_classifier_free_guidance and negative_prompt_embeds is None:
 
188
 
189
  # textual inversion: process multi-vector tokens if necessary
190
  if isinstance(self, TextualInversionLoaderMixin):
191
+ uncond_tokens = self.maybe_convert_prompt(
192
+ uncond_tokens, self.tokenizer)
193
 
194
  max_length = prompt_embeds.shape[1]
195
  uncond_input = self.tokenizer(
 
215
  # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
216
  seq_len = negative_prompt_embeds.shape[1]
217
 
218
+ negative_prompt_embeds = negative_prompt_embeds.to(
219
+ dtype=prompt_embeds_dtype, device=device)
220
 
221
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
222
+ 1, num_images_per_prompt, 1)
223
+ negative_prompt_embeds = negative_prompt_embeds.view(
224
+ batch_size * num_images_per_prompt, seq_len, -1)
225
 
226
  # For classifier free guidance, we need to do two forward passes.
227
  # Here we concatenate the unconditional and text embeddings into a single batch
 
247
  # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
248
  # and should be between [0, 1]
249
 
250
+ accepts_eta = "eta" in set(inspect.signature(
251
+ self.scheduler.step).parameters.keys())
252
  extra_step_kwargs = {}
253
  if accepts_eta:
254
  extra_step_kwargs["eta"] = eta
255
 
256
  # check if the scheduler accepts generator
257
+ accepts_generator = "generator" in set(
258
+ inspect.signature(self.scheduler.step).parameters.keys())
259
  if accepts_generator:
260
  extra_step_kwargs["generator"] = generator
261
  return extra_step_kwargs
 
264
  self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
265
  ):
266
  if strength < 0 or strength > 1:
267
+ raise ValueError(
268
+ f"The value of strength should in [0.0, 1.0] but is {strength}")
269
 
270
  if (callback_steps is None) or (
271
+ callback_steps is not None and (not isinstance(
272
+ callback_steps, int) or callback_steps <= 0)
273
  ):
274
  raise ValueError(
275
  f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
 
286
  "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
287
  )
288
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
289
+ raise ValueError(
290
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
291
 
292
  if negative_prompt is not None and negative_prompt_embeds is not None:
293
  raise ValueError(
 
305
 
306
  def get_timesteps(self, num_inference_steps, strength, device):
307
  # get the original timestep using init_timestep
308
+ init_timestep = min(
309
+ int(num_inference_steps * strength), num_inference_steps)
310
 
311
  t_start = max(num_inference_steps - init_timestep, 0)
312
  timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
 
321
 
322
  image = image.to(device=device, dtype=dtype)
323
 
324
325
+
326
  batch_size = batch_size * num_images_per_prompt
327
 
328
  if image.shape[1] == 4:
 
337
 
338
  elif isinstance(generator, list):
339
  init_latents = [
340
+ self.vae.encode(image[i: i + 1]).sample(generator[i]) for i in range(batch_size)
341
  ]
342
  init_latents = torch.cat(init_latents, dim=0)
343
  else:
344
+ init_latents = self.vae.encode(image).sample(generator)
345
 
346
+ init_latents = self.vae.scaling_factor * init_latents
347
 
348
  if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
349
  # expand init_latents for batch_size
 
353
  " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
354
  " your script to pass as many initial images as text prompts to suppress this warning."
355
  )
356
+ deprecate("len(prompt) != len(image)", "1.0.0",
357
+ deprecation_message, standard_warn=False)
358
  additional_image_per_prompt = batch_size // init_latents.shape[0]
359
+ init_latents = torch.cat(
360
+ [init_latents] * additional_image_per_prompt, dim=0)
361
  elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
362
  raise ValueError(
363
  f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
 
366
  init_latents = torch.cat([init_latents], dim=0)
367
 
368
  shape = init_latents.shape
369
+ noise = randn_tensor(shape, generator=generator,
370
+ device=device, dtype=dtype)
371
 
372
  # get latents
373
  init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
 
375
 
376
  return latents
377
 
378
+ def lyra_decode_latents(self, latents):
379
+ print("lyra_decode_latents")
380
+ latents = 1 / self.vae_scaling_factor * latents
381
+ image = self.vae.decode(latents)
382
+ image = image.permute(0, 2, 3, 1)
383
+ image = (image / 2 + 0.5).clamp(0, 1)
384
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
385
+ image = image.cpu().float().numpy()
386
+
387
+ return image
388
+
389
  @torch.no_grad()
390
  def __call__(
391
  self,
 
404
  negative_prompt: Optional[Union[str, List[str]]] = None,
405
  num_images_per_prompt: Optional[int] = 1,
406
  eta: Optional[float] = 0.0,
407
+ generator: Optional[Union[torch.Generator,
408
+ List[torch.Generator]]] = None,
409
  prompt_embeds: Optional[torch.FloatTensor] = None,
410
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
411
+ callback: Optional[Callable[[
412
+ int, int, torch.FloatTensor], None]] = None,
413
  callback_steps: int = 1,
414
  ):
415
  r"""
 
467
  "not-safe-for-work" (nsfw) content.
468
  """
469
  # 1. Check inputs. Raise error if not correct
470
+ self.check_inputs(prompt, strength, callback_steps,
471
+ negative_prompt, prompt_embeds, negative_prompt_embeds)
472
 
473
  # 2. Define call parameters
474
  if prompt is not None and isinstance(prompt, str):
 
496
  )
497
 
498
  # 4. Preprocess image
499
+ image = self.image_processor.preprocess(image)
500
 
501
  # 5. set timesteps
502
  self.scheduler.set_timesteps(num_inference_steps, device=device)
503
+ timesteps, num_inference_steps = self.get_timesteps(
504
+ num_inference_steps, strength, device)
505
+ latent_timestep = timesteps[:1].repeat(
506
+ batch_size * num_images_per_prompt)
507
 
508
  # 6. Prepare latent variables
509
  latents = self.prepare_latents(
 
514
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
515
 
516
  # 8. Denoising loop
517
+ num_warmup_steps = len(timesteps) - \
518
+ num_inference_steps * self.scheduler.order
519
 
520
  for i, t in enumerate(timesteps):
521
  # expand the latents if we are doing classifier free guidance
522
+ latent_model_input = torch.cat(
523
+ [latents] * 2) if do_classifier_free_guidance else latents
524
+ latent_model_input = self.scheduler.scale_model_input(
525
+ latent_model_input, t)
526
+ latent_model_input = latent_model_input.permute(
527
+ 0, 2, 3, 1).contiguous()
528
 
529
  # predict the noise residual
530
+ # 后边 None 是给到controlnet 的参数,暂时给到 None 当 placeholder
531
+ noise_pred = self.unet.forward(
532
+ latent_model_input, prompt_embeds, t)
533
  noise_pred = noise_pred.permute(0, 3, 1, 2)
534
 
535
  # perform guidance
536
  if do_classifier_free_guidance:
537
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
538
+ noise_pred = noise_pred_uncond + guidance_scale * \
539
+ (noise_pred_text - noise_pred_uncond)
540
 
541
  # compute the previous noisy sample x_t -> x_t-1
542
+ latents = self.scheduler.step(
543
+ noise_pred, t, latents, **extra_step_kwargs).prev_sample
544
 
545
+ # image = self.decode_latents(latents)
546
+ image = self.lyra_decode_latents(latents)
547
  image = numpy_to_pil(image)
548
 
549
  return image
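
For reference, a minimal sketch of the refactored img2img pipeline driven through the new parameterless constructor plus reload_pipe. The model path and input image are placeholders, and the leading call keywords mirror the diffusers-style img2img signature used above, so treat them as assumptions rather than a fixed API:

import torch
from PIL import Image
from lyrasd_model import LyraSDImg2ImgPipeline

model_path = "./models/lyrasd_rev_animated"   # placeholder: converted SD model dir

pipe = LyraSDImg2ImgPipeline()                # cuda / fp16 defaults from the shared base class
pipe.reload_pipe(model_path)

init_image = Image.open("input.png").convert("RGB").resize((512, 512))
generator = torch.Generator().manual_seed(123)

images = pipe(
    prompt="a cat, cartoon, best quality",
    image=init_image,
    strength=0.7,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=generator,
)
images[0].save("res_img2img.png")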
lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0689ed5d3b55f5033a8869d5f23ce900793aa0ab7fdc4a3e3c0a0f3a243c83da
3
- size 65441456
lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8e27e715fa3a17ce25bf23b772e0dd355d0780c1bd93cfeeb12ef45b0ba2444
3
- size 65389176
lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2eaa9067ad8eb1d20872afa71ed9497f62d930819704d15e5e8bf559623eca7
3
- size 65498752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8600f5414d283ebf64cb3974ef520858747cbb1a6d59dd46a3dcd9427758613b
3
+ size 97823240
lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d0c909ff2498934c6d1ed8f46af6cdc7812872177c0a4e7ca0ee99bf88fcb65
3
- size 65519232
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e5aefbb32667eeacb7fa60283656b4bb2ebb7dcd54276f9d101c856ed64e340
3
+ size 97823240
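
The CUDA 11 libraries are dropped and the CUDA 12 builds are replaced with noticeably larger binaries. If your environment does not load the library for you, a minimal sketch of picking the matching build by compute capability (assuming only the sm80 and sm86 binaries ship, as above):

import torch

# sm80 covers A100/A30-class GPUs, sm86 covers A10/RTX 30xx-class GPUs.
major, minor = torch.cuda.get_device_capability()
lib_path = f"./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm{major}{minor}.so"
torch.classes.load_library(lib_path)          # registers the torch.classes.lyrasd ops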
lyrasd_model/lyrasd_pipeline_base.py ADDED
@@ -0,0 +1,214 @@
1
+ import inspect
2
+ import os
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
+
6
+ import gc
7
+ import torch
8
+ import numpy as np
9
+ from glob import glob
10
+
11
+ from diffusers.loaders import TextualInversionLoaderMixin
12
+ from diffusers.image_processor import VaeImageProcessor
13
+ from diffusers.models import AutoencoderKL
14
+ from diffusers.schedulers import (DPMSolverMultistepScheduler,
15
+ EulerAncestralDiscreteScheduler,
16
+ EulerDiscreteScheduler,
17
+ KarrasDiffusionSchedulers)
18
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
19
+ from .lyrasd_vae_model import LyraSdVaeModel
20
+ from .module.lyrasd_ip_adapter import LyraIPAdapter
21
+ from .lora_util import add_text_lora_layer, add_xltext_lora_layer, add_lora_to_opt_model, load_state_dict
22
+ from safetensors.torch import load_file
23
+
24
+
25
+ class LyraSDXLPipelineBase(TextualInversionLoaderMixin):
26
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, num_channels_unet=4, num_channels_latents=4, vae_scale_factor=8, vae_scaling_factor=0.18215) -> None:
27
+ self.device = device
28
+ self.dtype = dtype
29
+
30
+ self.num_channels_unet = num_channels_unet
31
+ self.num_channels_latents = num_channels_latents
32
+ self.vae_scale_factor = vae_scale_factor
33
+ self.vae_scaling_factor = vae_scaling_factor
34
+
35
+ self.unet_cache = {}
36
+ self.unet_in_channels = 4
37
+
38
+ self.controlnet_cache = {}
39
+
40
+ self.loaded_lora = {}
41
+ self.loaded_lora_strength = {}
42
+
43
+ self.scheduler = None
44
+
45
+ self.init_pipe()
46
+
47
+ def init_pipe(self):
48
+ self.vae = LyraSdVaeModel(
49
+ scale_factor=self.vae_scale_factor, scaling_factor=self.vae_scaling_factor)
50
+
51
+ self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
52
+ 3,
53
+ "fp16",
54
+ self.num_channels_unet,
55
+ self.num_channels_latents
56
+ )
57
+
58
+ self.image_processor = VaeImageProcessor(
59
+ vae_scale_factor=self.vae_scale_factor)
60
+
61
+ self.mask_processor = VaeImageProcessor(
62
+ vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
63
+ )
64
+
65
+ self.feature_extractor = CLIPImageProcessor()
66
+
67
+ def reload_pipe(self, model_path):
68
+ self.tokenizer = CLIPTokenizer.from_pretrained(
69
+ model_path, subfolder="tokenizer")
70
+ self.text_encoder = CLIPTextModel.from_pretrained(
71
+ model_path, subfolder="text_encoder").to(self.dtype).to(self.device)
72
+
73
+ self.reload_unet_model_v2(model_path)
74
+ self.reload_vae_model_v2(model_path)
75
+
76
+ if not self.scheduler:
77
+ self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(
78
+ model_path, subfolder="scheduler")
79
+
80
+ @property
81
+ def _execution_device(self):
82
+ if not hasattr(self.unet, "_hf_hook"):
83
+ return self.device
84
+ for module in self.unet.modules():
85
+ if (
86
+ hasattr(module, "_hf_hook")
87
+ and hasattr(module._hf_hook, "execution_device")
88
+ and module._hf_hook.execution_device is not None
89
+ ):
90
+ return torch.device(module._hf_hook.execution_device)
91
+ return self.device
92
+
93
+ def reload_unet_model(self, unet_path, unet_file_format='fp32'):
94
+ if len(unet_path) > 0 and unet_path[-1] != "/":
95
+ unet_path = unet_path + "/"
96
+ self.unet.reload_unet_model(unet_path, unet_file_format)
97
+ self.load_embedding_weight(
98
+ self.add_embedding, f"{unet_path}add_embedding*", unet_file_format=unet_file_format)
99
+
100
+ def reload_vae_model(self, vae_path, vae_file_format='fp32'):
101
+ if len(vae_path) > 0 and vae_path[-1] != "/":
102
+ vae_path = vae_path + "/"
103
+ return self.vae.reload_vae_model(vae_path, vae_file_format)
104
+
105
+ def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
106
+ if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
107
+ lora_model_path = lora_model_path + "/"
108
+ lora = add_xltext_lora_layer(
109
+ self.text_encoder, self.text_encoder_2, lora_model_path, lora_strength, lora_file_format)
110
+
111
+ self.loaded_lora[lora_name] = lora
112
+ self.unet.load_lora(lora_model_path, lora_name,
113
+ lora_strength, lora_file_format)
114
+
115
+ def unload_lora(self, lora_name, clean_cache=False):
116
+ for layer_data in self.loaded_lora[lora_name]:
117
+ layer = layer_data['layer']
118
+ added_weight = layer_data['added_weight']
119
+ layer.weight.data -= added_weight
120
+ self.unet.unload_lora(lora_name, clean_cache)
121
+ del self.loaded_lora[lora_name]
122
+ gc.collect()
123
+ torch.cuda.empty_cache()
124
+
125
+ def load_lora_v2(self, lora_model_path, lora_name, lora_strength):
126
+ if lora_name in self.loaded_lora:
127
+ state_dict = self.loaded_lora[lora_name]
128
+ else:
129
+ state_dict = load_state_dict(lora_model_path)
130
+ self.loaded_lora[lora_name] = state_dict
131
+ self.loaded_lora_strength[lora_name] = lora_strength
132
+ add_lora_to_opt_model(state_dict, self.unet, self.text_encoder,
133
+ None, lora_strength)
134
+
135
+ def unload_lora_v2(self, lora_name, clean_cache=False):
136
+ state_dict = self.loaded_lora[lora_name]
137
+ lora_strength = self.loaded_lora_strength[lora_name]
138
+ add_lora_to_opt_model(state_dict, self.unet, self.text_encoder,
139
+ None, -1.0 * lora_strength)
140
+ del self.loaded_lora_strength[lora_name]
141
+
142
+ if clean_cache:
143
+ del self.loaded_lora[lora_name]
144
+ gc.collect()
145
+ torch.cuda.empty_cache()
146
+
147
+ def clean_lora_cache(self):
148
+ self.unet.clean_lora_cache()
149
+
150
+ def get_loaded_lora(self):
151
+ return self.unet.get_loaded_lora()
152
+
153
+ def load_ip_adapter(self, dir_ip_adapter, ip_plus, image_encoder_path, num_ip_tokens, ip_projection_dim, dir_face_in=None, num_fp_tokens=1, fp_projection_dim=None, sdxl=True):
154
+ self.ip_adapter_helper = LyraIPAdapter(self, sdxl, "cuda", dir_ip_adapter, ip_plus, image_encoder_path,
155
+ num_ip_tokens, ip_projection_dim, dir_face_in, num_fp_tokens, fp_projection_dim)
156
+
157
+ def reload_unet_model_v2(self, model_path):
158
+ checkpoint_file = os.path.join(
159
+ model_path, "unet/diffusion_pytorch_model.bin")
160
+ if not os.path.exists(checkpoint_file):
161
+ checkpoint_file = os.path.join(
162
+ model_path, "unet/diffusion_pytorch_model.safetensors")
163
+ if checkpoint_file in self.unet_cache:
164
+ state_dict = self.unet_cache[checkpoint_file]
165
+ else:
166
+ if "safetensors" in checkpoint_file:
167
+ state_dict = load_file(checkpoint_file)
168
+ else:
169
+ state_dict = torch.load(checkpoint_file, map_location="cpu")
170
+
171
+ for key in state_dict:
172
+ if len(state_dict[key].shape) == 4:
173
+ # converted_unet_checkpoint[key] = converted_unet_checkpoint[key].to(torch.float16).to("cuda").permute(0,2,3,1).contiguous().cpu()
174
+ state_dict[key] = state_dict[key].to(
175
+ torch.float16).permute(0, 2, 3, 1).contiguous()
176
+ state_dict[key] = state_dict[key].to(torch.float16)
177
+ self.unet_cache[checkpoint_file] = state_dict
178
+
179
+ self.unet.reload_unet_model_from_cache(state_dict, "cpu")
180
+
181
+ def reload_vae_model_v2(self, model_path):
182
+ self.vae.reload_vae_model_v2(model_path)
183
+
184
+ def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
185
+ if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
186
+ controlnet_path = controlnet_path + "/"
187
+ self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
188
+
189
+ def unload_controlnet_model(self, model_name):
190
+ self.unet.unload_controlnet_model(model_name, True)
191
+
192
+ def get_loaded_controlnet(self):
193
+ return self.unet.get_loaded_controlnet()
194
+
195
+ def load_controlnet_model_v2(self, model_name, controlnet_path):
196
+ checkpoint_file = os.path.join(controlnet_path, "diffusion_pytorch_model.bin")
197
+ if not os.path.exists(checkpoint_file):
198
+ checkpoint_file = os.path.join(controlnet_path, "diffusion_pytorch_model.safetensors")
199
+ if checkpoint_file in self.controlnet_cache:
200
+ state_dict = self.controlnet_cache[checkpoint_file]
201
+ else:
202
+ if "safetensors" in checkpoint_file:
203
+ state_dict = load_file(checkpoint_file)
204
+ else:
205
+ state_dict = torch.load(checkpoint_file, map_location="cpu")
206
+
207
+ for key in state_dict:
208
+ if len(state_dict[key].shape) == 4:
209
+ # converted_unet_checkpoint[key] = converted_unet_checkpoint[key].to(torch.float16).to("cuda").permute(0,2,3,1).contiguous().cpu()
210
+ state_dict[key] = state_dict[key].to(torch.float16).permute(0,2,3,1).contiguous()
211
+ state_dict[key] = state_dict[key].to(torch.float16)
212
+ self.controlnet_cache[checkpoint_file] = state_dict
213
+
214
+ self.unet.load_controlnet_model_from_state_dict(model_name, state_dict, "cpu")
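
The base class above centralizes UNet/VAE/LoRA/ControlNet weight handling for every pipeline in this commit. A minimal sketch of the v2 LoRA round trip it implements, with placeholder path and name; note that unload_lora_v2 undoes the merge by re-applying the cached state dict with the negated strength, so the strength recorded at load time is what gets reversed:

# `pipe` is any pipeline built on LyraSDXLPipelineBase (see the classes above).
pipe.load_lora_v2("./models/my_style_lora.safetensors", "my_style", 0.6)
# ... run generations with the LoRA deltas merged into the UNet and text encoder ...
pipe.unload_lora_v2("my_style", clean_cache=False)
# The parsed state dict stays cached under "my_style", so a later load_lora_v2
# call with the same name skips re-reading the file from disk.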
lyrasd_model/lyrasd_txt2img_inpaint_pipeline.py ADDED
@@ -0,0 +1,826 @@
1
+ import inspect
2
+ import os
3
+ import sys
4
+ import time
5
+ from typing import Any, Callable, Dict, List, Optional, Union
6
+ import GPUtil
7
+ import torch
8
+ from diffusers.loaders import TextualInversionLoaderMixin
9
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
10
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
11
+ from diffusers.utils import logging
+ from diffusers.utils.torch_utils import randn_tensor
12
+ from PIL import Image
13
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
14
+ import gc
15
+ import numpy as np
16
+
17
+ from .lyrasd_vae_model import LyraSdVaeModel
18
+
19
+ from diffusers.models.embeddings import ImageProjection
20
+ from transformers import (
21
+ CLIPImageProcessor,
22
+ CLIPVisionModelWithProjection,
23
+ )
24
+
25
+ from .lyrasd_pipeline_base import LyraSDXLPipelineBase
26
+
27
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+
30
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
31
+ """
32
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
33
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
34
+ """
35
+ std_text = noise_pred_text.std(
36
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
37
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
38
+ # rescale the results from guidance (fixes overexposure)
39
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
40
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
41
+ noise_cfg = guidance_rescale * noise_pred_rescaled + \
42
+ (1 - guidance_rescale) * noise_cfg
43
+ return noise_cfg
44
+
45
+
46
+ def numpy_to_pil(images):
47
+ """
48
+ Convert a numpy image or a batch of images to a PIL image.
49
+ """
50
+ if images.ndim == 3:
51
+ images = images[None, ...]
52
+ images = (images * 255).round().astype("uint8")
53
+ if images.shape[-1] == 1:
54
+ # special case for grayscale (single channel) images
55
+ pil_images = [Image.fromarray(image.squeeze(), mode="L")
56
+ for image in images]
57
+ else:
58
+ pil_images = [Image.fromarray(image) for image in images]
59
+
60
+ return pil_images
61
+
62
+
63
+ def retrieve_timesteps(
64
+ scheduler,
65
+ num_inference_steps: Optional[int] = None,
66
+ device: Optional[Union[str, torch.device]] = None,
67
+ timesteps: Optional[List[int]] = None,
68
+ **kwargs,
69
+ ):
70
+ """
71
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
72
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
73
+
74
+ Args:
75
+ scheduler (`SchedulerMixin`):
76
+ The scheduler to get timesteps from.
77
+ num_inference_steps (`int`):
78
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
79
+ `timesteps` must be `None`.
80
+ device (`str` or `torch.device`, *optional*):
81
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
82
+ timesteps (`List[int]`, *optional*):
83
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
84
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
85
+ must be `None`.
86
+
87
+ Returns:
88
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
89
+ second element is the number of inference steps.
90
+ """
91
+ if timesteps is not None:
92
+ print("set(inspect.signature(scheduler.set_timesteps).parameters.keys())", set(
93
+ inspect.signature(scheduler.set_timesteps).parameters.keys()))
94
+ accepts_timesteps = "timesteps" in set(
95
+ inspect.signature(scheduler.set_timesteps).parameters.keys())
96
+ if not accepts_timesteps:
97
+ raise ValueError(
98
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
99
+ f" timestep schedules. Please check whether you are using the correct scheduler."
100
+ )
101
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
102
+ timesteps = scheduler.timesteps
103
+ num_inference_steps = len(timesteps)
104
+ else:
105
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
106
+ timesteps = scheduler.timesteps
107
+ return timesteps, num_inference_steps
108
+
109
+
110
+ def retrieve_latents(
111
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
112
+ ):
113
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
114
+ return encoder_output.latent_dist.sample(generator)
115
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
116
+ return encoder_output.latent_dist.mode()
117
+ elif hasattr(encoder_output, "latents"):
118
+ return encoder_output.latents
119
+ else:
120
+ raise AttributeError(
121
+ "Could not access latents of provided encoder_output")
122
+
123
+
124
+ class LyraSdTxt2ImgInpaintPipeline(LyraSDXLPipelineBase):
125
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.18215, num_channels_unet=9, num_channels_latents=4) -> None:
126
+ super().__init__(device, dtype, num_channels_unet=num_channels_unet, num_channels_latents=num_channels_latents,
127
+ vae_scale_factor=vae_scale_factor, vae_scaling_factor=vae_scaling_factor)
128
+
129
+ def _encode_prompt(
130
+ self,
131
+ prompt,
132
+ device,
133
+ num_images_per_prompt,
134
+ do_classifier_free_guidance,
135
+ negative_prompt=None,
136
+ prompt_embeds: Optional[torch.FloatTensor] = None,
137
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
138
+ ):
139
+ r"""
140
+ Encodes the prompt into text encoder hidden states.
141
+
142
+ Args:
143
+ prompt (`str` or `List[str]`, *optional*):
144
+ prompt to be encoded
145
+ device: (`torch.device`):
146
+ torch device
147
+ num_images_per_prompt (`int`):
148
+ number of images that should be generated per prompt
149
+ do_classifier_free_guidance (`bool`):
150
+ whether to use classifier free guidance or not
151
+ negative_prompt (`str` or `List[str]`, *optional*):
152
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
153
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
154
+ less than `1`).
155
+ prompt_embeds (`torch.FloatTensor`, *optional*):
156
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
157
+ provided, text embeddings will be generated from `prompt` input argument.
158
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
159
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
160
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
161
+ argument.
162
+ """
163
+ if prompt is not None and isinstance(prompt, str):
164
+ batch_size = 1
165
+ elif prompt is not None and isinstance(prompt, list):
166
+ batch_size = len(prompt)
167
+ else:
168
+ batch_size = prompt_embeds.shape[0]
169
+
170
+ if prompt_embeds is None:
171
+ # textual inversion: procecss multi-vector tokens if necessary
172
+ if isinstance(self, TextualInversionLoaderMixin):
173
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
174
+
175
+ text_inputs = self.tokenizer(
176
+ prompt,
177
+ padding="max_length",
178
+ max_length=self.tokenizer.model_max_length,
179
+ truncation=True,
180
+ return_tensors="pt",
181
+ )
182
+ text_input_ids = text_inputs.input_ids
183
+ untruncated_ids = self.tokenizer(
184
+ prompt, padding="longest", return_tensors="pt").input_ids
185
+
186
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
187
+ text_input_ids, untruncated_ids
188
+ ):
189
+ removed_text = self.tokenizer.batch_decode(
190
+ untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
191
+ )
192
+ logger.warning(
193
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
194
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
195
+ )
196
+
197
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
198
+ attention_mask = text_inputs.attention_mask.to(device)
199
+ else:
200
+ attention_mask = None
201
+
202
+ prompt_embeds = self.text_encoder(
203
+ text_input_ids.to(device),
204
+ attention_mask=attention_mask,
205
+ )
206
+ prompt_embeds = prompt_embeds[0]
207
+
208
+ prompt_embeds = prompt_embeds.to(
209
+ dtype=self.text_encoder.dtype, device=device)
210
+
211
+ bs_embed, seq_len, _ = prompt_embeds.shape
212
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
213
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
214
+ prompt_embeds = prompt_embeds.view(
215
+ bs_embed * num_images_per_prompt, seq_len, -1)
216
+
217
+ # get unconditional embeddings for classifier free guidance
218
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
219
+ uncond_tokens: List[str]
220
+ if negative_prompt is None:
221
+ uncond_tokens = [""] * batch_size
222
+ elif type(prompt) is not type(negative_prompt):
223
+ raise TypeError(
224
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
225
+ f" {type(prompt)}."
226
+ )
227
+ elif isinstance(negative_prompt, str):
228
+ uncond_tokens = [negative_prompt]
229
+ elif batch_size != len(negative_prompt):
230
+ raise ValueError(
231
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
232
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
233
+ " the batch size of `prompt`."
234
+ )
235
+ else:
236
+ uncond_tokens = negative_prompt
237
+
238
+ # textual inversion: procecss multi-vector tokens if necessary
239
+ if isinstance(self, TextualInversionLoaderMixin):
240
+ uncond_tokens = self.maybe_convert_prompt(
241
+ uncond_tokens, self.tokenizer)
242
+
243
+ max_length = prompt_embeds.shape[1]
244
+ uncond_input = self.tokenizer(
245
+ uncond_tokens,
246
+ padding="max_length",
247
+ max_length=max_length,
248
+ truncation=True,
249
+ return_tensors="pt",
250
+ )
251
+
252
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
253
+ attention_mask = uncond_input.attention_mask.to(device)
254
+ else:
255
+ attention_mask = None
256
+
257
+ negative_prompt_embeds = self.text_encoder(
258
+ uncond_input.input_ids.to(device),
259
+ attention_mask=attention_mask,
260
+ )
261
+ negative_prompt_embeds = negative_prompt_embeds[0]
262
+
263
+ if do_classifier_free_guidance:
264
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
265
+ seq_len = negative_prompt_embeds.shape[1]
266
+
267
+ negative_prompt_embeds = negative_prompt_embeds.to(
268
+ dtype=self.text_encoder.dtype, device=device)
269
+
270
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
271
+ 1, num_images_per_prompt, 1)
272
+ negative_prompt_embeds = negative_prompt_embeds.view(
273
+ batch_size * num_images_per_prompt, seq_len, -1)
274
+
275
+ # For classifier free guidance, we need to do two forward passes.
276
+ # Here we concatenate the unconditional and text embeddings into a single batch
277
+ # to avoid doing two forward passes
278
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
279
+
280
+ return prompt_embeds
281
+
282
+ def load_ip_adapter(self,
283
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
284
+ subfolder: str,
285
+ weight_name: str,
286
+ **kwargs
287
+ ):
288
+ # if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
289
+ self.feature_extractor = CLIPImageProcessor()
290
+
291
+ # if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
292
+ self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
293
+ pretrained_model_name_or_path_or_dict,
294
+ subfolder=os.path.join(subfolder, "image_encoder"),
295
+ ).to(self.device, dtype=self.dtype)
296
+ # else:
297
+ # print("kio: already has image_encoder", hasattr(self, "image_encoder"), getattr(self, "feature_extractor", None) is None)
298
+
299
+ # kiotodo: init ImageProjection
300
+ model_path = os.path.join(
301
+ pretrained_model_name_or_path_or_dict, subfolder, weight_name)
302
+ state_dict = torch.load(model_path, map_location="cpu")
303
+
304
+ clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
305
+ cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4
306
+ self.encoder_hid_proj = ImageProjection(
307
+ cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4
308
+ )
309
+
310
+ image_proj_state_dict = {}
311
+ image_proj_state_dict.update(
312
+ {
313
+ "image_embeds.weight": state_dict["image_proj"]["proj.weight"],
314
+ "image_embeds.bias": state_dict["image_proj"]["proj.bias"],
315
+ "norm.weight": state_dict["image_proj"]["norm.weight"],
316
+ "norm.bias": state_dict["image_proj"]["norm.bias"],
317
+ }
318
+ )
319
+
320
+ self.encoder_hid_proj.load_state_dict(image_proj_state_dict)
321
+ self.encoder_hid_proj.to(dtype=self.dtype, device=self.device)
322
+
323
+ dir_ipadapter = os.path.join(
324
+ pretrained_model_name_or_path_or_dict, subfolder, '.'.join(weight_name.split(".")[:-1]))
325
+ self.unet.load_ip_adapter(dir_ipadapter, "", 1, "fp16")
326
+
327
+ def encode_image(self, image, device, num_images_per_prompt):
328
+ dtype = next(self.image_encoder.parameters()).dtype
329
+ if not isinstance(image, torch.Tensor):
330
+ image = self.feature_extractor(
331
+ image, return_tensors="pt").pixel_values
332
+
333
+ image = image.to(device=device, dtype=dtype)
334
+ image_embeds = self.image_encoder(image).image_embeds
335
+ image_embeds = image_embeds.repeat_interleave(
336
+ num_images_per_prompt, dim=0)
337
+
338
+ uncond_image_embeds = torch.zeros_like(image_embeds)
339
+ return image_embeds, uncond_image_embeds
340
+
341
+ def decode_latents(self, latents):
342
+ latents = 1 / self.vae.scaling_factor * latents
343
+ image = self.vae.decode(latents).sample
344
+ image = (image / 2 + 0.5).clamp(0, 1)
345
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
346
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
347
+ return image
348
+
349
+ def lyra_decode_latents(self, latents):
350
+ # print("lyra_decode_latents")
351
+ # np.save("", latents.)
352
+ # np.save(f"/workspace/vae_model/latent.npy", latents.detach().cpu().numpy())
353
+ latents = 1 / self.vae.scaling_factor * latents
354
+ # latents = latents.permute(0, 2, 3, 1).contiguous()
355
+ image = self.vae.decode(latents)
356
+ image = image.permute(0, 2, 3, 1)
357
+ # print(image)
358
+ # GPUtil.showUtilization(all=True)
359
+
360
+ image = (image / 2 + 0.5).clamp(0, 1)
361
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
362
+ image = image.cpu().float().numpy()
363
+
364
+ return image
365
+
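For clarity, a runnable sketch of what `lyra_decode_latents` does with layouts and value ranges; the fused VAE decode is mocked with a plain upsample so the snippet does not need the custom CUDA op.

```python
# Sketch: latents -> numpy image, with the custom VAE decode mocked out.
import torch

scaling_factor = 0.18215
latents = torch.randn(1, 4, 64, 64)

latents = latents / scaling_factor
# Stand-in for self.vae.decode(latents): any NCHW tensor of shape (1, 3, 512, 512).
decoded = torch.nn.functional.interpolate(latents[:, :3], scale_factor=8)

image = decoded.permute(0, 2, 3, 1)        # NCHW -> NHWC
image = (image / 2 + 0.5).clamp(0, 1)      # [-1, 1] -> [0, 1]
image = image.cpu().float().numpy()        # ready for numpy_to_pil()
```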
366
+ def get_timesteps(self, num_inference_steps, strength, device):
367
+ # get the original timestep using init_timestep
368
+ init_timestep = min(
369
+ int(num_inference_steps * strength), num_inference_steps)
370
+
371
+ t_start = max(num_inference_steps - init_timestep, 0)
372
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
373
+
374
+ return timesteps, num_inference_steps - t_start
375
+
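A small worked example of the truncation `get_timesteps` performs: with 30 steps and `strength=0.6`, only the last 18 steps of the schedule are run. The scheduler below is a stock diffusers Euler-Ancestral instance, used purely for illustration.

```python
# Sketch: how `strength` shortens an img2img/inpaint schedule.
from diffusers import EulerAncestralDiscreteScheduler

scheduler = EulerAncestralDiscreteScheduler()
num_inference_steps, strength = 30, 0.6

scheduler.set_timesteps(num_inference_steps)
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 18
t_start = max(num_inference_steps - init_timestep, 0)                          # 12
timesteps = scheduler.timesteps[t_start * scheduler.order:]

print(len(timesteps))  # 18 -> denoising starts 40% of the way into the schedule
```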
376
+ def check_inputs(
377
+ self,
378
+ prompt,
379
+ height,
380
+ width,
381
+ negative_prompt=None,
382
+ prompt_embeds=None,
383
+ negative_prompt_embeds=None,
384
+ ):
385
+ if height % 64 != 0 or width % 64 != 0: # the initial version only supports height and width that are multiples of 64
386
+ raise ValueError(
387
+ f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
388
+
389
+ if prompt is not None and prompt_embeds is not None:
390
+ raise ValueError(
391
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
392
+ " only forward one of the two."
393
+ )
394
+ elif prompt is None and prompt_embeds is None:
395
+ raise ValueError(
396
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
397
+ )
398
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
399
+ raise ValueError(
400
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
401
+
402
+ if negative_prompt is not None and negative_prompt_embeds is not None:
403
+ raise ValueError(
404
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
405
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
406
+ )
407
+
408
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
409
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
410
+ raise ValueError(
411
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
412
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
413
+ f" {negative_prompt_embeds.shape}."
414
+ )
415
+
416
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
417
+ if isinstance(generator, list):
418
+ image_latents = [
419
+ retrieve_latents(AutoencoderKLOutput(
420
+ latent_dist=self.vae.encode(image[i: i + 1])), generator=generator[i])
421
+ for i in range(image.shape[0])
422
+ ]
423
+ image_latents = torch.cat(image_latents, dim=0)
424
+ else:
425
+ image_latents = retrieve_latents(AutoencoderKLOutput(
426
+ latent_dist=self.vae.encode(image)), generator=generator)
427
+
428
+ image_latents = self.vae_scaling_factor * image_latents
429
+
430
+ return image_latents
431
+
432
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None,
433
+ image=None, timestep=None, is_strength_max=True, return_noise=False, return_image_latents=False):
434
+ shape = (batch_size, num_channels_latents, height //
435
+ self.vae_scale_factor, width // self.vae_scale_factor)
436
+ if isinstance(generator, list) and len(generator) != batch_size:
437
+ raise ValueError(
438
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
439
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
440
+ )
441
+
442
+ if (image is None or timestep is None) and not is_strength_max:
443
+ raise ValueError(
444
+ "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
445
+ "However, either the image or the noise timestep has not been provided."
446
+ )
447
+
448
+ if return_image_latents or (latents is None and not is_strength_max):
449
+ image = image.to(device=device, dtype=dtype)
450
+
451
+ if image.shape[1] == 4:
452
+ image_latents = image
453
+ else:
454
+ image_latents = self._encode_vae_image(
455
+ image=image, generator=generator)
456
+ image_latents = image_latents.repeat(
457
+ batch_size // image_latents.shape[0], 1, 1, 1)
458
+
459
+ if latents is None:
460
+ noise = randn_tensor(shape, generator=generator,
461
+ device=device, dtype=dtype)
462
+ # if strength is 1. then initialise the latents to noise, else initial to image + noise
463
+ latents = noise if is_strength_max else self.scheduler.add_noise(
464
+ image_latents, noise, timestep)
465
+ # if pure noise then scale the initial latents by the Scheduler's init sigma
466
+ latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
467
+ else:
468
+ noise = latents.to(device)
469
+ latents = noise * self.scheduler.init_noise_sigma
470
+
471
+ outputs = (latents,)
472
+
473
+ if return_noise:
474
+ outputs += (noise,)
475
+
476
+ if return_image_latents:
477
+ outputs += (image_latents,)
478
+
479
+ return outputs
480
+
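A condensed sketch of the two initialisation branches in `prepare_latents`: pure noise when `strength == 1.0`, otherwise the encoded image latents noised to the start timestep chosen by `get_timesteps`. Shapes and the scheduler are placeholders.

```python
# Sketch: inpainting latent initialisation as a function of `strength`.
import torch
from diffusers import EulerAncestralDiscreteScheduler

scheduler = EulerAncestralDiscreteScheduler()
scheduler.set_timesteps(30)

image_latents = torch.randn(1, 4, 64, 64)     # stand-in for VAE-encoded init image
noise = torch.randn_like(image_latents)
is_strength_max = False                        # e.g. strength = 0.6
latent_timestep = scheduler.timesteps[12:13]   # start index picked by get_timesteps

if is_strength_max:
    latents = noise * scheduler.init_noise_sigma                          # pure noise
else:
    latents = scheduler.add_noise(image_latents, noise, latent_timestep)  # image + noise
```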
481
+ def prepare_mask_latents(
482
+ self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
483
+ ):
484
+ # resize the mask to latents shape as we concatenate the mask to the latents
485
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
486
+ # and half precision
487
+ mask = torch.nn.functional.interpolate(
488
+ mask, size=(height // self.vae_scale_factor,
489
+ width // self.vae_scale_factor)
490
+ )
491
+ mask = mask.to(device=device, dtype=dtype)
492
+
493
+ masked_image = masked_image.to(device=device, dtype=dtype)
494
+
495
+ if masked_image.shape[1] == 4:
496
+ masked_image_latents = masked_image
497
+ else:
498
+ masked_image_latents = self._encode_vae_image(
499
+ masked_image, generator=generator)
500
+
501
+ # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
502
+ if mask.shape[0] < batch_size:
503
+ if not batch_size % mask.shape[0] == 0:
504
+ raise ValueError(
505
+ "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
506
+ f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
507
+ " of masks that you pass is divisible by the total requested batch size."
508
+ )
509
+ mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
510
+ if masked_image_latents.shape[0] < batch_size:
511
+ if not batch_size % masked_image_latents.shape[0] == 0:
512
+ raise ValueError(
513
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
514
+ f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
515
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
516
+ )
517
+ masked_image_latents = masked_image_latents.repeat(
518
+ batch_size // masked_image_latents.shape[0], 1, 1, 1)
519
+
520
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
521
+ masked_image_latents = (
522
+ torch.cat([masked_image_latents] *
523
+ 2) if do_classifier_free_guidance else masked_image_latents
524
+ )
525
+
526
+ # aligning device to prevent device errors when concating it with the latent model input
527
+ masked_image_latents = masked_image_latents.to(
528
+ device=device, dtype=dtype)
529
+ return mask, masked_image_latents
530
+
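A short numeric sketch of the mask handling above: the pixel-space mask is resized to the latent grid, and the masked image keeps only the pixels that should be preserved. It assumes 512×512 inputs and a VAE scale factor of 8.

```python
# Sketch: mask downsampling and masked-image construction for inpainting.
import torch

vae_scale_factor = 8
init_image = torch.rand(1, 3, 512, 512) * 2 - 1     # preprocessed to [-1, 1]
mask = (torch.rand(1, 1, 512, 512) > 0.5).float()   # 1 = repaint, 0 = keep

latent_mask = torch.nn.functional.interpolate(
    mask, size=(512 // vae_scale_factor, 512 // vae_scale_factor)
)
masked_image = init_image * (mask < 0.5)             # zero out the region to repaint

print(latent_mask.shape, masked_image.shape)         # (1, 1, 64, 64) (1, 3, 512, 512)
```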
531
+ def prepare_extra_step_kwargs(self, generator, eta):
532
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
533
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
534
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
535
+ # and should be between [0, 1]
536
+
537
+ accepts_eta = "eta" in set(inspect.signature(
538
+ self.scheduler.step).parameters.keys())
539
+ extra_step_kwargs = {}
540
+ if accepts_eta:
541
+ extra_step_kwargs["eta"] = eta
542
+
543
+ # check if the scheduler accepts generator
544
+ accepts_generator = "generator" in set(
545
+ inspect.signature(self.scheduler.step).parameters.keys())
546
+ if accepts_generator:
547
+ extra_step_kwargs["generator"] = generator
548
+ return extra_step_kwargs
549
+
550
+ @torch.no_grad()
551
+ def __call__(
552
+ self,
553
+ prompt: Union[str, List[str]] = None,
554
+ image: PipelineImageInput = None,
555
+ mask_image: PipelineImageInput = None,
556
+ masked_image_latents: torch.FloatTensor = None,
557
+ height: Optional[int] = None,
558
+ width: Optional[int] = None,
559
+ strength: float = 1.0,
560
+ num_inference_steps: int = 50,
561
+ guidance_scale: float = 7.5,
562
+ negative_prompt: Optional[Union[str, List[str]]] = None,
563
+ num_images_per_prompt: Optional[int] = 1,
564
+ eta: float = 0.0,
565
+ generator: Optional[Union[torch.Generator,
566
+ List[torch.Generator]]] = None,
567
+ latents: Optional[torch.FloatTensor] = None,
568
+ prompt_embeds: Optional[torch.FloatTensor] = None,
569
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
570
+ ip_adapter_image: Optional[PipelineImageInput] = None,
571
+ param_scale_dict: Optional[dict] = {}
572
+ ):
573
+ r"""
574
+ Function invoked when calling the pipeline for generation.
575
+
576
+ Args:
577
+ prompt (`str` or `List[str]`, *optional*):
578
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
579
+ instead.
580
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
581
+ The height in pixels of the generated image.
582
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
583
+ The width in pixels of the generated image.
584
+ num_inference_steps (`int`, *optional*, defaults to 50):
585
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
586
+ expense of slower inference.
587
+ guidance_scale (`float`, *optional*, defaults to 7.5):
588
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
589
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
590
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
591
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
592
+ usually at the expense of lower image quality.
593
+ negative_prompt (`str` or `List[str]`, *optional*):
594
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
595
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
596
+ less than `1`).
597
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
598
+ The number of images to generate per prompt.
599
+ eta (`float`, *optional*, defaults to 0.0):
600
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
601
+ [`schedulers.DDIMScheduler`], will be ignored for others.
602
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
603
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
604
+ to make generation deterministic.
605
+ latents (`torch.FloatTensor`, *optional*):
606
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
607
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
608
+ tensor will be generated by sampling using the supplied random `generator`.
609
+ prompt_embeds (`torch.FloatTensor`, *optional*):
610
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
611
+ provided, text embeddings will be generated from `prompt` input argument.
612
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
613
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
614
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
615
+ argument.
616
+
617
+ """
618
+ # 0. Default height and width to unet
619
+ height = height or self.unet_config_sample_size * self.vae_scale_factor
620
+ width = width or self.unet_config_sample_size * self.vae_scale_factor
621
+ # self.unet_config.sample_size = 64
622
+ # height = 512
623
+ # width = 512
624
+
625
+ # 1. Check inputs. Raise error if not correct
626
+ # self.check_inputs(
627
+ # prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
628
+ # )
629
+
630
+ # 2. Define call parameters
631
+ if prompt is not None and isinstance(prompt, str):
632
+ batch_size = 1
633
+ elif prompt is not None and isinstance(prompt, list):
634
+ batch_size = len(prompt)
635
+ else:
636
+ batch_size = prompt_embeds.shape[0]
637
+
638
+ device = self.device
639
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
640
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
641
+ # corresponds to doing no classifier free guidance.
642
+ do_classifier_free_guidance = guidance_scale > 1.0
643
+
644
+ # 3. Encode input prompt
645
+ prompt_embeds = self._encode_prompt(
646
+ prompt,
647
+ device,
648
+ num_images_per_prompt,
649
+ do_classifier_free_guidance,
650
+ negative_prompt,
651
+ prompt_embeds=prompt_embeds,
652
+ negative_prompt_embeds=negative_prompt_embeds,
653
+ )
654
+
655
+ # 3.5 Encode ipadapter_image
656
+ if ip_adapter_image is not None:
657
+ image_embeds, negative_image_embeds = self.encode_image(
658
+ ip_adapter_image, device, num_images_per_prompt)
659
+ if do_classifier_free_guidance:
660
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
661
+ image_embeds = self.encoder_hid_proj(image_embeds).to(self.dtype)
662
+
663
+ # 4. Prepare timesteps
664
+ # self.scheduler.set_timesteps(num_inference_steps, device=device)
665
+ # timesteps = self.scheduler.timesteps
666
+
667
+ # 4.5 Prepare mask and image
668
+ timesteps = None
669
+ timesteps, num_inference_steps = retrieve_timesteps(
670
+ self.scheduler, num_inference_steps, device, timesteps)
671
+ timesteps, num_inference_steps = self.get_timesteps(
672
+ num_inference_steps=num_inference_steps, strength=strength, device=device
673
+ )
674
+ # check that number of inference steps is not < 1 - as this doesn't make sense
675
+ if num_inference_steps < 1:
676
+ raise ValueError(
677
+ f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
678
+ f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
679
+ )
680
+ # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
681
+ latent_timestep = timesteps[:1].repeat(
682
+ batch_size * num_images_per_prompt)
683
+ # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
684
+ is_strength_max = strength == 1.0
685
+
686
+ # 5. Preprocess mask and image
687
+
688
+ init_image = self.image_processor.preprocess(
689
+ image, height=height, width=width)
690
+ init_image = init_image.to(dtype=torch.float32)
691
+
692
+ # 5. Prepare latent variables
693
+ return_image_latents = self.num_channels_unet == 4
694
+ latents_outputs = self.prepare_latents(
695
+ batch_size * num_images_per_prompt,
696
+ self.num_channels_latents,
697
+ height,
698
+ width,
699
+ prompt_embeds.dtype,
700
+ device,
701
+ generator,
702
+ latents,
703
+ image=init_image,
704
+ timestep=latent_timestep,
705
+ is_strength_max=is_strength_max,
706
+ return_noise=True,
707
+ return_image_latents=return_image_latents
708
+ )
709
+
710
+ if return_image_latents:
711
+ latents, noise, image_latents = latents_outputs
712
+ else:
713
+ latents, noise = latents_outputs
714
+
715
+ # 5.5 Prepare mask latent variables
716
+ mask_condition = self.mask_processor.preprocess(
717
+ mask_image, height=height, width=width)
718
+ if masked_image_latents is None:
719
+ masked_image = init_image * (mask_condition < 0.5)
720
+ else:
721
+ masked_image = masked_image_latents
722
+
723
+ mask, masked_image_latents = self.prepare_mask_latents(
724
+ mask_condition,
725
+ masked_image,
726
+ batch_size * num_images_per_prompt,
727
+ height,
728
+ width,
729
+ prompt_embeds.dtype,
730
+ device,
731
+ generator,
732
+ do_classifier_free_guidance,
733
+ )
734
+
735
+ # Check that sizes of mask, masked image and latents match
736
+ if self.num_channels_unet == 9:
737
+ # default case for runwayml/stable-diffusion-inpainting
738
+ num_channels_mask = mask.shape[1]
739
+ num_channels_masked_image = masked_image_latents.shape[1]
740
+ if self.num_channels_latents + num_channels_mask + num_channels_masked_image != self.num_channels_unet:
741
+ raise ValueError(
742
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
743
+ f" {self.num_channels_latents} but received `num_channels_latents`: {self.num_channels_latents} +"
744
+ f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
745
+ f" = {self.num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
746
+ " `pipeline.unet` or your `mask_image` or `image` input."
747
+ )
748
+ elif self.num_channels_unet != 4:
749
+ raise ValueError(
750
+ f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
751
+ )
752
+
753
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
754
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
755
+
756
+ # 7. Denoising loop
757
+ num_warmup_steps = len(timesteps) - \
758
+ num_inference_steps * self.scheduler.order
759
+
760
+ for i, t in enumerate(timesteps):
761
+ # expand the latents if we are doing classifier free guidance
762
+ latent_model_input = torch.cat(
763
+ [latents] * 2) if do_classifier_free_guidance else latents
764
+ latent_model_input = self.scheduler.scale_model_input(
765
+ latent_model_input, t)
766
+
767
+ if self.num_channels_unet == 9:
768
+ latent_model_input = torch.cat(
769
+ [latent_model_input, mask, masked_image_latents], dim=1)
770
+
771
+ latent_model_input = latent_model_input.permute(
772
+ 0, 2, 3, 1).contiguous()
773
+
774
+ # latent_model_input = latent_model_input[:,:4,:,:].
775
+
776
+ # The trailing None arguments are ControlNet inputs; they are passed as placeholders for now
777
+ # todo: forward ip image_embeds
778
+ # break
779
+ if ip_adapter_image is not None:
780
+ noise_pred = self.unet.forward(
781
+ latent_model_input, prompt_embeds, t, None, None, None, None, {"ip_hidden_states": image_embeds}, param_scale_dict)
782
+ else:
783
+ noise_pred = self.unet.forward(
784
+ latent_model_input, prompt_embeds, t)
785
+
786
+ noise_pred = noise_pred.permute(0, 3, 1, 2).contiguous()
787
+ # saver.save_v(f"latent_model_input_{i}", latent_model_input)
788
+ # saver.save_v(f"noise_pred_{i}", noise_pred)
789
+ # saver.save_v(f"prompt_embeds_{i}", prompt_embeds)
790
+
791
+ # perform guidance
792
+ if do_classifier_free_guidance:
793
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
794
+ noise_pred = noise_pred_uncond + guidance_scale * \
795
+ (noise_pred_text - noise_pred_uncond)
796
+
797
+ # compute the previous noisy sample x_t -> x_t-1
798
+ latents = self.scheduler.step(
799
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
800
+ if self.num_channels_unet == 4:
801
+ init_latents_proper = image_latents
802
+ if do_classifier_free_guidance:
803
+ init_mask, _ = mask.chunk(2)
804
+ else:
805
+ init_mask = mask
806
+
807
+ if i < len(timesteps) - 1:
808
+ noise_timestep = timesteps[i + 1]
809
+ init_latents_proper = self.scheduler.add_noise(
810
+ init_latents_proper, noise, torch.tensor(
811
+ [noise_timestep])
812
+ )
813
+
814
+ latents = (1 - init_mask) * init_latents_proper + \
815
+ init_mask * latents
816
+
817
+ # if do_classifier_free_guidance and guidance_rescale > 0.0:
818
+ # # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
819
+ # noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
820
+ # # compute the previous noisy sample x_t -> x_t-1
821
+ # latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
822
+ # image = self.decode_latents(latents)
823
+ image = self.lyra_decode_latents(latents)
824
+ image = numpy_to_pil(image)
825
+
826
+ return image
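For orientation, a hedged end-to-end sketch of driving this inpainting pipeline in the style of the repository demos. The exported class name `LyraSdTxt2ImgInpaintPipeline`, the `reload_pipe` loader and all paths are assumptions, not taken from this diff; only the `__call__` arguments mirror the signature above.

```python
# Illustrative usage sketch (class name, loader and paths are assumed).
import torch
from PIL import Image
from lyrasd_model import LyraSdTxt2ImgInpaintPipeline  # assumed export name

pipe = LyraSdTxt2ImgInpaintPipeline()
pipe.reload_pipe("./models/lyrasd_rev_animated")        # assumed model loader

init_image = Image.open("./images/input.png").convert("RGB")
mask_image = Image.open("./images/mask.png").convert("L")

images = pipe(
    prompt="a wooden bench in a sunny park",
    image=init_image,
    mask_image=mask_image,
    height=512,
    width=512,
    strength=0.75,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator().manual_seed(123),
)
images[0].save("inpaint_result.png")
```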
lyrasd_model/lyrasd_txt2img_pipeline.py CHANGED
@@ -2,7 +2,7 @@ import inspect
2
  import os
3
  import time
4
  from typing import Any, Callable, Dict, List, Optional, Union
5
-
6
  import torch
7
  from diffusers.loaders import TextualInversionLoaderMixin
8
  from diffusers.models import AutoencoderKL
@@ -10,17 +10,43 @@ from diffusers.schedulers import (DPMSolverMultistepScheduler,
10
  EulerAncestralDiscreteScheduler,
11
  EulerDiscreteScheduler,
12
  KarrasDiffusionSchedulers)
13
- from diffusers.utils import logging, randn_tensor
14
  from PIL import Image
15
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
16
  import gc
17
  import numpy as np
18
 
19
- from .lora_util import add_text_lora_layer
 
 
 
20
 
21
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
 
23
 
 
 
 
24
  def numpy_to_pil(images):
25
  """
26
  Convert a numpy image or a batch of images to a PIL image.
@@ -30,68 +56,18 @@ def numpy_to_pil(images):
30
  images = (images * 255).round().astype("uint8")
31
  if images.shape[-1] == 1:
32
  # special case for grayscale (single channel) images
33
- pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
 
34
  else:
35
  pil_images = [Image.fromarray(image) for image in images]
36
 
37
  return pil_images
38
 
39
 
40
- class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
41
- def __init__(self, model_path, lib_so_path, model_dtype="fp32", device=torch.device("cuda"), dtype=torch.float16) -> None:
42
- self.device = device
43
- self.dtype = dtype
44
-
45
- torch.classes.load_library(lib_so_path)
46
-
47
- self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
48
- self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
49
- self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
50
- unet_path = os.path.join(model_path, "unet_bins/")
51
-
52
- self.unet_in_channels = 4
53
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
54
- self.vae.enable_tiling()
55
- self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
56
- 3, # max num of controlnets
57
- "fp16" # inference dtype (can only use fp16 for now)
58
- )
59
-
60
- unet_path = os.path.join(model_path, "unet_bins/")
61
 
62
- self.reload_unet_model(unet_path, model_dtype)
63
-
64
- self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
65
-
66
- self.loaded_lora = {}
67
-
68
- def reload_unet_model(self, unet_path, unet_file_format='fp32'):
69
- if len(unet_path) > 0 and unet_path[-1] != "/":
70
- unet_path = unet_path + "/"
71
- return self.unet.reload_unet_model(unet_path, unet_file_format)
72
-
73
- def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
74
- if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
75
- lora_model_path = lora_model_path + "/"
76
- lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
77
- self.loaded_lora[lora_name] = lora
78
- self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
79
-
80
- def unload_lora(self, lora_name, clean_cache=False):
81
- for layer_data in self.loaded_lora[lora_name]:
82
- layer = layer_data['layer']
83
- added_weight = layer_data['added_weight']
84
- layer.weight.data -= added_weight
85
- self.unet.unload_lora(lora_name, clean_cache)
86
- del self.loaded_lora[lora_name]
87
- gc.collect()
88
- torch.cuda.empty_cache()
89
-
90
- def clean_lora_cache(self):
91
- self.unet.clean_lora_cache()
92
-
93
- def get_loaded_lora(self):
94
- return self.unet.get_loaded_lora()
95
 
96
  def _encode_prompt(
97
  self,
@@ -147,7 +123,8 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
147
  return_tensors="pt",
148
  )
149
  text_input_ids = text_inputs.input_ids
150
- untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
 
151
 
152
  if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
153
  text_input_ids, untruncated_ids
@@ -171,12 +148,14 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
171
  )
172
  prompt_embeds = prompt_embeds[0]
173
 
174
- prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
 
175
 
176
  bs_embed, seq_len, _ = prompt_embeds.shape
177
  # duplicate text embeddings for each generation per prompt, using mps friendly method
178
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
179
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
 
180
 
181
  # get unconditional embeddings for classifier free guidance
182
  if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -201,7 +180,8 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
201
 
202
  # textual inversion: process multi-vector tokens if necessary
203
  if isinstance(self, TextualInversionLoaderMixin):
204
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
 
205
 
206
  max_length = prompt_embeds.shape[1]
207
  uncond_input = self.tokenizer(
@@ -227,10 +207,13 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
227
  # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
228
  seq_len = negative_prompt_embeds.shape[1]
229
 
230
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
 
231
 
232
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
233
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
 
234
 
235
  # For classifier free guidance, we need to do two forward passes.
236
  # Here we concatenate the unconditional and text embeddings into a single batch
@@ -239,14 +222,83 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
239
 
240
  return prompt_embeds
241
 
 
242
  def decode_latents(self, latents):
243
- latents = 1 / self.vae.config.scaling_factor * latents
244
  image = self.vae.decode(latents).sample
245
  image = (image / 2 + 0.5).clamp(0, 1)
246
  # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
247
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
248
  return image
249
 
250
  def check_inputs(
251
  self,
252
  prompt,
@@ -257,7 +309,8 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
257
  negative_prompt_embeds=None,
258
  ):
259
  if height % 64 != 0 or width % 64 != 0: # 初版暂时只支持 64 的倍数的 height 和 width
260
- raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
 
261
 
262
  if prompt is not None and prompt_embeds is not None:
263
  raise ValueError(
@@ -269,7 +322,8 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
269
  "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
270
  )
271
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
272
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
273
 
274
  if negative_prompt is not None and negative_prompt_embeds is not None:
275
  raise ValueError(
@@ -286,7 +340,8 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
286
  )
287
 
288
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
289
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
 
290
  if isinstance(generator, list) and len(generator) != batch_size:
291
  raise ValueError(
292
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -294,7 +349,8 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
294
  )
295
 
296
  if latents is None:
297
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
 
298
  else:
299
  latents = latents.to(device)
300
 
@@ -308,13 +364,15 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
308
  # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
309
  # and should be between [0, 1]
310
 
311
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
 
312
  extra_step_kwargs = {}
313
  if accepts_eta:
314
  extra_step_kwargs["eta"] = eta
315
 
316
  # check if the scheduler accepts generator
317
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
 
318
  if accepts_generator:
319
  extra_step_kwargs["generator"] = generator
320
  return extra_step_kwargs
@@ -330,10 +388,13 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
330
  negative_prompt: Optional[Union[str, List[str]]] = None,
331
  num_images_per_prompt: Optional[int] = 1,
332
  eta: float = 0.0,
333
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
 
334
  latents: Optional[torch.FloatTensor] = None,
335
  prompt_embeds: Optional[torch.FloatTensor] = None,
336
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
 
 
337
  ):
338
  r"""
339
  Function invoked when calling the pipeline for generation.
@@ -410,6 +471,14 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
410
  negative_prompt_embeds=negative_prompt_embeds,
411
  )
412
 
413
  # 4. Prepare timesteps
414
  self.scheduler.set_timesteps(num_inference_steps, device=device)
415
  timesteps = self.scheduler.timesteps
@@ -431,28 +500,46 @@ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
431
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
432
 
433
  # 7. Denoising loop
434
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
 
435
 
436
  for i, t in enumerate(timesteps):
437
  # expand the latents if we are doing classifier free guidance
438
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
439
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
440
- latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
441
-
442
- # The trailing 4 None arguments are ControlNet inputs; they are passed as placeholders for now
443
- noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, None, None, None, None)
444
 
445
  noise_pred = noise_pred.permute(0, 3, 1, 2)
446
- # perform guidance
447
 
 
 
 
448
  if do_classifier_free_guidance:
449
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
450
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
451
 
 
 
 
452
  # compute the previous noisy sample x_t -> x_t-1
453
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
454
-
455
- image = self.decode_latents(latents)
 
456
  image = numpy_to_pil(image)
457
 
458
  return image
 
2
  import os
3
  import time
4
  from typing import Any, Callable, Dict, List, Optional, Union
5
+ import GPUtil
6
  import torch
7
  from diffusers.loaders import TextualInversionLoaderMixin
8
  from diffusers.models import AutoencoderKL
 
10
  EulerAncestralDiscreteScheduler,
11
  EulerDiscreteScheduler,
12
  KarrasDiffusionSchedulers)
13
+ from diffusers.utils.torch_utils import logging, randn_tensor
14
  from PIL import Image
15
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
16
  import gc
17
  import numpy as np
18
 
19
+ from .lyrasd_vae_model import LyraSdVaeModel
20
+
21
+ from diffusers.image_processor import PipelineImageInput
22
+ from diffusers.models.embeddings import ImageProjection
23
+ from transformers import (
24
+ CLIPImageProcessor,
25
+ CLIPVisionModelWithProjection,
26
+ )
27
+ from .lora_util import add_text_lora_layer, add_xltext_lora_layer, add_lora_to_opt_model, load_state_dict
28
+ from safetensors.torch import load_file
29
+ from .lyrasd_pipeline_base import LyraSDXLPipelineBase
30
 
31
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
32
 
33
 
34
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
35
+ """
36
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
37
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
38
+ """
39
+ std_text = noise_pred_text.std(
40
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
41
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
42
+ # rescale the results from guidance (fixes overexposure)
43
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
44
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
45
+ noise_cfg = guidance_rescale * noise_pred_rescaled + \
46
+ (1 - guidance_rescale) * noise_cfg
47
+ return noise_cfg
48
+
49
+
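`rescale_noise_cfg` implements the guidance-rescale fix from arXiv:2305.08891; a brief sketch of where it would slot in after classifier-free guidance (it assumes the function defined just above is in scope, and the `guidance_rescale` value is illustrative).

```python
# Sketch: applying rescale_noise_cfg right after classifier-free guidance.
import torch

noise_pred_uncond = torch.randn(1, 4, 64, 64)
noise_pred_text = torch.randn(1, 4, 64, 64)
guidance_scale, guidance_rescale = 7.5, 0.7

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
```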
50
  def numpy_to_pil(images):
51
  """
52
  Convert a numpy image or a batch of images to a PIL image.
 
56
  images = (images * 255).round().astype("uint8")
57
  if images.shape[-1] == 1:
58
  # special case for grayscale (single channel) images
59
+ pil_images = [Image.fromarray(image.squeeze(), mode="L")
60
+ for image in images]
61
  else:
62
  pil_images = [Image.fromarray(image) for image in images]
63
 
64
  return pil_images
65
 
66
 
67
+ class LyraSdTxt2ImgPipeline(LyraSDXLPipelineBase):
68
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.18215) -> None:
69
+ super().__init__(device, dtype, vae_scale_factor=vae_scale_factor, vae_scaling_factor=vae_scaling_factor)
70
71
 
72
  def _encode_prompt(
73
  self,
 
123
  return_tensors="pt",
124
  )
125
  text_input_ids = text_inputs.input_ids
126
+ untruncated_ids = self.tokenizer(
127
+ prompt, padding="longest", return_tensors="pt").input_ids
128
 
129
  if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
130
  text_input_ids, untruncated_ids
 
148
  )
149
  prompt_embeds = prompt_embeds[0]
150
 
151
+ prompt_embeds = prompt_embeds.to(
152
+ dtype=self.text_encoder.dtype, device=device)
153
 
154
  bs_embed, seq_len, _ = prompt_embeds.shape
155
  # duplicate text embeddings for each generation per prompt, using mps friendly method
156
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
157
+ prompt_embeds = prompt_embeds.view(
158
+ bs_embed * num_images_per_prompt, seq_len, -1)
159
 
160
  # get unconditional embeddings for classifier free guidance
161
  if do_classifier_free_guidance and negative_prompt_embeds is None:
 
180
 
181
  # textual inversion: process multi-vector tokens if necessary
182
  if isinstance(self, TextualInversionLoaderMixin):
183
+ uncond_tokens = self.maybe_convert_prompt(
184
+ uncond_tokens, self.tokenizer)
185
 
186
  max_length = prompt_embeds.shape[1]
187
  uncond_input = self.tokenizer(
 
207
  # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
208
  seq_len = negative_prompt_embeds.shape[1]
209
 
210
+ negative_prompt_embeds = negative_prompt_embeds.to(
211
+ dtype=self.text_encoder.dtype, device=device)
212
 
213
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
214
+ 1, num_images_per_prompt, 1)
215
+ negative_prompt_embeds = negative_prompt_embeds.view(
216
+ batch_size * num_images_per_prompt, seq_len, -1)
217
 
218
  # For classifier free guidance, we need to do two forward passes.
219
  # Here we concatenate the unconditional and text embeddings into a single batch
 
222
 
223
  return prompt_embeds
224
 
225
+ def load_ip_adapter(self,
226
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
227
+ subfolder: str,
228
+ weight_name: str,
229
+ **kwargs
230
+ ):
231
+ # if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
232
+ self.feature_extractor = CLIPImageProcessor()
233
+
234
+ # if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
235
+ self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
236
+ pretrained_model_name_or_path_or_dict,
237
+ subfolder=os.path.join(subfolder, "image_encoder"),
238
+ ).to(self.device, dtype=self.dtype)
239
+ # else:
240
+ # print("kio: already has image_encoder", hasattr(self, "image_encoder"), getattr(self, "feature_extractor", None) is None)
241
+
242
+ # kiotodo: init ImageProjection
243
+ model_path = os.path.join(
244
+ pretrained_model_name_or_path_or_dict, subfolder, weight_name)
245
+ state_dict = torch.load(model_path, map_location="cpu")
246
+
247
+ clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
248
+ cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4
249
+ self.encoder_hid_proj = ImageProjection(
250
+ cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4
251
+ )
252
+
253
+ image_proj_state_dict = {}
254
+ image_proj_state_dict.update(
255
+ {
256
+ "image_embeds.weight": state_dict["image_proj"]["proj.weight"],
257
+ "image_embeds.bias": state_dict["image_proj"]["proj.bias"],
258
+ "norm.weight": state_dict["image_proj"]["norm.weight"],
259
+ "norm.bias": state_dict["image_proj"]["norm.bias"],
260
+ }
261
+ )
262
+
263
+ self.encoder_hid_proj.load_state_dict(image_proj_state_dict)
264
+ self.encoder_hid_proj.to(dtype=self.dtype, device=self.device)
265
+
266
+ dir_ipadapter = os.path.join(
267
+ pretrained_model_name_or_path_or_dict, subfolder, '.'.join(weight_name.split(".")[:-1]))
268
+ self.unet.load_ip_adapter(dir_ipadapter, "", 1, "fp16")
269
+
270
+ def encode_image(self, image, device, num_images_per_prompt):
271
+ dtype = next(self.image_encoder.parameters()).dtype
272
+ if not isinstance(image, torch.Tensor):
273
+ image = self.feature_extractor(
274
+ image, return_tensors="pt").pixel_values
275
+
276
+ image = image.to(device=device, dtype=dtype)
277
+ image_embeds = self.image_encoder(image).image_embeds
278
+ image_embeds = image_embeds.repeat_interleave(
279
+ num_images_per_prompt, dim=0)
280
+
281
+ uncond_image_embeds = torch.zeros_like(image_embeds)
282
+ return image_embeds, uncond_image_embeds
283
+
284
  def decode_latents(self, latents):
285
+ latents = 1 / self.vae.scaling_factor * latents
286
  image = self.vae.decode(latents).sample
287
  image = (image / 2 + 0.5).clamp(0, 1)
288
  # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
289
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
290
  return image
291
 
292
+ def lyra_decode_latents(self, latents):
293
+ latents = 1 / self.vae.scaling_factor * latents
294
+ image = self.vae.decode(latents)
295
+ image = image.permute(0, 2, 3, 1)
296
+ image = (image / 2 + 0.5).clamp(0, 1)
297
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
298
+ image = image.cpu().float().numpy()
299
+
300
+ return image
301
+
302
  def check_inputs(
303
  self,
304
  prompt,
 
309
  negative_prompt_embeds=None,
310
  ):
311
  if height % 64 != 0 or width % 64 != 0: # the initial version only supports height and width that are multiples of 64
312
+ raise ValueError(
313
+ f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
314
 
315
  if prompt is not None and prompt_embeds is not None:
316
  raise ValueError(
 
322
  "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
323
  )
324
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
325
+ raise ValueError(
326
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
327
 
328
  if negative_prompt is not None and negative_prompt_embeds is not None:
329
  raise ValueError(
 
340
  )
341
 
342
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
343
+ shape = (batch_size, num_channels_latents, height //
344
+ self.vae.scale_factor, width // self.vae.scale_factor)
345
  if isinstance(generator, list) and len(generator) != batch_size:
346
  raise ValueError(
347
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
 
349
  )
350
 
351
  if latents is None:
352
+ latents = randn_tensor(
353
+ shape, generator=generator, device=device, dtype=dtype)
354
  else:
355
  latents = latents.to(device)
356
 
 
364
  # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
365
  # and should be between [0, 1]
366
 
367
+ accepts_eta = "eta" in set(inspect.signature(
368
+ self.scheduler.step).parameters.keys())
369
  extra_step_kwargs = {}
370
  if accepts_eta:
371
  extra_step_kwargs["eta"] = eta
372
 
373
  # check if the scheduler accepts generator
374
+ accepts_generator = "generator" in set(
375
+ inspect.signature(self.scheduler.step).parameters.keys())
376
  if accepts_generator:
377
  extra_step_kwargs["generator"] = generator
378
  return extra_step_kwargs
 
388
  negative_prompt: Optional[Union[str, List[str]]] = None,
389
  num_images_per_prompt: Optional[int] = 1,
390
  eta: float = 0.0,
391
+ generator: Optional[Union[torch.Generator,
392
+ List[torch.Generator]]] = None,
393
  latents: Optional[torch.FloatTensor] = None,
394
  prompt_embeds: Optional[torch.FloatTensor] = None,
395
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
396
+ ip_adapter_image: Optional[PipelineImageInput] = None,
397
+ param_scale_dict: Optional[dict] = {}
398
  ):
399
  r"""
400
  Function invoked when calling the pipeline for generation.
 
471
  negative_prompt_embeds=negative_prompt_embeds,
472
  )
473
 
474
+ # 3.5 Encode ipadapter_image
475
+ if ip_adapter_image is not None:
476
+ image_embeds, negative_image_embeds = self.encode_image(
477
+ ip_adapter_image, device, num_images_per_prompt)
478
+ if do_classifier_free_guidance:
479
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
480
+ image_embeds = self.encoder_hid_proj(image_embeds).to(self.dtype)
481
+
482
  # 4. Prepare timesteps
483
  self.scheduler.set_timesteps(num_inference_steps, device=device)
484
  timesteps = self.scheduler.timesteps
 
500
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
501
 
502
  # 7. Denoising loop
503
+ num_warmup_steps = len(timesteps) - \
504
+ num_inference_steps * self.scheduler.order
505
 
506
  for i, t in enumerate(timesteps):
507
  # expand the latents if we are doing classifier free guidance
508
+ latent_model_input = torch.cat(
509
+ [latents] * 2) if do_classifier_free_guidance else latents
510
+ latent_model_input = self.scheduler.scale_model_input(
511
+ latent_model_input, t)
512
+ latent_model_input = latent_model_input.permute(
513
+ 0, 2, 3, 1).contiguous()
514
+
515
+ # 后边三个 None 是给到controlnet 的参数,暂时给到 None 当 placeholder
516
+ # todo: forward ip image_embeds
517
+ # break
518
+ if ip_adapter_image is not None:
519
+ noise_pred = self.unet.forward(
520
+ latent_model_input, prompt_embeds, t, None, None, None, None, {"ip_hidden_states": image_embeds}, param_scale_dict)
521
+ else:
522
+ noise_pred = self.unet.forward(
523
+ latent_model_input, prompt_embeds, t)
524
 
525
  noise_pred = noise_pred.permute(0, 3, 1, 2)
 
526
 
527
+ # np.save(f"/workspace/noise_pred_{i}.npy", noise_pred.detach().cpu().numpy())  # debug only; avoid writing to disk on every step
528
+
529
+ # perform guidance
530
  if do_classifier_free_guidance:
531
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
532
+ noise_pred = noise_pred_uncond + guidance_scale * \
533
+ (noise_pred_text - noise_pred_uncond)
534
 
535
+ # if do_classifier_free_guidance and guidance_rescale > 0.0:
536
+ # # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
537
+ # noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
538
  # compute the previous noisy sample x_t -> x_t-1
539
+ latents = self.scheduler.step(
540
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
541
+ # image = self.decode_latents(latents)
542
+ image = self.lyra_decode_latents(latents)
543
  image = numpy_to_pil(image)
544
 
545
  return image
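To tie the new `ip_adapter_image` argument to `load_ip_adapter`, a hedged usage sketch in the style of the repository demos. The model directory, the IP-Adapter folder layout, the weight file name and the `reload_pipe` loader are assumptions.

```python
# Illustrative IP-Adapter usage sketch (paths, layout and loader are assumed).
import torch
from PIL import Image
from lyrasd_model import LyraSdTxt2ImgPipeline

pipe = LyraSdTxt2ImgPipeline()
pipe.reload_pipe("./models/lyrasd_rev_animated")                 # assumed model loader
pipe.load_ip_adapter("./models/IP-Adapter", "sd15", "ip-adapter_sd15.bin")  # assumed layout

style_image = Image.open("./images/style_ref.png").convert("RGB")

images = pipe(
    prompt="a cozy cabin in the woods, warm light",
    height=512,
    width=512,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator().manual_seed(123),
    ip_adapter_image=style_image,
)
images[0].save("txt2img_ip_adapter.png")
```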
lyrasd_model/lyrasd_vae_model.py ADDED
@@ -0,0 +1,363 @@
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Dict, Optional, Tuple, Union
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
20
+ import numpy as np
21
+
22
+ from safetensors.torch import load_file
23
+
24
+ import os
25
+
26
+ class LyraSdVaeModel():
27
+ r"""
28
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
29
+
30
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
31
+ for all models (such as downloading or saving).
32
+
33
+ Parameters:
34
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
35
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
36
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
37
+ Tuple of downsample block types.
38
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
39
+ Tuple of upsample block types.
40
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
41
+ Tuple of block output channels.
42
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
43
+ latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
44
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
45
+ scaling_factor (`float`, *optional*, defaults to 0.18215):
46
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
47
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
48
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
49
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
50
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
51
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
52
+ force_upcast (`bool`, *optional*, default to `True`):
53
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
54
+ can be fine-tuned / trained to a lower range without losing too much precision in which case
55
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
56
+ """
57
+
58
+ _supports_gradient_checkpointing = True
59
+
60
+ def __init__(
61
+ self,
62
+ dtype: str = "fp16",
63
+ scaling_factor: float = 0.18215,
64
+ scale_factor: int = 8,
65
+ is_upcast: bool = False
66
+ ):
67
+ super().__init__()
68
+ self.is_upcast = is_upcast
69
+ self.scaling_factor = scaling_factor
70
+ self.scale_factor = scale_factor
71
+ self.model = torch.classes.lyrasd.VaeModelOp(
72
+ dtype,
73
+ is_upcast
74
+ )
75
+
76
+ self.vae_cache = {}
77
+
78
+ self.use_slicing = False
79
+ self.use_tiling = False
80
+
81
+ self.tile_sample_min_size = 512
82
+ self.tile_latent_min_size = 64
83
+ self.tile_overlap_factor = 0.25
84
+
85
+ def reload_vae_model(self, vae_path, vae_file_format='fp32'):
86
+ if len(vae_path) > 0 and vae_path[-1] != "/":
87
+ vae_path = vae_path + "/"
88
+ return self.model.reload_vae_model(vae_path, vae_file_format)
89
+
90
+ def reload_vae_model_v2(self, model_path):
91
+ checkpoint_file = os.path.join(model_path, "vae/diffusion_pytorch_model.bin")
92
+ if not os.path.exists(checkpoint_file):
93
+ checkpoint_file = os.path.join(model_path, "vae/diffusion_pytorch_model.safetensors")
94
+ if checkpoint_file in self.vae_cache:
95
+ state_dict = self.vae_cache[checkpoint_file]
96
+ else:
97
+ if "safetensors" in checkpoint_file:
98
+ state_dict = load_file(checkpoint_file)
99
+ else:
100
+ state_dict = torch.load(checkpoint_file, map_location="cpu")
101
+
102
+ # replace deprecated weights
103
+ for path in ["encoder.mid_block.attentions.0", "decoder.mid_block.attentions.0"]:
104
+ # group_norm path stays the same
105
+
106
+ # query -> to_q
107
+ if f"{path}.query.weight" in state_dict:
108
+ state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
109
+ if f"{path}.query.bias" in state_dict:
110
+ state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
111
+
112
+ # key -> to_k
113
+ if f"{path}.key.weight" in state_dict:
114
+ state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
115
+ if f"{path}.key.bias" in state_dict:
116
+ state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
117
+
118
+ # value -> to_v
119
+ if f"{path}.value.weight" in state_dict:
120
+ state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
121
+ if f"{path}.value.bias" in state_dict:
122
+ state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
123
+
124
+ # proj_attn -> to_out.0
125
+ if f"{path}.proj_attn.weight" in state_dict:
126
+ state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
127
+ if f"{path}.proj_attn.bias" in state_dict:
128
+ state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
129
+
130
+ for key in state_dict:
131
+ # print(key)
132
+ if len(state_dict[key].shape) == 4:
133
+ state_dict[key] = state_dict[key].permute(0,2,3,1).contiguous()
134
+ else:
135
+ state_dict[key] = state_dict[key]
136
+ if self.is_upcast and (key.startswith("decoder.up_blocks.2") or key.startswith("decoder.up_blocks.3") or key.startswith("decoder.conv_norm_out")):
137
+ # print(key)
138
+ state_dict[key] = state_dict[key].to(torch.float32)
139
+ else:
140
+ state_dict[key] = state_dict[key].to(torch.float16)
141
+
142
+ self.vae_cache[checkpoint_file] = state_dict
143
+
144
+ return self.model.reload_vae_model_from_cache(state_dict, "cpu")
145
+
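The block above renames deprecated VAE attention keys one by one; as a compact reference, an equivalent table-driven sketch of the same mapping (behaviourally the same idea, not a drop-in replacement for the method).

```python
# Sketch: the deprecated-key renaming above, expressed as a lookup table.
RENAMES = {"query": "to_q", "key": "to_k", "value": "to_v", "proj_attn": "to_out.0"}

def rename_deprecated_attention_keys(state_dict: dict) -> dict:
    for path in ["encoder.mid_block.attentions.0", "decoder.mid_block.attentions.0"]:
        for old, new in RENAMES.items():
            for suffix in ("weight", "bias"):
                old_key, new_key = f"{path}.{old}.{suffix}", f"{path}.{new}.{suffix}"
                if old_key in state_dict:
                    state_dict[new_key] = state_dict.pop(old_key)
    return state_dict
```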
146
+ def enable_tiling(self, use_tiling: bool = True):
147
+ r"""
148
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
149
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
150
+ processing larger images.
151
+ """
152
+ self.use_tiling = use_tiling
153
+
154
+ def disable_tiling(self):
155
+ r"""
156
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
157
+ decoding in one step.
158
+ """
159
+ self.enable_tiling(False)
160
+
161
+ def enable_slicing(self):
162
+ r"""
163
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
164
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
165
+ """
166
+ self.use_slicing = True
167
+
168
+ def disable_slicing(self):
169
+ r"""
170
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
171
+ decoding in one step.
172
+ """
173
+ self.use_slicing = False
174
+
175
+ def lyra_decode(self, x: torch.FloatTensor) -> torch.FloatTensor:
176
+ x = x.permute(0, 2, 3, 1).contiguous()
177
+ x = self.model.vae_decode(x)
178
+ return x.permute(0, 3, 1, 2)
179
+
180
+ def lyra_encode(self, x: torch.FloatTensor) -> torch.FloatTensor:
181
+ x = x.permute(0, 2, 3, 1).contiguous()
182
+ x = self.model.vae_encode(x)
183
+ return x.permute(0, 3, 1, 2)
184
+
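`lyra_encode`/`lyra_decode` above only shuttle tensors between PyTorch's NCHW layout and the NHWC layout the fused kernels work in; a tiny round-trip sketch with the custom op replaced by an identity function.

```python
# Sketch: NCHW <-> NHWC round trip around a (mocked) channels-last kernel.
import torch

def fake_channels_last_op(x_nhwc: torch.Tensor) -> torch.Tensor:
    # stand-in for the lyrasd VaeModelOp encode/decode call
    return x_nhwc

x = torch.randn(1, 4, 64, 64)                                   # NCHW latents
y = fake_channels_last_op(x.permute(0, 2, 3, 1).contiguous())   # NHWC into the kernel
y = y.permute(0, 3, 1, 2)                                       # back to NCHW
assert y.shape == x.shape
```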
185
+ def encode(
186
+ self, x: torch.FloatTensor, return_dict: bool = True
187
+ ) -> DiagonalGaussianDistribution:
188
+ """
189
+ Encode a batch of images into latents.
190
+
191
+ Args:
192
+ x (`torch.FloatTensor`): Input batch of images.
193
+ return_dict (`bool`, *optional*, defaults to `True`):
194
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
195
+
196
+ Returns:
197
+ The latent representations of the encoded images. If `return_dict` is True, a
198
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
199
+ """
200
+ if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
201
+ return self.tiled_encode(x, return_dict=return_dict)
202
+
203
+ if self.use_slicing and x.shape[0] > 1:
204
+ encoded_slices = [self.lyra_encode(
205
+ x_slice) for x_slice in x.split(1)]
206
+ h = torch.cat(encoded_slices)
207
+ posterior = DiagonalGaussianDistribution(h)
208
+ else:
209
+ moments = self.lyra_encode(x)
210
+ posterior = DiagonalGaussianDistribution(moments)
211
+
212
+ return posterior
213
+
214
+ def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> torch.FloatTensor:
215
+ if self.use_tiling and (z.shape[2] > self.tile_latent_min_size or z.shape[3] > self.tile_latent_min_size):
216
+ return self.tiled_decode(z, return_dict=return_dict)
217
+
218
+ dec = self.lyra_decode(z)
219
+
220
+ return dec
221
+
222
+ def decode(
223
+ self, z: torch.FloatTensor, return_dict: bool = True, generator=None
224
+ ) -> torch.FloatTensor:
225
+ """
226
+ Decode a batch of images.
227
+
228
+ Args:
229
+ z (`torch.FloatTensor`): Input batch of latent vectors.
230
+ return_dict (`bool`, *optional*, defaults to `True`):
231
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
232
+
233
+ Returns:
234
+ [`~models.vae.DecoderOutput`] or `tuple`:
235
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
236
+ returned.
237
+
238
+ """
239
+ if self.use_slicing and z.shape[0] > 1:
240
+ decoded_slices = [self._decode(
241
+ z_slice) for z_slice in z.split(1)]
242
+ decoded = torch.cat(decoded_slices)
243
+ else:
244
+ decoded = self._decode(z)
245
+
246
+ return decoded
247
+
248
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
249
+ blend_extent = min(a.shape[2], b.shape[2], blend_extent)
250
+ for y in range(blend_extent):
251
+ b[:, :, y, :] = a[:, :, -blend_extent + y, :] * \
252
+ (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
253
+ return b
254
+
255
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
256
+ blend_extent = min(a.shape[3], b.shape[3], blend_extent)
257
+ for x in range(blend_extent):
258
+ b[:, :, :, x] = a[:, :, :, -blend_extent + x] * \
259
+ (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
260
+ return b
261
+
262
+ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> DiagonalGaussianDistribution:
263
+ r"""Encode a batch of images using a tiled encoder.
264
+
265
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
266
+ steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
267
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
268
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
269
+ output, but they should be much less noticeable.
270
+
271
+ Args:
272
+ x (`torch.FloatTensor`): Input batch of images.
273
+ return_dict (`bool`, *optional*, defaults to `True`):
274
+ Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
275
+
276
+ Returns:
277
+ `DiagonalGaussianDistribution`: The posterior over the latents of the encoded image tiles.
280
+ """
281
+ overlap_size = int(self.tile_sample_min_size *
282
+ (1 - self.tile_overlap_factor))
283
+ blend_extent = int(self.tile_latent_min_size *
284
+ self.tile_overlap_factor)
285
+ row_limit = self.tile_latent_min_size - blend_extent
286
+
287
+ # Split the image into 512x512 tiles and encode them separately.
288
+ rows = []
289
+ for i in range(0, x.shape[2], overlap_size):
290
+ row = []
291
+ for j in range(0, x.shape[3], overlap_size):
292
+ tile = x[:, :, i: i + self.tile_sample_min_size,
293
+ j: j + self.tile_sample_min_size]
294
+ tile = self.lyra_encode(tile)
295
+ row.append(tile)
296
+ rows.append(row)
297
+ result_rows = []
298
+ for i, row in enumerate(rows):
299
+ result_row = []
300
+ for j, tile in enumerate(row):
301
+ # blend the above tile and the left tile
302
+ # to the current tile and add the current tile to the result row
303
+ if i > 0:
304
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
305
+ if j > 0:
306
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
307
+ result_row.append(tile[:, :, :row_limit, :row_limit])
308
+ result_rows.append(torch.cat(result_row, dim=3))
309
+
310
+ moments = torch.cat(result_rows, dim=2)
311
+ posterior = DiagonalGaussianDistribution(moments)
312
+
313
+ return posterior
314
+
315
+ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> torch.FloatTensor:
316
+ r"""
317
+ Decode a batch of images using a tiled decoder.
318
+
319
+ Args:
320
+ z (`torch.FloatTensor`): Input batch of latent vectors.
321
+ return_dict (`bool`, *optional*, defaults to `True`):
322
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
323
+
324
+ Returns:
325
+ `torch.FloatTensor` or `tuple`: The decoded images, or a plain `tuple` containing them when `return_dict` is False.
328
+ """
329
+ overlap_size = int(self.tile_latent_min_size *
330
+ (1 - self.tile_overlap_factor))
331
+ blend_extent = int(self.tile_sample_min_size *
332
+ self.tile_overlap_factor)
333
+ row_limit = self.tile_sample_min_size - blend_extent
334
+
335
+ # Split z into overlapping 64x64 tiles and decode them separately.
336
+ # The tiles have an overlap to avoid seams between tiles.
337
+ rows = []
338
+ for i in range(0, z.shape[2], overlap_size):
339
+ row = []
340
+ for j in range(0, z.shape[3], overlap_size):
341
+ tile = z[:, :, i: i + self.tile_latent_min_size,
342
+ j: j + self.tile_latent_min_size]
343
+ decoded = self.lyra_decode(tile)
344
+ row.append(decoded)
345
+ rows.append(row)
346
+ result_rows = []
347
+ for i, row in enumerate(rows):
348
+ result_row = []
349
+ for j, tile in enumerate(row):
350
+ # blend the above tile and the left tile
351
+ # to the current tile and add the current tile to the result row
352
+ if i > 0:
353
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
354
+ if j > 0:
355
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
356
+ result_row.append(tile[:, :, :row_limit, :row_limit])
357
+ result_rows.append(torch.cat(result_row, dim=3))
358
+
359
+ dec = torch.cat(result_rows, dim=2)
360
+ if not return_dict:
361
+ return (dec,)
362
+
363
+ return dec
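The tiled encode/decode path above hides seams by linearly cross-fading the overlapping rows and columns of neighbouring tiles (blend_v / blend_h). A minimal, self-contained sketch of that weighting, on toy tensors rather than the real tile sizes:

import torch

# Row-wise linear cross-fade, same weighting as blend_v above (blend_h is the
# same idea along the width axis). Toy example only, not the real tile sizes.
def blend_rows(above: torch.Tensor, current: torch.Tensor, blend_extent: int) -> torch.Tensor:
    blend_extent = min(above.shape[2], current.shape[2], blend_extent)
    current = current.clone()
    for y in range(blend_extent):
        w = y / blend_extent  # 0 -> keep the tile above, 1 -> keep the current tile
        current[:, :, y, :] = above[:, :, -blend_extent + y, :] * (1 - w) + current[:, :, y, :] * w
    return current

above = torch.zeros(1, 1, 4, 4)    # tile from the previous row
current = torch.ones(1, 1, 4, 4)   # current tile below it
print(blend_rows(above, current, 4)[0, 0, :, 0])  # tensor([0.0000, 0.2500, 0.5000, 0.7500])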
lyrasd_model/lyrasdxl_controlnet_txt2img_pipeline.py ADDED
@@ -0,0 +1,346 @@
1
+ import inspect
2
+ import os
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
+
6
+ import gc
7
+ import torch
8
+ import numpy as np
9
+ from glob import glob
10
+
11
+ import PIL
12
+
13
+ from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel
14
+ from diffusers.loaders import TextualInversionLoaderMixin
15
+ from diffusers.image_processor import VaeImageProcessor
16
+ from diffusers.models import AutoencoderKL
17
+ from diffusers.schedulers import (DPMSolverMultistepScheduler,
18
+ EulerAncestralDiscreteScheduler,
19
+ EulerDiscreteScheduler,
20
+ KarrasDiffusionSchedulers)
21
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
22
+ from diffusers.utils.torch_utils import randn_tensor
23
+ from diffusers.utils import logging
24
+ from PIL import Image
25
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
26
+ from diffusers.utils import PIL_INTERPOLATION
27
+ from .lyrasd_vae_model import LyraSdVaeModel
28
+
29
+ from .lora_util import add_text_lora_layer, add_xltext_lora_layer, add_lora_to_opt_model, load_state_dict
30
+ from safetensors.torch import load_file
31
+ from .lyrasdxl_pipeline_base import LyraSDXLPipelineBase
32
+
33
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
34
+ """
35
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
36
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
37
+ """
38
+ std_text = noise_pred_text.std(
39
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
40
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
41
+ # rescale the results from guidance (fixes overexposure)
42
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
43
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
44
+ noise_cfg = guidance_rescale * noise_pred_rescaled + \
45
+ (1 - guidance_rescale) * noise_cfg
46
+ return noise_cfg
47
+
48
+
49
+ class LyraSdXLControlnetTxt2ImgPipeline(LyraSDXLPipelineBase, StableDiffusionXLPipeline):
50
+ device = torch.device("cpu")
51
+ dtype = torch.float32
52
+
53
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.13025) -> None:
54
+ self.register_to_config(force_zeros_for_empty_prompt=True)
55
+
56
+ super().__init__(device, dtype, vae_scale_factor=vae_scale_factor, vae_scaling_factor=vae_scaling_factor)
57
+
58
+
59
+ def prepare_image(
60
+ self,
61
+ image,
62
+ width,
63
+ height,
64
+ batch_size,
65
+ num_images_per_prompt,
66
+ device,
67
+ dtype,
68
+ do_classifier_free_guidance=False,
69
+ guess_mode=False,
70
+ ):
71
+ image = self.control_image_processor.preprocess(image, height, width)
72
+ image = image.permute(0, 2, 3, 1)
73
+
74
+ image = image.to(device=device, dtype=dtype)
75
+ # print(image.shape)
76
+ # print(image)
77
+
78
+ return image
79
+
80
+ @property
81
+ def _execution_device(self):
82
+ if not hasattr(self.unet, "_hf_hook"):
83
+ return self.device
84
+ for module in self.unet.modules():
85
+ if (
86
+ hasattr(module, "_hf_hook")
87
+ and hasattr(module._hf_hook, "execution_device")
88
+ and module._hf_hook.execution_device is not None
89
+ ):
90
+ return torch.device(module._hf_hook.execution_device)
91
+ return self.device
92
+
93
+ def _get_aug_emb(self, add_embedding, time_ids, text_embeds, dtype):
94
+ time_embeds = self.add_time_proj(time_ids.flatten())
95
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
96
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
97
+ add_embeds = add_embeds.to(dtype)
98
+ aug_emb = add_embedding(add_embeds)
99
+ return aug_emb
100
+
101
+ @torch.no_grad()
102
+ def __call__(
103
+ self,
104
+ prompt: Union[str, List[str]] = None,
105
+ prompt_2: Optional[Union[str, List[str]]] = None,
106
+ height: Optional[int] = None,
107
+ width: Optional[int] = None,
108
+ num_inference_steps: int = 50,
109
+ denoising_end: Optional[float] = None,
110
+ guidance_scale: float = 5.0,
111
+ negative_prompt: Optional[Union[str, List[str]]] = None,
112
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
113
+ num_images_per_prompt: Optional[int] = 1,
114
+ controlnet_names: Optional[List[str]] = None,
115
+ controlnet_images: Optional[List[PIL.Image.Image]] = None,
116
+ controlnet_scale: Optional[List[float]] = None,
117
+ guess_mode=False,
118
+ eta: float = 0.0,
119
+ generator: Optional[Union[torch.Generator,
120
+ List[torch.Generator]]] = None,
121
+ latents: Optional[torch.FloatTensor] = None,
122
+ prompt_embeds: Optional[torch.FloatTensor] = None,
123
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
124
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
125
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
126
+ output_type: Optional[str] = "pil",
127
+ return_dict: bool = True,
128
+ callback: Optional[Callable[[
129
+ int, int, torch.FloatTensor], None]] = None,
130
+ callback_steps: int = 1,
131
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
132
+ guidance_rescale: float = 0.0,
133
+ original_size: Optional[Tuple[int, int]] = None,
134
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
135
+ target_size: Optional[Tuple[int, int]] = None,
136
+ ):
137
+
138
+ # 0. Default height and width to unet
139
+ height = height or self.default_sample_size * self.vae_scale_factor
140
+ width = width or self.default_sample_size * self.vae_scale_factor
141
+
142
+ original_size = original_size or (height, width)
143
+ target_size = target_size or (height, width)
144
+
145
+ # 1. Check inputs. Raise error if not correct
146
+ self.check_inputs(
147
+ prompt,
148
+ prompt_2,
149
+ height,
150
+ width,
151
+ callback_steps,
152
+ negative_prompt,
153
+ negative_prompt_2,
154
+ prompt_embeds,
155
+ negative_prompt_embeds,
156
+ pooled_prompt_embeds,
157
+ negative_pooled_prompt_embeds,
158
+ )
159
+
160
+ # 2. Define call parameters
161
+ if prompt is not None and isinstance(prompt, str):
162
+ batch_size = 1
163
+ elif prompt is not None and isinstance(prompt, list):
164
+ batch_size = len(prompt)
165
+ else:
166
+ batch_size = prompt_embeds.shape[0]
167
+
168
+ device = self._execution_device
169
+
170
+ do_classifier_free_guidance = guidance_scale > 1.0
171
+
172
+ # 3. Encode input prompt
173
+ text_encoder_lora_scale = (
174
+ cross_attention_kwargs.get(
175
+ "scale", None) if cross_attention_kwargs is not None else None
176
+ )
177
+ (
178
+ prompt_embeds,
179
+ negative_prompt_embeds,
180
+ pooled_prompt_embeds,
181
+ negative_pooled_prompt_embeds,
182
+ ) = self.encode_prompt(
183
+ prompt=prompt,
184
+ prompt_2=prompt_2,
185
+ device=device,
186
+ num_images_per_prompt=num_images_per_prompt,
187
+ do_classifier_free_guidance=do_classifier_free_guidance,
188
+ negative_prompt=negative_prompt,
189
+ negative_prompt_2=negative_prompt_2,
190
+ prompt_embeds=prompt_embeds,
191
+ negative_prompt_embeds=negative_prompt_embeds,
192
+ pooled_prompt_embeds=pooled_prompt_embeds,
193
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
194
+ lora_scale=text_encoder_lora_scale,
195
+ )
196
+
197
+ control_images = []
198
+
199
+ for image_ in controlnet_images:
200
+ image_ = self.prepare_image(
201
+ image=image_,
202
+ width=width,
203
+ height=height,
204
+ batch_size=batch_size * num_images_per_prompt,
205
+ num_images_per_prompt=num_images_per_prompt,
206
+ device=device,
207
+ dtype=prompt_embeds.dtype,
208
+ do_classifier_free_guidance=do_classifier_free_guidance
209
+ )
210
+
211
+ control_images.append(image_)
212
+
213
+ control_scales = []
214
+
215
+ scales = [1.0, ] * 10
216
+ if guess_mode:
217
+ scales = torch.logspace(-1, 0, 10).tolist()
218
+
219
+ for scale in controlnet_scale:
220
+ scales_ = [d * scale for d in scales]
221
+ control_scales.append(scales_)
222
+
223
+ # 4. Prepare timesteps
224
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
225
+
226
+ timesteps = self.scheduler.timesteps
227
+
228
+ # 5. Prepare latent variables
229
+ num_channels_latents = self.unet_in_channels
230
+ latents = self.prepare_latents(
231
+ batch_size * num_images_per_prompt,
232
+ num_channels_latents,
233
+ height,
234
+ width,
235
+ prompt_embeds.dtype,
236
+ device,
237
+ generator,
238
+ latents,
239
+ )
240
+
241
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
242
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
243
+
244
+ # 7. Prepare added time ids & embeddings
245
+ add_text_embeds = pooled_prompt_embeds
246
+ add_time_ids = list(
247
+ original_size + crops_coords_top_left + target_size)
248
+ add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
249
+
250
+ if do_classifier_free_guidance:
251
+ prompt_embeds = torch.cat(
252
+ [negative_prompt_embeds, prompt_embeds], dim=0)
253
+ add_text_embeds = torch.cat(
254
+ [negative_pooled_prompt_embeds, add_text_embeds], dim=0)
255
+ add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
256
+
257
+ prompt_embeds = prompt_embeds.to(device)
258
+ add_text_embeds = add_text_embeds.to(device)
259
+ add_time_ids = add_time_ids.to(device).repeat(
260
+ batch_size * num_images_per_prompt, 1)
261
+
262
+ # 8. Denoising loop
263
+ num_warmup_steps = max(
264
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0)
265
+
266
+ # 7.1 Apply denoising_end
267
+ if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1:
268
+ discrete_timestep_cutoff = int(
269
+ round(
270
+ self.scheduler.config.num_train_timesteps
271
+ - (denoising_end * self.scheduler.config.num_train_timesteps)
272
+ )
273
+ )
274
+ num_inference_steps = len(
275
+ list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
276
+ timesteps = timesteps[:num_inference_steps]
277
+
278
+ aug_emb = self._get_aug_emb(
279
+ self.add_embedding, add_time_ids, add_text_embeds, prompt_embeds.dtype)
280
+
281
+ controlnet_aug_embs = []
282
+ for controlnet_name in controlnet_names:
283
+ controlnet_aug_embs.append(self._get_aug_emb(self.controlnet_add_embedding[controlnet_name],
284
+ add_time_ids, add_text_embeds, prompt_embeds.dtype))
285
+
286
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
287
+ for i, t in enumerate(timesteps):
288
+ # expand the latents if we are doing classifier free guidance
289
+ latent_model_input = torch.cat(
290
+ [latents] * 2) if do_classifier_free_guidance else latents
291
+
292
+ latent_model_input = self.scheduler.scale_model_input(
293
+ latent_model_input, t)
294
+ latent_model_input = latent_model_input.permute(
295
+ 0, 2, 3, 1).contiguous()
296
+
297
+ noise_pred = self.unet.forward(
298
+ latent_model_input, prompt_embeds, t, aug_emb,
299
+ controlnet_names, control_images, controlnet_aug_embs, control_scales, guess_mode).permute(0, 3, 1, 2)
300
+
301
+ # print(noise_pred)
302
+
303
+ # perform guidance
304
+ if do_classifier_free_guidance:
305
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
306
+ noise_pred = noise_pred_uncond + guidance_scale * \
307
+ (noise_pred_text - noise_pred_uncond)
308
+
309
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
310
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
311
+ noise_pred = rescale_noise_cfg(
312
+ noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
313
+
314
+ # compute the previous noisy sample x_t -> x_t-1
315
+ latents = self.scheduler.step(
316
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
317
+
318
+ # call the callback, if provided
319
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
320
+ progress_bar.update()
321
+ if callback is not None and i % callback_steps == 0:
322
+ callback(i, t, latents)
323
+
324
+ # make sure the VAE is in float32 mode, as it overflows in float16
325
+ # if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
326
+ # self.upcast_vae()
327
+ # latents = latents.to(
328
+ # next(iter(self.vae.post_quant_conv.parameters())).dtype)
329
+ # # latents = latents.to(torch.float32)
330
+ # if output_type == "latent":
331
+ # return latents
332
+
333
+ # np.save(f"/workspace/latents.npy", latents.detach().cpu().numpy())
334
+
335
+ # image = self.vae.decode(
336
+ # latents / self.vae.config.scaling_factor, return_dict=False)[0]
337
+ image = self.vae.decode(1 / self.vae.scaling_factor * latents)
338
+
339
+ image = self.image_processor.postprocess(
340
+ image, output_type=output_type)
341
+
342
+ # Offload last model to CPU
343
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
344
+ self.final_offload_hook.offload()
345
+
346
+ return image
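Taken together, the new SDXL ControlNet text-to-image pipeline is driven much like the existing SD ControlNet pipelines. A rough usage sketch follows; the paths, the "canny" model name, and the export from lyrasd_model are assumptions, and the demo script controlnet_txt2img_sdxl_demo.py added in this commit is the authoritative example:

import torch
from PIL import Image
from lyrasd_model import LyraSdXLControlnetTxt2ImgPipeline  # assumes __init__.py exports it

# Placeholder paths; adjust to the local model layout. Also assumes the lyrasd
# C++ ops library has already been loaded (see lyrasd_model/__init__.py).
model_path = "./models/lyrasd_xl_base"       # hypothetical converted SDXL base model
controlnet_path = "./models/canny_sdxl"      # hypothetical converted SDXL ControlNet

pipe = LyraSdXLControlnetTxt2ImgPipeline()
pipe.reload_pipe(model_path)                            # tokenizers, text encoders, UNet, VAE, scheduler
pipe.load_controlnet_model_v2("canny", controlnet_path)

control_image = Image.open("./images/canny.png").convert("RGB")
images = pipe(
    prompt="a photo of an astronaut riding a horse on mars",
    height=1024, width=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    controlnet_names=["canny"],
    controlnet_images=[control_image],
    controlnet_scale=[0.8],
    generator=torch.Generator().manual_seed(123),
)
images[0].save("out_controlnet_sdxl_txt2img.png")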
lyrasd_model/lyrasdxl_pipeline_base.py ADDED
@@ -0,0 +1,275 @@
1
+ import inspect
2
+ import os
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
+
6
+ import gc
7
+ import torch
8
+ import numpy as np
9
+ from glob import glob
10
+
11
+ from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel
12
+ from diffusers.loaders import TextualInversionLoaderMixin
13
+ from diffusers.image_processor import VaeImageProcessor
14
+ from diffusers.models import AutoencoderKL
15
+ from diffusers.schedulers import (DPMSolverMultistepScheduler,
16
+ EulerAncestralDiscreteScheduler,
17
+ EulerDiscreteScheduler,
18
+ KarrasDiffusionSchedulers)
19
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
20
+ from diffusers.utils.torch_utils import randn_tensor
21
+ from diffusers.utils import logging
22
+ from PIL import Image
23
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
24
+ from .lyrasd_vae_model import LyraSdVaeModel
25
+ from .module.lyrasd_ip_adapter import LyraIPAdapter
26
+ from .lora_util import add_text_lora_layer, add_xltext_lora_layer, add_lora_to_opt_model, load_state_dict
27
+ from safetensors.torch import load_file
28
+
29
+
30
+ class LyraSDXLPipelineBase(TextualInversionLoaderMixin):
31
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, num_channels_unet=4, num_channels_latents=4, vae_scale_factor=8, vae_scaling_factor=0.13025) -> None:
32
+ self.device = device
33
+ self.dtype = dtype
34
+
35
+ self.num_channels_unet = num_channels_unet
36
+ self.num_channels_latents = num_channels_latents
37
+ self.vae_scale_factor = vae_scale_factor
38
+ self.vae_scaling_factor = vae_scaling_factor
39
+
40
+ self.unet_cache = {}
41
+ self.unet_in_channels = 4
42
+
43
+ self.controlnet_cache = {}
44
+ self.controlnet_add_embedding = {}
45
+
46
+ self.loaded_lora = {}
47
+ self.loaded_lora_strength = {}
48
+
49
+ self.scheduler = None
50
+
51
+ self.init_pipe()
52
+
53
+ def init_pipe(self):
54
+ self.vae = LyraSdVaeModel(
55
+ scale_factor=self.vae_scale_factor, scaling_factor=self.vae_scaling_factor, is_upcast=True)
56
+
57
+ self.unet = torch.classes.lyrasd.XLUnet2dConditionalModelOp(
58
+ "fp16",
59
+ self.num_channels_unet,
60
+ self.num_channels_latents)
61
+
62
+ self.default_sample_size = 128
63
+ self.addition_time_embed_dim = 256
64
+ flip_sin_to_cos, freq_shift = True, 0
65
+ self.projection_class_embeddings_input_dim, self.time_embed_dim = 2816, 1280
66
+
67
+ self.add_time_proj = Timesteps(
68
+ self.addition_time_embed_dim, flip_sin_to_cos, freq_shift).to(self.dtype).to(self.device)
69
+
70
+ self.add_embedding = TimestepEmbedding(
71
+ self.projection_class_embeddings_input_dim, self.time_embed_dim).to(self.dtype).to(self.device)
72
+
73
+ self.image_processor = VaeImageProcessor(
74
+ vae_scale_factor=self.vae_scale_factor)
75
+
76
+ self.mask_processor = VaeImageProcessor(
77
+ vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
78
+ )
79
+
80
+ self.control_image_processor = VaeImageProcessor(
81
+ vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
82
+ )
83
+
84
+ self.feature_extractor = CLIPImageProcessor()
85
+
86
+ def reload_pipe(self, model_path):
87
+ self.tokenizer = CLIPTokenizer.from_pretrained(
88
+ model_path, subfolder="tokenizer")
89
+ self.text_encoder = CLIPTextModel.from_pretrained(
90
+ model_path, subfolder="text_encoder").to(self.dtype).to(self.device)
91
+
92
+ self.tokenizer_2 = CLIPTokenizer.from_pretrained(
93
+ model_path, subfolder="tokenizer_2")
94
+ self.text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
95
+ model_path, subfolder="text_encoder_2").to(self.dtype).to(self.device)
96
+
97
+ self.reload_unet_model_v2(model_path)
98
+ self.reload_vae_model_v2(model_path)
99
+
100
+ if not self.scheduler:
101
+ self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(
102
+ model_path, subfolder="scheduler")
103
+
104
+ def load_embedding_weight(self, model, weight_path, unet_file_format="fp16"):
105
+ bin_list = glob(weight_path)
106
+ state_dicts = model.state_dict()
107
+ dtype = np.float32 if unet_file_format == "fp32" else np.float16
108
+ for bin_file in bin_list:
109
+ weight = torch.from_numpy(np.fromfile(bin_file, dtype=dtype)).to(
110
+ self.dtype).to(self.device)
111
+ key = '.'.join(os.path.basename(bin_file).split('.')[1:-1])
112
+ weight = weight.reshape(state_dicts[key].shape)
113
+ state_dicts.update({key: weight})
114
+ model.load_state_dict(state_dicts)
115
+
116
+ @property
117
+ def _execution_device(self):
118
+ if not hasattr(self.unet, "_hf_hook"):
119
+ return self.device
120
+ for module in self.unet.modules():
121
+ if (
122
+ hasattr(module, "_hf_hook")
123
+ and hasattr(module._hf_hook, "execution_device")
124
+ and module._hf_hook.execution_device is not None
125
+ ):
126
+ return torch.device(module._hf_hook.execution_device)
127
+ return self.device
128
+
129
+ def reload_unet_model(self, unet_path, unet_file_format='fp32'):
130
+ if len(unet_path) > 0 and unet_path[-1] != "/":
131
+ unet_path = unet_path + "/"
132
+ self.unet.reload_unet_model(unet_path, unet_file_format)
133
+ self.load_embedding_weight(
134
+ self.add_embedding, f"{unet_path}add_embedding*", unet_file_format=unet_file_format)
135
+
136
+ def reload_vae_model(self, vae_path, vae_file_format='fp32'):
137
+ if len(vae_path) > 0 and vae_path[-1] != "/":
138
+ vae_path = vae_path + "/"
139
+ return self.vae.reload_vae_model(vae_path, vae_file_format)
140
+
141
+ def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
142
+ if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
143
+ lora_model_path = lora_model_path + "/"
144
+ lora = add_xltext_lora_layer(
145
+ self.text_encoder, self.text_encoder_2, lora_model_path, lora_strength, lora_file_format)
146
+
147
+ self.loaded_lora[lora_name] = lora
148
+ self.unet.load_lora(lora_model_path, lora_name,
149
+ lora_strength, lora_file_format)
150
+
151
+ def unload_lora(self, lora_name, clean_cache=False):
152
+ for layer_data in self.loaded_lora[lora_name]:
153
+ layer = layer_data['layer']
154
+ added_weight = layer_data['added_weight']
155
+ layer.weight.data -= added_weight
156
+ self.unet.unload_lora(lora_name, clean_cache)
157
+ del self.loaded_lora[lora_name]
158
+ gc.collect()
159
+ torch.cuda.empty_cache()
160
+
161
+ def load_lora_v2(self, lora_model_path, lora_name, lora_strength):
162
+ if lora_name in self.loaded_lora:
163
+ state_dict = self.loaded_lora[lora_name]
164
+ else:
165
+ state_dict = load_state_dict(lora_model_path)
166
+ self.loaded_lora[lora_name] = state_dict
167
+ self.loaded_lora_strength[lora_name] = lora_strength
168
+ add_lora_to_opt_model(state_dict, self.unet, self.text_encoder,
169
+ self.text_encoder_2, lora_strength)
170
+
171
+ def unload_lora_v2(self, lora_name, clean_cache=False):
172
+ state_dict = self.loaded_lora[lora_name]
173
+ lora_strength = self.loaded_lora_strength[lora_name]
174
+ add_lora_to_opt_model(state_dict, self.unet, self.text_encoder,
175
+ self.text_encoder_2, -1.0 * lora_strength)
176
+ del self.loaded_lora_strength[lora_name]
177
+
178
+ if clean_cache:
179
+ del self.loaded_lora[lora_name]
180
+ gc.collect()
181
+ torch.cuda.empty_cache()
182
+
183
+ def clean_lora_cache(self):
184
+ self.unet.clean_lora_cache()
185
+
186
+ def get_loaded_lora(self):
187
+ return self.unet.get_loaded_lora()
188
+
189
+ def _get_aug_emb(self, time_ids, text_embeds, dtype):
190
+ time_embeds = self.add_time_proj(time_ids.flatten())
191
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
192
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
193
+ add_embeds = add_embeds.to(dtype)
194
+ aug_emb = self.add_embedding(add_embeds)
195
+ return aug_emb
196
+
197
+ def load_ip_adapter(self, dir_ip_adapter, ip_plus, image_encoder_path, num_ip_tokens, ip_projection_dim, dir_face_in=None, num_fp_tokens=1, fp_projection_dim=None, sdxl=True):
198
+ self.ip_adapter_helper = LyraIPAdapter(self, sdxl, "cuda", dir_ip_adapter, ip_plus, image_encoder_path,
199
+ num_ip_tokens, ip_projection_dim, dir_face_in, num_fp_tokens, fp_projection_dim)
200
+
201
+ def reload_unet_model_v2(self, model_path):
202
+ checkpoint_file = os.path.join(
203
+ model_path, "unet/diffusion_pytorch_model.bin")
204
+ if not os.path.exists(checkpoint_file):
205
+ checkpoint_file = os.path.join(
206
+ model_path, "unet/diffusion_pytorch_model.safetensors")
207
+ if checkpoint_file in self.unet_cache:
208
+ state_dict = self.unet_cache[checkpoint_file]
209
+ else:
210
+ if "safetensors" in checkpoint_file:
211
+ state_dict = load_file(checkpoint_file)
212
+ else:
213
+ state_dict = torch.load(checkpoint_file, map_location="cpu")
214
+
215
+ for key in state_dict:
216
+ if len(state_dict[key].shape) == 4:
217
+ # converted_unet_checkpoint[key] = converted_unet_checkpoint[key].to(torch.float16).to("cuda").permute(0,2,3,1).contiguous().cpu()
218
+ state_dict[key] = state_dict[key].to(
219
+ torch.float16).permute(0, 2, 3, 1).contiguous()
220
+ state_dict[key] = state_dict[key].to(torch.float16)
221
+ self.unet_cache[checkpoint_file] = state_dict
222
+
223
+ self.unet.reload_unet_model_from_cache(state_dict, "cpu")
224
+ self.load_embedding_weight_v2(self.add_embedding, state_dict)
225
+
226
+ def load_embedding_weight_v2(self, model, state_dict):
227
+ sub_state_dict = {}
228
+ for k in state_dict:
229
+ if k.startswith("add_embedding"):
230
+ v = state_dict[k]
231
+ sub_k = ".".join(k.split(".")[1:])
232
+ sub_state_dict[sub_k] = v
233
+
234
+ model.load_state_dict(sub_state_dict)
235
+
236
+ def reload_vae_model_v2(self, model_path):
237
+ self.vae.reload_vae_model_v2(model_path)
238
+
239
+ def load_controlnet_model_v2(self, model_name, controlnet_path):
240
+ checkpoint_file = os.path.join(
241
+ controlnet_path, "diffusion_pytorch_model.bin")
242
+ if not os.path.exists(checkpoint_file):
243
+ checkpoint_file = os.path.join(
244
+ controlnet_path, "diffusion_pytorch_model.safetensors")
245
+ if checkpoint_file in self.controlnet_cache:
246
+ state_dict = self.controlnet_cache[checkpoint_file]
247
+ else:
248
+ if "safetensors" in checkpoint_file:
249
+ state_dict = load_file(checkpoint_file)
250
+ else:
251
+ state_dict = torch.load(checkpoint_file, map_location="cpu")
252
+
253
+ for key in state_dict:
254
+ if len(state_dict[key].shape) == 4:
255
+ # converted_unet_checkpoint[key] = converted_unet_checkpoint[key].to(torch.float16).to("cuda").permute(0,2,3,1).contiguous().cpu()
256
+ state_dict[key] = state_dict[key].to(
257
+ torch.float16).permute(0, 2, 3, 1).contiguous()
258
+ state_dict[key] = state_dict[key].to(torch.float16)
259
+ self.controlnet_cache[checkpoint_file] = state_dict
260
+
261
+ self.unet.load_controlnet_model_from_state_dict(
262
+ model_name, state_dict, "cpu")
263
+
264
+ add_embedding = TimestepEmbedding(
265
+ self.projection_class_embeddings_input_dim, self.time_embed_dim).to(self.dtype).to(self.device)
266
+
267
+ self.load_embedding_weight_v2(add_embedding, state_dict)
268
+ self.controlnet_add_embedding[model_name] = add_embedding
269
+
270
+ def unload_controlnet_model(self, model_name):
271
+ self.unet.unload_controlnet_model(model_name, True)
272
+ del self.controlnet_add_embedding[model_name]
273
+
274
+ def get_loaded_controlnet(self):
275
+ return self.unet.get_loaded_controlnet()
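The v2 LoRA path above fuses the LoRA delta directly into the UNet and text-encoder weights, and unloading re-applies the same delta with a negated strength, so every load_lora_v2 call should be paired with a matching unload_lora_v2. A rough sketch of the expected call pattern (the pipeline construction, paths, and LoRA file are placeholders):

from lyrasd_model import LyraSdXLTxt2ImgPipeline  # assumes __init__.py exports it

pipe = LyraSdXLTxt2ImgPipeline()
pipe.reload_pipe("./models/lyrasd_xl_base")        # hypothetical converted SDXL model dir

lora_path = "./models/some_sdxl_lora.safetensors"  # hypothetical LoRA file
pipe.load_lora_v2(lora_path, "my_lora", 0.6)       # merge +0.6 * delta into the weights

images = pipe(prompt="a cat in a spacesuit", height=1024, width=1024,
              num_inference_steps=30, guidance_scale=7.0)

pipe.unload_lora_v2("my_lora", clean_cache=True)   # merge -0.6 * delta, restoring the base weights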
lyrasd_model/lyrasdxl_txt2img_inpaint_pipeline.py ADDED
@@ -0,0 +1,535 @@
1
+ import inspect
2
+ import os
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
+
6
+ import gc
7
+ import torch
8
+ import numpy as np
9
+ from glob import glob
10
+
11
+ from diffusers import StableDiffusionXLInpaintPipeline, UNet2DConditionModel
12
+ from diffusers.loaders import TextualInversionLoaderMixin
13
+ from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
14
+ from diffusers.models import AutoencoderKL
15
+ from diffusers.schedulers import (DPMSolverMultistepScheduler,
16
+ EulerAncestralDiscreteScheduler,
17
+ EulerDiscreteScheduler,
18
+ KarrasDiffusionSchedulers)
19
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
20
+ from diffusers.utils.torch_utils import randn_tensor
21
+ from diffusers.utils import logging
22
+ from PIL import Image
23
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
24
+ from .lyrasd_vae_model import LyraSdVaeModel
25
+ from .module.lyrasd_ip_adapter import LyraIPAdapter
26
+ from .lora_util import add_text_lora_layer, add_xltext_lora_layer, add_lora_to_opt_model, load_state_dict
27
+ from safetensors.torch import load_file
28
+
29
+ from .lyrasdxl_pipeline_base import LyraSDXLPipelineBase
30
+
31
+
32
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
33
+ """
34
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
35
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
36
+ """
37
+ std_text = noise_pred_text.std(
38
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
39
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
40
+ # rescale the results from guidance (fixes overexposure)
41
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
42
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
43
+ noise_cfg = guidance_rescale * noise_pred_rescaled + \
44
+ (1 - guidance_rescale) * noise_cfg
45
+ return noise_cfg
46
+
47
+ def retrieve_latents(
48
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
49
+ ):
50
+ if sample_mode == "sample":
51
+ return encoder_output.sample(generator)
52
+ elif sample_mode == "argmax":
53
+ return encoder_output.mode()
54
+ else:
55
+ return encoder_output
56
+
57
+
58
+ def retrieve_timesteps(
59
+ scheduler,
60
+ num_inference_steps: Optional[int] = None,
61
+ device: Optional[Union[str, torch.device]] = None,
62
+ timesteps: Optional[List[int]] = None,
63
+ **kwargs,
64
+ ):
65
+ """
66
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
67
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
68
+
69
+ Args:
70
+ scheduler (`SchedulerMixin`):
71
+ The scheduler to get timesteps from.
72
+ num_inference_steps (`int`):
73
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
74
+ `timesteps` must be `None`.
75
+ device (`str` or `torch.device`, *optional*):
76
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
77
+ timesteps (`List[int]`, *optional*):
78
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
79
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
80
+ must be `None`.
81
+
82
+ Returns:
83
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
84
+ second element is the number of inference steps.
85
+ """
86
+ if timesteps is not None:
87
+ accepts_timesteps = "timesteps" in set(
88
+ inspect.signature(scheduler.set_timesteps).parameters.keys())
89
+ if not accepts_timesteps:
90
+ raise ValueError(
91
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
92
+ f" timestep schedules. Please check whether you are using the correct scheduler."
93
+ )
94
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
95
+ timesteps = scheduler.timesteps
96
+ num_inference_steps = len(timesteps)
97
+ else:
98
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
99
+ timesteps = scheduler.timesteps
100
+ return timesteps, num_inference_steps
101
+
102
+
103
+ class LyraSdXLTxt2ImgInpaintPipeline(LyraSDXLPipelineBase, StableDiffusionXLInpaintPipeline):
104
+ device = torch.device("cpu")
105
+ dtype = torch.float32
106
+
107
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.13025, num_channels_unet=9, num_channels_latents=4, requires_aesthetics_score: bool = False,
108
+ force_zeros_for_empty_prompt: bool = True) -> None:
109
+ self.register_to_config(
110
+ force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
111
+ self.register_to_config(
112
+ requires_aesthetics_score=requires_aesthetics_score)
113
+
114
+ super().__init__(device, dtype, num_channels_unet=num_channels_unet, num_channels_latents=num_channels_latents, vae_scale_factor=vae_scale_factor, vae_scaling_factor=vae_scaling_factor)
115
+
116
+
117
+ def encode_image(self, image, device, num_images_per_prompt):
118
+ dtype = next(self.image_encoder.parameters()).dtype
119
+ if not isinstance(image, torch.Tensor):
120
+ image = self.feature_extractor(
121
+ image, return_tensors="pt").pixel_values
122
+
123
+ image = image.to(device=device, dtype=dtype)
124
+ image_embeds = self.image_encoder(image).image_embeds
125
+ image_embeds = image_embeds.repeat_interleave(
126
+ num_images_per_prompt, dim=0)
127
+
128
+ uncond_image_embeds = torch.zeros_like(image_embeds)
129
+ return image_embeds, uncond_image_embeds
130
+
131
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
132
+ dtype = image.dtype
133
+ # if self.vae.config.force_upcast:
134
+ # image = image.float()
135
+ # self.vae.to(dtype=torch.float32)
136
+
137
+ if isinstance(generator, list):
138
+ image_latents = [
139
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
140
+ for i in range(image.shape[0])
141
+ ]
142
+ image_latents = torch.cat(image_latents, dim=0)
143
+ else:
144
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
145
+
146
+ image_latents = image_latents.to(dtype)
147
+ image_latents = self.vae.scaling_factor * image_latents
148
+
149
+ return image_latents
150
+
151
+ def _get_add_time_ids(
152
+ self,
153
+ original_size,
154
+ crops_coords_top_left,
155
+ target_size,
156
+ aesthetic_score,
157
+ negative_aesthetic_score,
158
+ negative_original_size,
159
+ negative_crops_coords_top_left,
160
+ negative_target_size,
161
+ dtype,
162
+ text_encoder_projection_dim=None,
163
+ ):
164
+ if self.config.requires_aesthetics_score:
165
+ add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
166
+ add_neg_time_ids = list(
167
+ negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
168
+ )
169
+ else:
170
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
171
+ add_neg_time_ids = list(negative_original_size + negative_crops_coords_top_left + negative_target_size)
172
+
173
+ passed_add_embed_dim = (
174
+ self.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
175
+ )
176
+ expected_add_embed_dim = self.add_embedding.linear_1.in_features
177
+
178
+ if (
179
+ expected_add_embed_dim > passed_add_embed_dim
180
+ and (expected_add_embed_dim - passed_add_embed_dim) == self.addition_time_embed_dim
181
+ ):
182
+ raise ValueError(
183
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
184
+ )
185
+ elif (
186
+ expected_add_embed_dim < passed_add_embed_dim
187
+ and (passed_add_embed_dim - expected_add_embed_dim) == self.addition_time_embed_dim
188
+ ):
189
+ raise ValueError(
190
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
191
+ )
192
+ elif expected_add_embed_dim != passed_add_embed_dim:
193
+ raise ValueError(
194
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
195
+ )
196
+
197
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
198
+ add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
199
+
200
+ return add_time_ids, add_neg_time_ids
201
+
202
+ def load_ip_adapter(self, dir_ip_adapter, ip_plus, image_encoder_path, num_ip_tokens, ip_projection_dim, dir_face_in=None, num_fp_tokens=1, fp_projection_dim=None, sdxl=True):
203
+ self.ip_adapter_helper = LyraIPAdapter(self, sdxl, "cuda", dir_ip_adapter, ip_plus, image_encoder_path,
204
+ num_ip_tokens, ip_projection_dim, dir_face_in, num_fp_tokens, fp_projection_dim)
205
+
206
+ @torch.no_grad()
207
+ def __call__(
208
+ self,
209
+ prompt: Union[str, List[str]] = None,
210
+ prompt_2: Optional[Union[str, List[str]]] = None,
211
+ image: PipelineImageInput = None,
212
+ mask_image: PipelineImageInput = None,
213
+ masked_image_latents: torch.FloatTensor = None,
214
+ height: Optional[int] = None,
215
+ width: Optional[int] = None,
216
+ strength: float = 0.9999,
217
+ num_inference_steps: int = 50,
218
+ timesteps: List[int] = None,
219
+ denoising_start: Optional[float] = None,
220
+ denoising_end: Optional[float] = None,
221
+ guidance_scale: float = 7.5,
222
+ negative_prompt: Optional[Union[str, List[str]]] = None,
223
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
224
+ num_images_per_prompt: Optional[int] = 1,
225
+ eta: float = 0.0,
226
+ generator: Optional[Union[torch.Generator,
227
+ List[torch.Generator]]] = None,
228
+ latents: Optional[torch.FloatTensor] = None,
229
+ prompt_embeds: Optional[torch.FloatTensor] = None,
230
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
231
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
232
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
233
+ output_type: Optional[str] = "pil",
234
+ return_dict: bool = True,
235
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
236
+ guidance_rescale: float = 0.0,
237
+ original_size: Tuple[int, int] = None,
238
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
239
+ target_size: Tuple[int, int] = None,
240
+ negative_original_size: Optional[Tuple[int, int]] = None,
241
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
242
+ negative_target_size: Optional[Tuple[int, int]] = None,
243
+ aesthetic_score: float = 6.0,
244
+ negative_aesthetic_score: float = 2.5,
245
+ clip_skip: Optional[int] = None,
246
+ extra_tensor_dict: Optional[Dict[str, torch.FloatTensor]] = {},
247
+ param_scale_dict: Optional[Dict[str, int]] = {},
248
+ **kwargs
249
+ ):
250
+
251
+ callback = kwargs.pop("callback", None)
252
+ callback_steps = kwargs.pop("callback_steps", None)
253
+
254
+ # 0. Default height and width to unet
255
+ height = height or self.default_sample_size * self.vae_scale_factor
256
+ width = width or self.default_sample_size * self.vae_scale_factor
257
+
258
+ original_size = original_size or (height, width)
259
+ target_size = target_size or (height, width)
260
+
261
+ self._guidance_scale = guidance_scale
262
+ self._guidance_rescale = guidance_rescale
263
+ self._clip_skip = clip_skip
264
+ self._cross_attention_kwargs = cross_attention_kwargs
265
+ self._denoising_end = denoising_end
266
+ self._denoising_start = denoising_start
267
+
268
+ # 1. Check inputs. Raise error if not correct
269
+ self.check_inputs(
270
+ prompt,
271
+ prompt_2,
272
+ height,
273
+ width,
274
+ strength,
275
+ callback_steps,
276
+ negative_prompt,
277
+ negative_prompt_2,
278
+ prompt_embeds,
279
+ negative_prompt_embeds,
280
+ )
281
+
282
+ # 2. Define call parameters
283
+ if prompt is not None and isinstance(prompt, str):
284
+ batch_size = 1
285
+ elif prompt is not None and isinstance(prompt, list):
286
+ batch_size = len(prompt)
287
+ else:
288
+ batch_size = prompt_embeds.shape[0]
289
+
290
+ device = self._execution_device
291
+
292
+ do_classifier_free_guidance = guidance_scale > 1.0
293
+
294
+ # 3. Encode input prompt
295
+ text_encoder_lora_scale = (
296
+ cross_attention_kwargs.get(
297
+ "scale", None) if cross_attention_kwargs is not None else None
298
+ )
299
+ (
300
+ prompt_embeds,
301
+ negative_prompt_embeds,
302
+ pooled_prompt_embeds,
303
+ negative_pooled_prompt_embeds,
304
+ ) = self.encode_prompt(
305
+ prompt=prompt,
306
+ prompt_2=prompt_2,
307
+ device=device,
308
+ num_images_per_prompt=num_images_per_prompt,
309
+ do_classifier_free_guidance=do_classifier_free_guidance,
310
+ negative_prompt=negative_prompt,
311
+ negative_prompt_2=negative_prompt_2,
312
+ prompt_embeds=prompt_embeds,
313
+ negative_prompt_embeds=negative_prompt_embeds,
314
+ pooled_prompt_embeds=pooled_prompt_embeds,
315
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
316
+ lora_scale=text_encoder_lora_scale,
317
+ clip_skip=clip_skip
318
+ )
319
+
320
+ def denoising_value_valid(dnv):
321
+ return isinstance(dnv, float) and 0 < dnv < 1
322
+
323
+ # 4. Prepare timesteps
324
+ timesteps, num_inference_steps = retrieve_timesteps(
325
+ self.scheduler, num_inference_steps, device, timesteps)
326
+ timesteps, num_inference_steps = self.get_timesteps(
327
+ num_inference_steps,
328
+ strength,
329
+ device,
330
+ denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
331
+ )
332
+
333
+ latent_timestep = timesteps[:1].repeat(
334
+ batch_size * num_images_per_prompt)
335
+ is_strength_max = strength == 1.0
336
+
337
+ # 5. Prepare latent variables
338
+
339
+ init_image = self.image_processor.preprocess(
340
+ image, height=height, width=width)
341
+ init_image = init_image.to(dtype=torch.float32)
342
+
343
+ mask = self.mask_processor.preprocess(
344
+ mask_image, height=height, width=width)
345
+
346
+ if masked_image_latents is not None:
347
+ masked_image = masked_image_latents
348
+ elif init_image.shape[1] == 4:
349
+ # if images are in latent space, we can't mask it
350
+ masked_image = None
351
+ else:
352
+ masked_image = init_image * (mask < 0.5)
353
+
354
+ add_noise = True if self.denoising_start is None else False
355
+
356
+ return_image_latents = self.num_channels_unet == 4
357
+
358
+ latents_outputs = self.prepare_latents(
359
+ batch_size * num_images_per_prompt,
360
+ self.num_channels_latents,
361
+ height,
362
+ width,
363
+ prompt_embeds.dtype,
364
+ device,
365
+ generator,
366
+ latents,
367
+ image=init_image,
368
+ timestep=latent_timestep,
369
+ is_strength_max=is_strength_max,
370
+ add_noise=add_noise,
371
+ return_noise=True,
372
+ return_image_latents=return_image_latents,
373
+ )
374
+
375
+ if return_image_latents:
376
+ latents, noise, image_latents = latents_outputs
377
+ else:
378
+ latents, noise = latents_outputs
379
+
380
+ mask, masked_image_latents = self.prepare_mask_latents(
381
+ mask,
382
+ masked_image,
383
+ batch_size * num_images_per_prompt,
384
+ height,
385
+ width,
386
+ prompt_embeds.dtype,
387
+ device,
388
+ generator,
389
+ do_classifier_free_guidance,
390
+ )
391
+
392
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
393
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
394
+
395
+ # 7. Prepare added time ids & embeddings
396
+ add_text_embeds = pooled_prompt_embeds
397
+ if self.text_encoder_2 is None:
398
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
399
+ else:
400
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
401
+
402
+ if negative_original_size is None:
403
+ negative_original_size = original_size
404
+ if negative_target_size is None:
405
+ negative_target_size = target_size
406
+
407
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids(
408
+ original_size,
409
+ crops_coords_top_left,
410
+ target_size,
411
+ aesthetic_score,
412
+ negative_aesthetic_score,
413
+ negative_original_size,
414
+ negative_crops_coords_top_left,
415
+ negative_target_size,
416
+ dtype=prompt_embeds.dtype,
417
+ text_encoder_projection_dim=text_encoder_projection_dim,
418
+ )
419
+ add_time_ids = add_time_ids.repeat(
420
+ batch_size * num_images_per_prompt, 1)
421
+
422
+ if do_classifier_free_guidance:
423
+ prompt_embeds = torch.cat(
424
+ [negative_prompt_embeds, prompt_embeds], dim=0)
425
+ add_text_embeds = torch.cat(
426
+ [negative_pooled_prompt_embeds, add_text_embeds], dim=0)
427
+ add_neg_time_ids = add_neg_time_ids.repeat(
428
+ batch_size * num_images_per_prompt, 1)
429
+ add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
430
+
431
+ prompt_embeds = prompt_embeds.to(device)
432
+ add_text_embeds = add_text_embeds.to(device)
433
+ add_time_ids = add_time_ids.to(device)
434
+
435
+ # 8. Denoising loop
436
+ num_warmup_steps = max(
437
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0)
438
+
439
+ # 7.1 Apply denoising_end
440
+ if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1:
441
+ discrete_timestep_cutoff = int(
442
+ round(
443
+ self.scheduler.config.num_train_timesteps
444
+ - (denoising_end * self.scheduler.config.num_train_timesteps)
445
+ )
446
+ )
447
+ num_inference_steps = len(
448
+ list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
449
+ timesteps = timesteps[:num_inference_steps]
450
+
451
+ aug_emb = self._get_aug_emb(
452
+ add_time_ids, add_text_embeds, prompt_embeds.dtype)
453
+
454
+ extra_tensor_dict2 = {}
455
+ for name in extra_tensor_dict:
456
+ if name in ["fp_hidden_states", "ip_hidden_states"]:
457
+ v1, v2 = extra_tensor_dict[name][0], extra_tensor_dict[name][1]
458
+ extra_tensor_dict2[name] = torch.cat(
459
+ [v1.repeat(num_images_per_prompt, 1, 1), v2.repeat(num_images_per_prompt, 1, 1)])
460
+ else:
461
+ extra_tensor_dict2[name] = extra_tensor_dict[name]
462
+
463
+ # np.save("/workspace/prompt_embeds.npy", prompt_embeds.detach().cpu().numpy())
464
+ # prompt_embeds = torch.from_numpy(np.load("/workspace/gt_prompt_embeds.npy")).cuda()
465
+ self._num_timesteps = len(timesteps)
466
+
467
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
468
+ for i, t in enumerate(timesteps):
469
+ # expand the latents if we are doing classifier free guidance
470
+ latent_model_input = torch.cat(
471
+ [latents] * 2) if do_classifier_free_guidance else latents
472
+
473
+ latent_model_input = self.scheduler.scale_model_input(
474
+ latent_model_input, t)
475
+
476
+ if self.num_channels_unet == 9:
477
+ latent_model_input = torch.cat(
478
+ [latent_model_input, mask, masked_image_latents], dim=1)
479
+
480
+ latent_model_input = latent_model_input.permute(
481
+ 0, 2, 3, 1).contiguous()
482
+
483
+ noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, aug_emb, None, None,
484
+ None, None, None, extra_tensor_dict2, param_scale_dict).permute(0, 3, 1, 2).contiguous()
485
+
486
+ # perform guidance
487
+ if do_classifier_free_guidance:
488
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
489
+ noise_pred = noise_pred_uncond + self.guidance_scale * \
490
+ (noise_pred_text - noise_pred_uncond)
491
+
492
+ if do_classifier_free_guidance and self.guidance_rescale > 0.0:
493
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
494
+ noise_pred = rescale_noise_cfg(
495
+ noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
496
+
497
+ # compute the previous noisy sample x_t -> x_t-1
498
+ latents = self.scheduler.step(
499
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
500
+
501
+ if self.num_channels_unet == 4:
502
+ init_latents_proper = image_latents
503
+ if do_classifier_free_guidance:
504
+ init_mask, _ = mask.chunk(2)
505
+ else:
506
+ init_mask = mask
507
+
508
+ if i < len(timesteps) - 1:
509
+ noise_timestep = timesteps[i + 1]
510
+ init_latents_proper = self.scheduler.add_noise(
511
+ init_latents_proper, noise, torch.tensor(
512
+ [noise_timestep])
513
+ )
514
+
515
+ latents = (1 - init_mask) * \
516
+ init_latents_proper + init_mask * latents
517
+
518
+ # call the callback, if provided
519
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
520
+ progress_bar.update()
521
+ if callback is not None and i % callback_steps == 0:
522
+ callback(i, t, latents)
523
+
524
+ if output_type == "latent":
525
+ return latents
526
+
527
+ image = self.vae.decode(1 / self.vae.scaling_factor * latents)
528
+ image = self.image_processor.postprocess(
529
+ image, output_type=output_type)
530
+
531
+ # Offload last model to CPU
532
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
533
+ self.final_offload_hook.offload()
534
+
535
+ return image
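A rough usage sketch for the new SDXL inpainting pipeline. The paths and file names are placeholders, an inpainting UNet with 9 input channels is assumed (matching the num_channels_unet=9 default), and the import assumes lyrasd_model exports the class:

import torch
from PIL import Image
from lyrasd_model import LyraSdXLTxt2ImgInpaintPipeline  # assumed export

pipe = LyraSdXLTxt2ImgInpaintPipeline()
pipe.reload_pipe("./models/lyrasd_xl_inpaint")     # hypothetical converted inpainting model dir

init_image = Image.open("./images/input.png").convert("RGB")
mask_image = Image.open("./images/mask.png").convert("L")  # white pixels are repainted

images = pipe(
    prompt="a wooden bench in a park, autumn leaves",
    image=init_image,
    mask_image=mask_image,
    height=1024, width=1024,
    strength=0.85,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator().manual_seed(123),
)
images[0].save("out_sdxl_inpaint.png")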
lyrasd_model/lyrasdxl_txt2img_pipeline.py ADDED
@@ -0,0 +1,267 @@
1
+ import inspect
2
+ import os
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
+
6
+ import gc
7
+ import torch
8
+ import numpy as np
9
+ from glob import glob
10
+
11
+ from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel
12
+ from diffusers.loaders import TextualInversionLoaderMixin
13
+ from diffusers.image_processor import VaeImageProcessor
14
+ from diffusers.models import AutoencoderKL
15
+ from diffusers.schedulers import (DPMSolverMultistepScheduler,
16
+ EulerAncestralDiscreteScheduler,
17
+ EulerDiscreteScheduler,
18
+ KarrasDiffusionSchedulers)
19
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
20
+ from diffusers.utils.torch_utils import randn_tensor
21
+ from diffusers.utils import logging
22
+ from PIL import Image
23
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
24
+ from .lyrasd_vae_model import LyraSdVaeModel
25
+ from .module.lyrasd_ip_adapter import LyraIPAdapter
26
+ from .lora_util import add_text_lora_layer, add_xltext_lora_layer, add_lora_to_opt_model, load_state_dict
27
+ from safetensors.torch import load_file
28
+ from .lyrasdxl_pipeline_base import LyraSDXLPipelineBase
29
+
30
+
31
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
32
+ """
33
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
34
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
35
+ """
36
+ std_text = noise_pred_text.std(
37
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
38
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
39
+ # rescale the results from guidance (fixes overexposure)
40
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
41
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
42
+ noise_cfg = guidance_rescale * noise_pred_rescaled + \
43
+ (1 - guidance_rescale) * noise_cfg
44
+ return noise_cfg
45
+
46
+
47
+ class LyraSdXLTxt2ImgPipeline(LyraSDXLPipelineBase, StableDiffusionXLPipeline):
48
+ device = torch.device("cpu")
49
+ dtype = torch.float32
50
+
51
+ def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.13025) -> None:
52
+ self.register_to_config(force_zeros_for_empty_prompt=True)
53
+
54
+ super().__init__(device, dtype, vae_scale_factor=vae_scale_factor, vae_scaling_factor=vae_scaling_factor)
55
+
56
+ @torch.no_grad()
57
+ def __call__(
58
+ self,
59
+ prompt: Union[str, List[str]] = None,
60
+ prompt_2: Optional[Union[str, List[str]]] = None,
61
+ height: Optional[int] = None,
62
+ width: Optional[int] = None,
63
+ num_inference_steps: int = 50,
64
+ denoising_end: Optional[float] = None,
65
+ guidance_scale: float = 5.0,
66
+ negative_prompt: Optional[Union[str, List[str]]] = None,
67
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
68
+ num_images_per_prompt: Optional[int] = 1,
69
+ eta: float = 0.0,
70
+ generator: Optional[Union[torch.Generator,
71
+ List[torch.Generator]]] = None,
72
+ latents: Optional[torch.FloatTensor] = None,
73
+ prompt_embeds: Optional[torch.FloatTensor] = None,
74
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
75
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
76
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
77
+ output_type: Optional[str] = "pil",
78
+ return_dict: bool = True,
79
+ callback: Optional[Callable[[
80
+ int, int, torch.FloatTensor], None]] = None,
81
+ callback_steps: int = 1,
82
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
83
+ guidance_rescale: float = 0.0,
84
+ original_size: Optional[Tuple[int, int]] = None,
85
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
86
+ target_size: Optional[Tuple[int, int]] = None,
87
+ extra_tensor_dict: Optional[Dict[str, torch.FloatTensor]] = {},
88
+ param_scale_dict: Optional[Dict[str, int]] = {},
89
+ clip_skip: Optional[int] = None
90
+ ):
91
+
92
+ # 0. Default height and width to unet
93
+ height = height or self.default_sample_size * self.vae_scale_factor
94
+ width = width or self.default_sample_size * self.vae_scale_factor
95
+
96
+ original_size = original_size or (height, width)
97
+ target_size = target_size or (height, width)
98
+
99
+ # 1. Check inputs. Raise error if not correct
100
+ self.check_inputs(
101
+ prompt,
102
+ prompt_2,
103
+ height,
104
+ width,
105
+ callback_steps,
106
+ negative_prompt,
107
+ negative_prompt_2,
108
+ prompt_embeds,
109
+ negative_prompt_embeds,
110
+ pooled_prompt_embeds,
111
+ negative_pooled_prompt_embeds,
112
+ )
113
+
114
+ # 2. Define call parameters
115
+ if prompt is not None and isinstance(prompt, str):
116
+ batch_size = 1
117
+ elif prompt is not None and isinstance(prompt, list):
118
+ batch_size = len(prompt)
119
+ else:
120
+ batch_size = prompt_embeds.shape[0]
121
+
122
+ device = self._execution_device
123
+
124
+ do_classifier_free_guidance = guidance_scale > 1.0
125
+
126
+ # 3. Encode input prompt
127
+ text_encoder_lora_scale = (
128
+ cross_attention_kwargs.get(
129
+ "scale", None) if cross_attention_kwargs is not None else None
130
+ )
131
+ (
132
+ prompt_embeds,
133
+ negative_prompt_embeds,
134
+ pooled_prompt_embeds,
135
+ negative_pooled_prompt_embeds,
136
+ ) = self.encode_prompt(
137
+ prompt=prompt,
138
+ prompt_2=prompt_2,
139
+ device=device,
140
+ num_images_per_prompt=num_images_per_prompt,
141
+ do_classifier_free_guidance=do_classifier_free_guidance,
142
+ negative_prompt=negative_prompt,
143
+ negative_prompt_2=negative_prompt_2,
144
+ prompt_embeds=prompt_embeds,
145
+ negative_prompt_embeds=negative_prompt_embeds,
146
+ pooled_prompt_embeds=pooled_prompt_embeds,
147
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
148
+ lora_scale=text_encoder_lora_scale,
149
+ clip_skip=clip_skip
150
+ )
151
+
152
+ # 4. Prepare timesteps
153
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
154
+
155
+ timesteps = self.scheduler.timesteps
156
+
157
+ # 5. Prepare latent variables
158
+ num_channels_latents = self.unet_in_channels
159
+ latents = self.prepare_latents(
160
+ batch_size * num_images_per_prompt,
161
+ num_channels_latents,
162
+ height,
163
+ width,
164
+ prompt_embeds.dtype,
165
+ device,
166
+ generator,
167
+ latents,
168
+ )
169
+
170
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
171
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
172
+
173
+ # 7. Prepare added time ids & embeddings
174
+ add_text_embeds = pooled_prompt_embeds
175
+ add_time_ids = list(
176
+ original_size + crops_coords_top_left + target_size)
177
+ add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
178
+
179
+ if do_classifier_free_guidance:
180
+ prompt_embeds = torch.cat(
181
+ [negative_prompt_embeds, prompt_embeds], dim=0)
182
+ add_text_embeds = torch.cat(
183
+ [negative_pooled_prompt_embeds, add_text_embeds], dim=0)
184
+ add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
185
+
186
+ prompt_embeds = prompt_embeds.to(device)
187
+ add_text_embeds = add_text_embeds.to(device)
188
+ add_time_ids = add_time_ids.to(device).repeat(
189
+ batch_size * num_images_per_prompt, 1)
190
+
191
+ # 8. Denoising loop
192
+ num_warmup_steps = max(
193
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0)
194
+
195
+ # 7.1 Apply denoising_end
196
+ if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1:
197
+ discrete_timestep_cutoff = int(
198
+ round(
199
+ self.scheduler.config.num_train_timesteps
200
+ - (denoising_end * self.scheduler.config.num_train_timesteps)
201
+ )
202
+ )
203
+ num_inference_steps = len(
204
+ list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
205
+ timesteps = timesteps[:num_inference_steps]
206
+
207
+ aug_emb = self._get_aug_emb(
208
+ add_time_ids, add_text_embeds, prompt_embeds.dtype)
209
+
210
+ extra_tensor_dict2 = {}
211
+ for name in extra_tensor_dict:
212
+ if name in ["fp_hidden_states", "ip_hidden_states"]:
213
+ v1, v2 = extra_tensor_dict[name][0], extra_tensor_dict[name][1]
214
+ extra_tensor_dict2[name] = torch.cat(
215
+ [v1.repeat(num_images_per_prompt, 1, 1), v2.repeat(num_images_per_prompt, 1, 1)])
216
+ else:
217
+ extra_tensor_dict2[name] = extra_tensor_dict[name]
218
+
219
+ # np.save("/workspace/prompt_embeds.npy", prompt_embeds.detach().cpu().numpy())
220
+ # prompt_embeds = torch.from_numpy(np.load("/workspace/gt_prompt_embeds.npy")).cuda()
221
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
222
+ for i, t in enumerate(timesteps):
223
+ # expand the latents if we are doing classifier free guidance
224
+ latent_model_input = torch.cat(
225
+ [latents] * 2) if do_classifier_free_guidance else latents
226
+
227
+ latent_model_input = self.scheduler.scale_model_input(
228
+ latent_model_input, t)
229
+ latent_model_input = latent_model_input.permute(
230
+ 0, 2, 3, 1).contiguous()
231
+
232
+ noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, aug_emb, None, None,
233
+ None, None, None, extra_tensor_dict2, param_scale_dict).permute(0, 3, 1, 2).contiguous()
234
+
235
+ # perform guidance
236
+ if do_classifier_free_guidance:
237
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
238
+ noise_pred = noise_pred_uncond + guidance_scale * \
239
+ (noise_pred_text - noise_pred_uncond)
240
+
241
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
242
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
243
+ noise_pred = rescale_noise_cfg(
244
+ noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
245
+
246
+ # compute the previous noisy sample x_t -> x_t-1
247
+ latents = self.scheduler.step(
248
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
249
+
250
+ # call the callback, if provided
251
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
252
+ progress_bar.update()
253
+ if callback is not None and i % callback_steps == 0:
254
+ callback(i, t, latents)
255
+
256
+ if output_type == "latent":
257
+ return latents
258
+
259
+ image = self.vae.decode(1 / self.vae.scaling_factor * latents)
260
+ image = self.image_processor.postprocess(
261
+ image, output_type=output_type)
262
+
263
+ # Offload last model to CPU
264
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
265
+ self.final_offload_hook.offload()
266
+
267
+ return image
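
The `rescale_noise_cfg` helper added above follows Section 3.4 of the referenced paper: the guided prediction is rescaled so its per-sample standard deviation matches the text-conditioned branch, then blended back in by `guidance_rescale`. A minimal standalone sanity check with dummy tensors (the function body is copied from the helper above so the snippet runs on its own):

```python
import torch

def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    # match the std of the guided prediction to the text branch, then blend
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg

uncond = torch.randn(2, 4, 64, 64)
text = torch.randn(2, 4, 64, 64)
cfg = uncond + 7.5 * (text - uncond)                    # plain classifier-free guidance
out = rescale_noise_cfg(cfg, text, guidance_rescale=0.7)

assert torch.allclose(rescale_noise_cfg(cfg, text, 0.0), cfg)   # 0.0 leaves CFG unchanged
print(cfg.std().item(), out.std().item(), text.std().item())    # std is pulled toward the text branch
```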
lyrasd_model/{lyrasd_lib/placeholder.txt → module/__init__.py} RENAMED
File without changes
lyrasd_model/module/lyra_tool.py ADDED
@@ -0,0 +1,5 @@
1
+ import yaml
2
+
3
+ def load_yaml(cfg_path):
4
+ with open(cfg_path, 'r', encoding='utf-8') as f:
5
+ return yaml.safe_load(f)
lyrasd_model/module/lyrasd_ip_adapter.py ADDED
@@ -0,0 +1,289 @@
1
+ import os, sys
2
+ from typing import List
3
+
4
+ import torch
5
+ from diffusers import StableDiffusionPipeline
6
+ from diffusers.pipelines.controlnet import MultiControlNetModel
7
+ from diffusers.models.embeddings import ImageProjection
8
+ from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
9
+ from PIL import Image
10
+ from typing import Any, Callable, Dict, List, Optional, Union
11
+ from copy import deepcopy
12
+ import time
13
+ sys.path.append(os.path.dirname(__file__))
14
+ from resampler import Resampler
15
+ from diffusers import DiffusionPipeline
16
+ import numpy as np
17
+ # sys.path.append(os.environ['LYRASD_WORKDIR'] + "/tests/utils")
18
+ from .tools import get_mem_use
19
+
20
+ class ImageProjModel(torch.nn.Module):
21
+ """Projection Model"""
22
+ def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
23
+ super().__init__()
24
+
25
+ self.cross_attention_dim = cross_attention_dim
26
+ self.clip_extra_context_tokens = clip_extra_context_tokens
27
+ self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
28
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
29
+
30
+ def forward(self, image_embeds):
31
+ embeds = image_embeds
32
+ clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
33
+ clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
34
+ return clip_extra_context_tokens
35
+
36
+
37
+ class LyraIPAdapter:
38
+ def __init__(
39
+ self,
40
+ sd_pipe,
41
+ sdxl,
42
+ device,
43
+ ip_ckpt=None,
44
+ ip_plus=False,
45
+ image_encoder_path=None,
46
+ num_ip_tokens=4,
47
+ ip_projection_dim=None,
48
+ fp_ckpt=None,
49
+ num_fp_tokens=1,
50
+ fp_projection_dim=None,
51
+ ):
52
+ self.pipe = sd_pipe
53
+ self.device = device
54
+ self.fp_ckpt = fp_ckpt
55
+ self.ip_ckpt = ip_ckpt
56
+ self.num_fp_tokens = num_fp_tokens
57
+ self.num_ip_tokens = num_ip_tokens
58
+ self.fp_projection_dim = fp_projection_dim
59
+ self.ip_projection_dim = ip_projection_dim
60
+ self.sdxl = sdxl
61
+ self.ip_plus = ip_plus
62
+ self.cross_attention_dim = 2048
63
+ # self.pipe = sd_pipe.to(self.device)
64
+ # self.set_ip_adapter()
65
+
66
+ if image_encoder_path:
67
+ self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(image_encoder_path).to(self.device, dtype=torch.float16)
68
+ self.clip_image_processor = CLIPImageProcessor()
69
+ self.projection_dim = self.image_encoder.config.projection_dim
70
+
71
+ # image proj model
72
+ if self.ip_ckpt:
73
+ if self.ip_plus:
74
+ proj_heads = 20 if self.sdxl else 12
75
+ self.image_proj_model = self.init_proj_plus(proj_heads, self.num_ip_tokens)
76
+ else:
77
+ self.image_proj_model = self.init_proj(self.ip_projection_dim, self.num_ip_tokens)
78
+
79
+ # face proj model
80
+ if self.fp_ckpt:
81
+ self.face_proj_model = self.init_proj(self.fp_projection_dim, self.num_fp_tokens)
82
+
83
+ self.load_ip_adapter()
84
+
85
+ def init_proj_diffuser(self, state_dict):
86
+ # variant that loads the diffusers-format image_proj weights
87
+ clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
88
+ cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4
89
+
90
+ image_proj_model = ImageProjection(
91
+ cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4
92
+ ).to(dtype=self.dtype, device=self.device)
93
+ return image_proj_model
94
+
95
+ # init_proj / init_proj_plus follow the FaceIn implementation
96
+ def init_proj(self, projection_dim, num_tokens):
97
+ image_proj_model = ImageProjModel(
98
+ cross_attention_dim=self.cross_attention_dim,
99
+ clip_embeddings_dim=projection_dim,
100
+ clip_extra_context_tokens=num_tokens,
101
+ ).to(self.device, dtype=torch.float16)
102
+ return image_proj_model
103
+
104
+
105
+ def init_proj_plus(self, heads, num_tokens):
106
+ image_proj_model = Resampler(
107
+ dim=1280,
108
+ depth=4,
109
+ dim_head=64,
110
+ heads=heads,
111
+ num_queries=num_tokens,
112
+ embedding_dim=self.image_encoder.config.hidden_size,
113
+ output_dim=self.cross_attention_dim,
114
+ ff_mult=4,
115
+ ).to(self.device, dtype=torch.float16)
116
+ return image_proj_model
117
+
118
+ def load_ip_adapter(self):
119
+ unet = self.pipe.unet
120
+
121
+ def parse_ckpt_path(ckpt):
122
+ ll = ckpt.split("/")
123
+ weight_name = ll[-1]
124
+ subfolder = ll[-2]
125
+ pretrained_path = "/".join(ll[:-2])
126
+ return pretrained_path, subfolder, weight_name
127
+
128
+ if self.ip_ckpt:
129
+ state_dict = torch.load(self.ip_ckpt, map_location="cpu")
130
+ self.image_proj_model.load_state_dict(state_dict["image_proj"])
131
+ pretrained_path, subfolder, weight_name = parse_ckpt_path(self.ip_ckpt)
132
+ dir_ipadapter = os.path.join(pretrained_path, "lyra_tran", subfolder, '.'.join(weight_name.split(".")[:-1]))
133
+ unet.load_ip_adapter(dir_ipadapter, "", 1, "fp16")
134
+
135
+ if self.fp_ckpt:
136
+ state_dict = torch.load(self.fp_ckpt, map_location="cpu")
137
+ self.face_proj_model.load_state_dict(state_dict["face_proj"])
138
+ pretrained_path, subfolder, weight_name = parse_ckpt_path(self.fp_ckpt)
139
+ dir_ipadapter = os.path.join(pretrained_path, "lyra_tran", subfolder, '.'.join(weight_name.split(".")[:-1]))
140
+ unet.load_facein(dir_ipadapter, "fp16")
141
+
142
+ @torch.inference_mode()
143
+ def get_image_embeds(self, image=None, face_emb=None):
144
+ image_prompt_embeds, uncond_image_prompt_embeds = None, None
145
+
146
+ if image is not None:
147
+ if not isinstance(image, list):
148
+ image = [image]
149
+ clip_image = self.clip_image_processor(images=image, return_tensors="pt").pixel_values
150
+ clip_image = clip_image.to(self.device, dtype=torch.float16)
151
+ if self.ip_plus:
152
+ clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
153
+ uncond_clip_image_embeds = self.image_encoder(
154
+ torch.zeros_like(clip_image), output_hidden_states=True
155
+ ).hidden_states[-2]
156
+ else:
157
+ clip_image_embeds = self.image_encoder(clip_image).image_embeds
158
+ uncond_clip_image_embeds = torch.zeros_like(clip_image_embeds)
159
+ clip_image_prompt_embeds = self.image_proj_model(clip_image_embeds)
160
+ uncond_clip_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
161
+ image_prompt_embeds = clip_image_prompt_embeds
162
+ uncond_image_prompt_embeds = uncond_clip_image_prompt_embeds
163
+
164
+ if face_emb is not None:
165
+ face_embeds = face_emb.to(self.device, dtype=torch.float16)
166
+ face_prompt_embeds = self.face_proj_model(face_embeds)
167
+ uncond_face_prompt_embeds = self.face_proj_model(torch.zeros_like(face_embeds))
168
+ if image_prompt_embeds is None:
169
+ image_prompt_embeds = face_prompt_embeds
170
+ uncond_image_prompt_embeds = uncond_face_prompt_embeds
171
+ else:
172
+ image_prompt_embeds = torch.cat([face_prompt_embeds, image_prompt_embeds], axis=1)
173
+ uncond_image_prompt_embeds = torch.cat([uncond_face_prompt_embeds, uncond_image_prompt_embeds], dim=1)
174
+
175
+ return image_prompt_embeds, uncond_image_prompt_embeds
176
+
177
+ @torch.inference_mode()
178
+ def get_image_embeds_lyrasd(self, image=None, ip_image_embeds=None, face_emb=None, batch_size = 1, ip_scale=1.0, fp_scale=1.0, do_classifier_free_guidance=True):
179
+ dict_tensor = {}
180
+
181
+ if self.ip_ckpt and ip_scale>0:
182
+ if ip_image_embeds is not None:
183
+ dict_tensor["ip_hidden_states"] = ip_image_embeds
184
+ elif image is not None:
185
+ if not isinstance(image, list):
186
+ image = [image]
187
+ clip_image = self.clip_image_processor(images=image, return_tensors="pt").pixel_values
188
+ clip_image = clip_image.to(self.device, dtype=torch.float16)
189
+ if self.ip_plus:
190
+ clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
191
+ uncond_clip_image_embeds = self.image_encoder(
192
+ torch.zeros_like(clip_image), output_hidden_states=True
193
+ ).hidden_states[-2]
194
+ else:
195
+ clip_image_embeds = self.image_encoder(clip_image).image_embeds
196
+ uncond_clip_image_embeds = torch.zeros_like(clip_image_embeds)
197
+
198
+ if do_classifier_free_guidance:
199
+ clip_image_embeds = torch.cat([uncond_clip_image_embeds, clip_image_embeds])
200
+ ip_image_embeds = self.image_proj_model(clip_image_embeds)
201
+ dict_tensor["ip_hidden_states"] = ip_image_embeds
202
+
203
+ if face_emb is not None and self.fp_ckpt and ip_scale>0:
204
+ face_embeds = face_emb.to(self.device, dtype=torch.float16)
205
+ face_prompt_embeds = self.face_proj_model(face_embeds)
206
+ uncond_face_prompt_embeds = self.face_proj_model(torch.zeros_like(face_embeds))
207
+ if do_classifier_free_guidance:
208
+ fp_image_embeds = torch.cat([uncond_face_prompt_embeds, face_prompt_embeds])
209
+ else:
210
+ fp_image_embeds = face_prompt_embeds
211
+ dict_tensor["fp_hidden_states"] = fp_image_embeds
212
+ return dict_tensor
213
+
214
+
215
+ if __name__ == "__main__":
216
+ sys.path.append("/data/home/kiokaxiao/repos/LyraSD/python/lyrasd")
217
+ from lyrasd_model import LyraSdXLTxt2ImgPipeline
218
+
219
+ model_path = "/data/SharedModels/SD/checkpoints/stable-diffusion-xl-base-1.0/"
220
+ # model_path = "/cfs-datasets/projects/VirtualIdol/models/base_model/sdxl/xxmix9realisticsdxlV1"
221
+ lib_path = os.environ.get("LIBLYRASD_SO")
222
+
223
+ dir_ip_adapter = "/cfs-datasets/projects/VirtualIdol/models/ip_adapter/sdxl_models/ip-adapter-plus_sdxl_vit-h.bin"
224
+ dir_facein = "/cfs-datasets/projects/VirtualIdol/models/FaceIn/v1/FaceIn_sdxl.bin"
225
+ image_encoder_path = "/cfs-datasets/projects/VirtualIdol/models/ip_adapter/models/image_encoder"
226
+
227
+ pipeline = LyraSdXLTxt2ImgPipeline(model_path, lib_path)
228
+ pipeline.load_ip_adapter(dir_ip_adapter, True, image_encoder_path, 16,1024, dir_facein, 1, 512)
229
+ # pipeline.load_ip_adapter(dir_ip_adapter, True, image_encoder_path, 16,1024, "", 1, 512)
230
+
231
+ face_emb = np.load("/data/home/kiokaxiao/repos/VidolImageDraw/girl.npy")
232
+ face_emb = torch.Tensor(face_emb.reshape([1,-1]))
233
+ ip_image = Image.open("/data/home/kiokaxiao/repos/VidolImageDraw/images/input_image.png").convert('RGB')
234
+
235
+ generator = torch.Generator("cuda").manual_seed(123)
236
+ batches = [2]
237
+ sizes = [[512, 512], [768, 768], [1024, 1024]]
238
+ # sizes = [[832, 640]]
239
+ # sizes = [[1024, 1024]]
240
+ running_cnt = 1
241
+ do_bench = False
242
+
243
+ ip_ratio = 1
244
+ facein_ratio = 0.6
245
+ extra_tensor_dict = {}
246
+ extra_tensor_dict = pipeline.ip_adapter_helper.get_image_embeds_lyrasd(ip_image, None, face_emb, batches[0], ip_ratio, facein_ratio)
247
+ param_scale_dict = {"facein_ratio": facein_ratio, "ip_ratio": ip_ratio}
248
+ draw_cfg = {'width': 640,
249
+ 'num_inference_steps': 30,
250
+ 'height': 832,
251
+ 'negative_prompt': '(worst quality, low quality, 3d, 2d, cartoons, sketch), tooth, open mouth',
252
+ 'guidance_scale': 7,
253
+ 'prompt': 'xxmixgirl, masterpiece, best quality, 1girl, solo, looking at viewer, simple background, hair ornament, black eyes, portrait',
254
+ 'output_type': 'pil',
255
+ 'extra_tensor_dict': extra_tensor_dict,
256
+ "param_scale_dict": param_scale_dict}
257
+
258
+
259
+ def warmup(draw_cfg):
260
+ draw_cfg_wm = deepcopy(draw_cfg)
261
+ draw_cfg_wm['num_inference_steps'] = 1
262
+ pipeline(**draw_cfg_wm, generator= generator)
263
+
264
+ if not do_bench:
265
+ images = pipeline(**draw_cfg, generator= generator)
266
+ else:
267
+ for batch in batches:
268
+ for height, width in sizes:
269
+ draw_cfg['width'] = width
270
+ draw_cfg['height'] = height
271
+ draw_cfg['num_images_per_prompt'] = batch
272
+ draw_cfg["num_inference_steps"] = 20
273
+ warmup(draw_cfg)
274
+ time_uses = []
275
+ for x in range(running_cnt):
276
+ start = time.perf_counter()
277
+ draw_cfg['num_images_per_prompt'] = batch
278
+ generator = torch.Generator("cuda").manual_seed(123)
279
+ print("draw_cfg: ", draw_cfg.keys())
280
+ print("draw_cfg: ", draw_cfg)
281
+
282
+ images = pipeline(**draw_cfg, generator= generator)
283
+ time_use = time.perf_counter() - start
284
+ time_uses.append(time_use)
285
+ print("bench", batch, width, sum(time_uses)/running_cnt, get_mem_use())
286
+
287
+ print(type(images))
288
+ images[0].save("t.png")
289
+
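
`LyraIPAdapter.load_ip_adapter` above derives the directory of the pre-converted LyraSD attention weights from the original `.bin` checkpoint path: it expects them under a sibling `lyra_tran/<subfolder>/<checkpoint-name>` directory. A standalone sketch of that mapping (the checkpoint path below is illustrative, not shipped with the repo):

```python
import os

def parse_ckpt_path(ckpt):
    # mirrors the inner helper in LyraIPAdapter.load_ip_adapter
    parts = ckpt.split("/")
    weight_name, subfolder = parts[-1], parts[-2]
    pretrained_path = "/".join(parts[:-2])
    return pretrained_path, subfolder, weight_name

ckpt = "./models/ip_adapter/sdxl_models/ip-adapter-plus_sdxl_vit-h.bin"  # illustrative path
root, sub, name = parse_ckpt_path(ckpt)
dir_ipadapter = os.path.join(root, "lyra_tran", sub, ".".join(name.split(".")[:-1]))
print(dir_ipadapter)  # ./models/ip_adapter/lyra_tran/sdxl_models/ip-adapter-plus_sdxl_vit-h
```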
lyrasd_model/module/resampler.py ADDED
@@ -0,0 +1,121 @@
1
+ # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+
8
+ # FFN
9
+ def FeedForward(dim, mult=4):
10
+ inner_dim = int(dim * mult)
11
+ return nn.Sequential(
12
+ nn.LayerNorm(dim),
13
+ nn.Linear(dim, inner_dim, bias=False),
14
+ nn.GELU(),
15
+ nn.Linear(inner_dim, dim, bias=False),
16
+ )
17
+
18
+
19
+ def reshape_tensor(x, heads):
20
+ bs, length, width = x.shape
21
+ #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
22
+ x = x.view(bs, length, heads, -1)
23
+ # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
24
+ x = x.transpose(1, 2)
25
+ # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
26
+ x = x.reshape(bs, heads, length, -1)
27
+ return x
28
+
29
+
30
+ class PerceiverAttention(nn.Module):
31
+ def __init__(self, *, dim, dim_head=64, heads=8):
32
+ super().__init__()
33
+ self.scale = dim_head**-0.5
34
+ self.dim_head = dim_head
35
+ self.heads = heads
36
+ inner_dim = dim_head * heads
37
+
38
+ self.norm1 = nn.LayerNorm(dim)
39
+ self.norm2 = nn.LayerNorm(dim)
40
+
41
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
42
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
43
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
44
+
45
+
46
+ def forward(self, x, latents):
47
+ """
48
+ Args:
49
+ x (torch.Tensor): image features
50
+ shape (b, n1, D)
51
+ latent (torch.Tensor): latent features
52
+ shape (b, n2, D)
53
+ """
54
+ x = self.norm1(x)
55
+ latents = self.norm2(latents)
56
+
57
+ b, l, _ = latents.shape
58
+
59
+ q = self.to_q(latents)
60
+ kv_input = torch.cat((x, latents), dim=-2)
61
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
62
+
63
+ q = reshape_tensor(q, self.heads)
64
+ k = reshape_tensor(k, self.heads)
65
+ v = reshape_tensor(v, self.heads)
66
+
67
+ # attention
68
+ scale = 1 / math.sqrt(math.sqrt(self.dim_head))
69
+ weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
70
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
71
+ out = weight @ v
72
+
73
+ out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
74
+
75
+ return self.to_out(out)
76
+
77
+
78
+ class Resampler(nn.Module):
79
+ def __init__(
80
+ self,
81
+ dim=1024,
82
+ depth=8,
83
+ dim_head=64,
84
+ heads=16,
85
+ num_queries=8,
86
+ embedding_dim=768,
87
+ output_dim=1024,
88
+ ff_mult=4,
89
+ ):
90
+ super().__init__()
91
+
92
+ self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
93
+
94
+ self.proj_in = nn.Linear(embedding_dim, dim)
95
+
96
+ self.proj_out = nn.Linear(dim, output_dim)
97
+ self.norm_out = nn.LayerNorm(output_dim)
98
+
99
+ self.layers = nn.ModuleList([])
100
+ for _ in range(depth):
101
+ self.layers.append(
102
+ nn.ModuleList(
103
+ [
104
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
105
+ FeedForward(dim=dim, mult=ff_mult),
106
+ ]
107
+ )
108
+ )
109
+
110
+ def forward(self, x):
111
+
112
+ latents = self.latents.repeat(x.size(0), 1, 1)
113
+
114
+ x = self.proj_in(x)
115
+ print("layers: ", len(self.layers))
116
+ for attn, ff in self.layers:
117
+ latents = attn(x, latents) + latents
118
+ latents = ff(latents) + latents
119
+
120
+ latents = self.proj_out(latents)
121
+ return self.norm_out(latents)
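
A quick shape check for the `Resampler` above, using the SDXL settings that `LyraIPAdapter.init_proj_plus` passes (heads=20, 16 IP tokens, CLIP hidden size 1280, UNet cross-attention dim 2048); the sequence length of 257 assumes ViT-H penultimate hidden states:

```python
import sys
import torch

sys.path.append("lyrasd_model/module")  # same import trick lyrasd_ip_adapter.py uses
from resampler import Resampler

resampler = Resampler(dim=1280, depth=4, dim_head=64, heads=20,
                      num_queries=16, embedding_dim=1280, output_dim=2048, ff_mult=4)
clip_hidden_states = torch.randn(1, 257, 1280)   # penultimate CLIP image hidden states
tokens = resampler(clip_hidden_states)
print(tokens.shape)                               # torch.Size([1, 16, 2048])
```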
lyrasd_model/module/tools.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch
2
+ import numpy as np
3
+ import os, sys
4
+ import time
5
+
6
+ class LyraChecker:
7
+ def __init__(self, dir_data, tol):
8
+ self.dir_data = dir_data
9
+ self.tol = tol
10
+
11
+ def cmp(self, fpath1, fpath2="", tol=0):
12
+ tolbk = self.tol
13
+ if tol != 0:
14
+ self.tol = tol
15
+ if fpath2 == "":
16
+ fpath2 = fpath1
17
+ fpath1 += "_1"
18
+ fpath2 += "_2"
19
+ v1 = self.get_npy(fpath1) #np.load(os.path.join(self.dir_data, fpath1))
20
+ v2 = self.get_npy(fpath2) #np.load(os.path.join(self.dir_data, fpath2))
21
+ name = fpath1
22
+ if ".npy" in fpath1:
23
+ name = ".".join(os.path.basename(fpath1).split(".")[:-1])
24
+ self._cmp_inner(v1, v2, name)
25
+ self.tol = tolbk
26
+
27
+ def _cmp_inner(self, v1, v2, name):
28
+ print(v1.shape, v2.shape)
29
+ if v1.shape != v2.shape:
30
+ if v1.shape[1] == v2.shape[1]:
31
+ v2 = v2.reshape([v2.shape[0], v2.shape[1], -1])
32
+ else:
33
+ v2 = torch.tensor(v2).permute(0, 3, 1, 2).numpy()
34
+ print(v1.shape, v2.shape)
35
+ self._check_data(name, v1, v2)
36
+ print(np.size(v1))
37
+
38
+ def _check_data(self, stage, x_out, x_gt):
39
+ print(f"========== {stage} =============")
40
+ print(x_out.shape, x_gt.shape)
41
+ if np.allclose(x_gt, x_out, atol=self.tol):
42
+ print(f"[OK] At {stage}, tol: {self.tol}")
43
+ else:
44
+ diff_cnt = np.count_nonzero(np.abs(x_gt - x_out)>self.tol)
45
+ print(f"[FAIL]At {stage}, not aligned. tol: {self.tol}")
46
+ print(" [INFO]Max diff: ", np.max(np.abs(x_gt - x_out)))
47
+ print(" [INFO]Diff count: ", diff_cnt, ", ratio: ", round(diff_cnt/np.size(x_out), 2))
48
+ print(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
49
+
50
+
51
+ def cmp_query(self, fpath1, fpath2):
52
+ v1 = np.load(os.path.join(self.dir_data, fpath1))
53
+ vk = np.load(os.path.join(self.dir_data, fpath1).replace("query", "key"))
54
+ vv = np.load(os.path.join(self.dir_data, fpath1).replace("query", "value"))
55
+
56
+ v2 = np.load(os.path.join(self.dir_data, fpath2))
57
+ # print(v1.shape, v2.shape)
58
+ q2 = v2[:,:,0,:,:].transpose([0,2,1,3])
59
+ # print(v1.shape, q2.shape)
60
+ self._check_data("query", v1, q2)
61
+ # print(vk.shape, v2.shape)
62
+ k2 = v2[:,:,1,:,:].transpose([0,2,1,3])
63
+ self._check_data("key", vk, k2)
64
+ vv2 = v2[:,:,2,:,:].transpose([0,2,1,3])
65
+ # print(vv.shape, vv2.shape)
66
+ self._check_data("value", vv, vv2)
67
+
68
+ def _get_data_fpath(self, fname):
69
+ fpath = os.path.join(self.dir_data, fname)
70
+ if not fpath.endswith(".npy"):
71
+ fpath += ".npy"
72
+ return fpath
73
+
74
+ def get_npy(self, fname):
75
+ fpath = self._get_data_fpath(fname)
76
+ return np.load(fpath)
77
+
78
+
79
+
80
+
81
+ class MkDataHelper:
82
+ def __init__(self, data_dir="/data/home/kiokaxiao/data"):
83
+ self.data_dir = data_dir
84
+
85
+ def mkdata(self, subdir, name, shape, dtype=torch.float16):
86
+ outdir = os.path.join(self.data_dir, subdir)
87
+ os.makedirs(outdir, exist_ok=True)
88
+ fpath = os.path.join(outdir, name+".npy")
89
+ data = torch.randn(shape, dtype=torch.float16)
90
+ np.save(fpath, data.to(dtype).numpy())
91
+ return data
92
+
93
+ def gen_out_with_func(self, func, inputs):
94
+ output = func(inputs)
95
+ return output
96
+
97
+ def savedata(self, subdir, name, data):
98
+ outdir = os.path.join(self.data_dir, subdir)
99
+ os.makedirs(outdir, exist_ok=True)
100
+ fpath = os.path.join(outdir, name+".npy")
101
+ np.save(fpath, data.cpu().numpy())
102
+
103
+
104
+ class TorchSaver:
105
+ def __init__(self, data_dir):
106
+ self.data_dir = data_dir
107
+ os.makedirs(self.data_dir, exist_ok=True)
108
+ self.is_save = True
109
+
110
+ def save_v(self, name, v):
111
+ if not self.is_save:
112
+ return
113
+ fpath = os.path.join(self.data_dir, name+"_1.npy")
114
+ np.save(fpath, v.detach().cpu().numpy())
115
+
116
+ def save_v2(self, name, v):
117
+ if not self.is_save:
118
+ return
119
+ fpath = os.path.join(self.data_dir, name+"_1.npy")
120
+ np.save(fpath, v.detach().cpu().numpy())
121
+
122
+ def timer_annoc(funct):
123
+ def inner(*args,**kwargs):
124
+ start = time.perf_counter()
125
+ res = funct(*args,**kwargs)
126
+ torch.cuda.synchronize()
127
+ end = time.perf_counter()
128
+ print("torch cost: ", end-start)
129
+ return res
130
+ return inner
131
+
132
+ def get_mem_use():
133
+ f = os.popen("nvidia-smi | grep MiB" )
134
+ line = f.read().strip()
135
+ while "  " in line:
136
+ line = line.replace("  ", " ")
137
+ memuse = line.split(" ")[8]
138
+ return memuse
139
+
140
+ if __name__ == "__main__":
141
+ dir_data = sys.argv[1]
142
+ fname_v1 = sys.argv[2]
143
+ fname_v2 = sys.argv[3]
144
+ tol = 0.01
145
+ if len(sys.argv) > 4:
146
+ tol = float(sys.argv[4])
147
+ checker = LyraChecker(dir_data, tol)
148
+ checker.cmp(fname_v1, fname_v2)
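
`LyraChecker` compares paired activation dumps named `<name>_1.npy` / `<name>_2.npy` (the `_1` files are what `TorchSaver.save_v` writes). A hypothetical invocation, assuming such dumps already exist in a local directory:

```python
import sys

sys.path.append("lyrasd_model/module")
from tools import LyraChecker

checker = LyraChecker("./debug_dumps", tol=0.01)   # hypothetical dump directory
checker.cmp("unet_out")                            # compares unet_out_1.npy against unet_out_2.npy
```

The module can also be run directly, e.g. `python lyrasd_model/module/tools.py <dump_dir> <name_1> <name_2> [tol]`, which is what its `__main__` block does.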
models/README.md CHANGED
@@ -2,11 +2,20 @@
2
  ### This is the place where you should download the checkpoints, and unzip them
3
 
4
  ```bash
5
- wget -O lyrasd_rev_animated.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_rev_animated.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078210;1866878210&q-key-time=1694078210;1866878210&q-header-list=&q-url-param-list=&q-signature=6046546135631dee9e8be7d8e061a77e8790e675"
6
- wget -O lyrasd_canny.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_canny.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078194;1866878194&q-key-time=1694078194;1866878194&q-header-list=&q-url-param-list=&q-signature=efb713ee650a0ee3c954fb3a0e148c37ef13cd3b"
7
- wget -O lyrasd_xiaorenshu_lora.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_xiaorenshu_lora.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078234;1866878234&q-key-time=1694078234;1866878234&q-header-list=&q-url-param-list=&q-signature=fb9a577a54ea6dedd9be696e40b96b71a1b23b5d"
8
 
9
  tar -xvf lyrasd_rev_animated.tar.gz
10
- tar -xvf lyrasd_canny.tar.gz
11
- tar -xvf lyrasd_xiaorenshu_lora.tar.gz
 
12
  ```
 
2
  ### This is the place where you should download the checkpoints, and unzip them
3
 
4
  ```bash
5
+ wget -O lyrasd_rev_animated.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_rev_animated.tar.gz"
6
+
7
+ wget -O sd-controlnet-canny.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/sd-controlnet-canny.tar.gz"
8
+
9
+ wget -O xiaorenshu.safetensors "https://civitai.com/api/download/models/25661"
10
+
11
+ wget -O helloworldSDXL20Fp16.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/helloworldSDXL20Fp16.tar.gz"
12
+
13
+ wget -O controlnet-canny-sdxl-1.0.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/controlnet-canny-sdxl-1.0.tar.gz"
14
+
15
+ wget -O dissolve_sdxl.safetensors "https://civitai.com/api/download/models/277389?type=Model&format=SafeTensor"
16
 
17
  tar -xvf lyrasd_rev_animated.tar.gz
18
+ tar -xvf sd-controlnet-canny.tar.gz
19
+ tar -xvf helloworldSDXL20Fp16.tar.gz
20
+ tar -xvf controlnet-canny-sdxl-1.0.tar.gz
21
  ```
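
The demo scripts in this commit reference the downloads above under `./models/`; a small check can confirm everything is in place (the names below are taken from the demo scripts, so adjust them if your archives extract to different folder names):

```python
import os

expected = [
    "models/rev-animated",               # SD 1.5 base used by txt2img_demo.py
    "models/xiaorenshu.safetensors",     # SD 1.5 LoRA
    "models/helloworldSDXL20Fp16",       # SDXL base used by txt2img_sdxl_demo.py
    "models/dissolve_sdxl.safetensors",  # SDXL LoRA
]
for path in expected:
    print(("ok      " if os.path.exists(path) else "missing ") + path)
```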
outputs/res_controlnet_img2img_0.png CHANGED

Git LFS Details

  • SHA256: 1b314eb678f2f3d76737b2b90507fe66e3a62393a89f00681e29bf821d273a60
  • Pointer size: 131 Bytes
  • Size of remote file: 447 kB

Git LFS Details

  • SHA256: 96aea3fc1f0992974935c798380f1ce008e61ff3b75d89c5d12700ed10fddbc9
  • Pointer size: 131 Bytes
  • Size of remote file: 436 kB
outputs/{res_controlnet_sdxl_txt2img.png → res_controlnet_sdxl_txt2img_0.png} RENAMED
File without changes
outputs/res_controlnet_txt2img_0.png CHANGED

Git LFS Details

  • SHA256: 225654758e835c97f49749170bb2440988d34607c023d47a03935068c9778993
  • Pointer size: 131 Bytes
  • Size of remote file: 398 kB

Git LFS Details

  • SHA256: b6d15a9715dd171a9e58ed2b8d628a5655b2b2de2539a9b9147f64d3e1529838
  • Pointer size: 131 Bytes
  • Size of remote file: 389 kB
outputs/res_img2img_0.png CHANGED

Git LFS Details

  • SHA256: cfe8f20e1e4382eacfa6851c5f7d386b5aeb875bca6ff7d927ede1ba43e7677a
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB

Git LFS Details

  • SHA256: 500882308c72de757094b7d8cc097eadc02dae2745c75ec223b37254190ad9f3
  • Pointer size: 131 Bytes
  • Size of remote file: 409 kB
outputs/res_txt2img_lora_0.png CHANGED

Git LFS Details

  • SHA256: b3879bf13166e9a16cd5314ab69072b6f7f69b80840b2e2204342c7fcfafbe04
  • Pointer size: 131 Bytes
  • Size of remote file: 433 kB

Git LFS Details

  • SHA256: cc46ad18b2444ddee13772c4862eca9519099c8bbb93722061004bfaec486bb7
  • Pointer size: 131 Bytes
  • Size of remote file: 436 kB
outputs/{res_sdxl_txt2img_lora_0.png → res_txt2img_xl_lora_0.png} RENAMED
File without changes
txt2img_demo.py CHANGED
@@ -10,22 +10,25 @@ from lyrasd_model import LyraSdTxt2ImgPipeline
10
  # 4. scheduler 配置
11
 
12
  # LyraSD 的 C++ 编译动态链接库,其中包含 C++ CUDA 计算的细节
13
- lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
14
- model_path = "./models/lyrasd_rev_animated"
15
- lora_path = "./models/lyrasd_xiaorenshu_lora"
 
 
16
 
17
  # 构建 Txt2Img 的 Pipeline
18
- model = LyraSdTxt2ImgPipeline(model_path, lib_path)
 
19
 
20
  # load lora
21
  # 参数分别为 lora 存放位置,名字,lora 强度,lora模型精度
22
- model.load_lora(lora_path, "xiaorenshu", 0.4, "fp32")
23
 
24
  # 准备应用的输入和超参数
25
  prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
26
  negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
27
  height, width = 512, 512
28
- steps = 30
29
  guidance_scale = 7
30
  generator = torch.Generator().manual_seed(123)
31
  num_images = 1
@@ -33,12 +36,12 @@ num_images = 1
33
  start = time.perf_counter()
34
  # 推理生成
35
  images = model(prompt, height, width, steps,
36
- guidance_scale, negative_prompt, num_images,
37
- generator=generator)
38
- print("image gen cost: ",time.perf_counter() - start)
39
  # 存储生成的图片
40
  for i, image in enumerate(images):
41
  image.save(f"outputs/res_txt2img_lora_{i}.png")
42
 
43
  # unload lora,参数为 lora 的名字,是否清除 lora 缓存
44
- # model.unload_lora("xiaorenshu", True)
 
10
  # 4. scheduler 配置
11
 
12
  # LyraSD 的 C++ 编译动态链接库,其中包含 C++ CUDA 计算的细节
13
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
14
+ model_path = "./models/rev-animated"
15
+ lora_path = "./models/xiaorenshu.safetensors"
16
+
17
+ torch.classes.load_library(lib_path)
18
 
19
  # 构建 Txt2Img 的 Pipeline
20
+ model = LyraSdTxt2ImgPipeline()
21
+ model.reload_pipe(model_path)
22
 
23
  # load lora
24
  # 参数分别为 lora 存放位置,名字,lora 强度,lora模型精度
25
+ model.load_lora_v2(lora_path, "xiaorenshu", 0.4)
26
 
27
  # 准备应用的输入和超参数
28
  prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
29
  negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
30
  height, width = 512, 512
31
+ steps = 20
32
  guidance_scale = 7
33
  generator = torch.Generator().manual_seed(123)
34
  num_images = 1
 
36
  start = time.perf_counter()
37
  # 推理生成
38
  images = model(prompt, height, width, steps,
39
+ guidance_scale, negative_prompt, num_images,
40
+ generator=generator)
41
+ print("image gen cost: ", time.perf_counter() - start)
42
  # 存储生成的图片
43
  for i, image in enumerate(images):
44
  image.save(f"outputs/res_txt2img_lora_{i}.png")
45
 
46
  # unload lora,参数为 lora 的名字,是否清除 lora 缓存
47
+ model.unload_lora_v2("xiaorenshu", True)
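
`load_lora_v2` / `unload_lora_v2` mutate the pipeline in place (the demo loads the LoRA once and explicitly unloads it at the end), so callers that switch LoRAs between generations may want to scope them. A sketch built only on the two calls used above, assuming they can be paired freely:

```python
from contextlib import contextmanager

@contextmanager
def scoped_lora(pipe, lora_path, name, strength, clear_cache=False):
    # apply a LoRA for the duration of the block, then revert it
    pipe.load_lora_v2(lora_path, name, strength)
    try:
        yield pipe
    finally:
        pipe.unload_lora_v2(name, clear_cache)

# with scoped_lora(model, lora_path, "xiaorenshu", 0.4):
#     images = model(prompt, height, width, steps, guidance_scale,
#                    negative_prompt, num_images, generator=generator)
```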
txt2img_sdxl_demo.py ADDED
@@ -0,0 +1,55 @@
1
+ import torch
2
+ from lyrasd_model import LyraSdXLTxt2ImgPipeline
3
+ import time
4
+ import GPUtil
5
+ import os
6
+ from glob import glob
7
+ import random
8
+
9
+ # Path to the model directory, which should contain the following:
10
+ # 1. the CLIP model
11
+ # 2. the converted and optimized UNet weights, placed in its unet_bins folder
12
+ # 3. the VAE model
13
+ # 4. the scheduler config
14
+
15
+ # LyraSD's compiled C++ shared library, containing the C++/CUDA kernels
16
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so"
17
+ model_path = "./models/helloworldSDXL20Fp16"
18
+ lora_path = "./models/dissolve_sdxl.safetensors"
19
+ torch.classes.load_library(lib_path)
20
+
21
+ # Build the Txt2Img pipeline
22
+ model = LyraSdXLTxt2ImgPipeline()
23
+
24
+ model.reload_pipe(model_path)
25
+
26
+ # load lora
27
+ # lora model path, name, lora strength
28
+ model.load_lora_v2(lora_path, "dissolve_sdxl", 0.4)
29
+
30
+ # Prepare the inputs and hyperparameters
31
+ prompt = "a cat, ral-dissolve"
32
+ negative_prompt = "nsfw, watermark"
33
+ height, width = 1024, 1024
34
+ steps = 20
35
+ guidance_scale = 7.5
36
+ generator = torch.Generator().manual_seed(8788800)
37
+
38
+ start = time.perf_counter()
39
+ # Run inference
40
+ images = model(prompt,
41
+ height=height,
42
+ width=width,
43
+ num_inference_steps=steps,
44
+ num_images_per_prompt=1,
45
+ guidance_scale=guidance_scale,
46
+ negative_prompt=negative_prompt,
47
+ generator=generator
48
+ )
49
+ print("image gen cost: ", time.perf_counter() - start)
50
+ # Save the generated images
51
+ for i, image in enumerate(images):
52
+ image.save(f"outputs/res_txt2img_xl_lora_{i}.png")
53
+
54
+ # unload lora: arguments are the lora name and whether to clear the lora cache
55
+ model.unload_lora_v2("dissolve_sdxl", True)