import os
import re
import time
import torch
import numpy as np
from safetensors.torch import load_file
from diffusers.loaders import LoraLoaderMixin
from diffusers.loaders.lora_conversion_utils import _maybe_map_sgm_blocks_to_diffusers, _convert_kohya_lora_to_diffusers
from types import SimpleNamespace
import logging.handlers
LORA_PREFIX_UNET = "lora_unet"
LORA_PREFIX_TEXT_ENCODER = "lora_te"
LORA_UNET_LAYERS = ['lora_unet_down_blocks_0_attentions_0', 'lora_unet_down_blocks_0_attentions_1', 'lora_unet_down_blocks_1_attentions_0', 'lora_unet_down_blocks_1_attentions_1', 'lora_unet_down_blocks_2_attentions_0', 'lora_unet_down_blocks_2_attentions_1', 'lora_unet_mid_block_attentions_0', 'lora_unet_up_blocks_1_attentions_0',
                    'lora_unet_up_blocks_1_attentions_1', 'lora_unet_up_blocks_1_attentions_2', 'lora_unet_up_blocks_2_attentions_0', 'lora_unet_up_blocks_2_attentions_1', 'lora_unet_up_blocks_2_attentions_2', 'lora_unet_up_blocks_3_attentions_0', 'lora_unet_up_blocks_3_attentions_1', 'lora_unet_up_blocks_3_attentions_2']


def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=1.0, lora_file_format="fp32", device="cuda:0"):
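    """Merge a per-layer text-encoder LoRA dump into `clip_model` in place.

    `lora_model_path` is treated as a directory of raw binary weight files
    (fp32 or fp16, one file per layer) whose names encode the target layer
    inside `clip_model.text_model`. Each delta is scaled by `alpha` and added
    to the layer weight; the returned list of {"layer", "added_weight"} dicts
    lets the caller undo the merge later.
    """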
    if lora_file_format == "fp32":
        model_dtype = np.float32
    elif lora_file_format == "fp16":
        model_dtype = np.float16
    else:
        raise Exception(f"unsupported model dtype: {lora_file_format}")
    all_files = os.scandir(lora_model_path)
    unload_dict = []
    # directly update weight in diffusers model
    for file in all_files:

        if 'text' in file.name:
            layer_infos = file.name.split('.')[0].split(
                'text_model_')[-1].split('_')
            curr_layer = clip_model.text_model
        else:
            continue

        # find the target layer by greedily matching underscore-separated name
        # segments against module attributes (merging segments when lookup fails)
        temp_name = layer_infos.pop(0)
        while len(layer_infos) > -1:
            try:
                curr_layer = curr_layer.__getattr__(temp_name)
                if len(layer_infos) > 0:
                    temp_name = layer_infos.pop(0)
                    # if temp_name == "self":
                    #     temp_name += "_" + layer_infos.pop(0)
                    # elif temp_name != "mlp" and len(layer_infos) == 1:
                    #     temp_name += "_" + layer_infos.pop(0)
                elif len(layer_infos) == 0:
                    break
            except Exception:
                if len(temp_name) > 0:
                    temp_name += '_'+layer_infos.pop(0)
                else:
                    temp_name = layer_infos.pop(0)
        data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(
            clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
        if len(curr_layer.weight.data.shape) == 4:
            adding_weight = alpha * data.permute(0, 3, 1, 2)
        else:
            adding_weight = alpha * data
        curr_layer.weight.data += adding_weight

        curr_layer_unload_data = {
            "layer": curr_layer,
            "added_weight": adding_weight
        }
        unload_dict.append(curr_layer_unload_data)
    return unload_dict


def add_xltext_lora_layer(clip_model, clip_model_2, lora_model_path, alpha=1.0, lora_file_format="fp32", device="cuda:0"):
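    """SDXL variant of `add_text_lora_layer`.

    Applies per-layer text-encoder LoRA dumps from `lora_model_path` to both
    CLIP encoders, routing a file to `clip_model_2` when its name contains
    "text_encoder_2" and to `clip_model` when it contains "text_encoder".
    Returns the same style of unload list for reverting the merge.
    """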
    if lora_file_format == "fp32":
        model_dtype = np.float32
    elif lora_file_format == "fp16":
        model_dtype = np.float16
    else:
        raise Exception(f"unsupported model dtype: {lora_file_format}")
    all_files = os.scandir(lora_model_path)
    unload_dict = []
    # directly update weight in diffusers model
    for file in all_files:

        if 'text' in file.name:
            layer_infos = file.name.split('.')[0].split(
                'text_model_')[-1].split('_')
            if "text_encoder_2" in file.name:
                curr_layer = clip_model_2.text_model
            elif "text_encoder" in file.name:
                curr_layer = clip_model.text_model
            else:
                raise ValueError(
                    f"Cannot identify clip model, need text_encoder or text_encoder_2 in filename, found: {file.name}")
        else:
            continue

        # find the target layer by greedily matching underscore-separated name
        # segments against module attributes (merging segments when lookup fails)
        temp_name = layer_infos.pop(0)
        while len(layer_infos) > -1:
            try:
                curr_layer = curr_layer.__getattr__(temp_name)
                if len(layer_infos) > 0:
                    temp_name = layer_infos.pop(0)
                    # if temp_name == "self":
                    #     temp_name += "_" + layer_infos.pop(0)
                    # elif temp_name != "mlp" and len(layer_infos) == 1:
                    #     temp_name += "_" + layer_infos.pop(0)
                elif len(layer_infos) == 0:
                    break
            except Exception:
                if len(temp_name) > 0:
                    temp_name += '_'+layer_infos.pop(0)
                else:
                    temp_name = layer_infos.pop(0)

        data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(
            clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
        if len(curr_layer.weight.data.shape) == 4:
            adding_weight = alpha * data.permute(0, 3, 1, 2)
        else:
            adding_weight = alpha * data
        curr_layer.weight.data += adding_weight

        curr_layer_unload_data = {
            "layer": curr_layer,
            "added_weight": adding_weight
        }
        unload_dict.append(curr_layer_unload_data)
    return unload_dict

def lora_trans(state_dict):
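    """Convert a Kohya/SGM-style LoRA state dict to diffusers-style key names
    and collect the per-module alpha values into a single 'lora.alpha' tensor.
    """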
    loraload = LoraLoaderMixin()
    unet_config = SimpleNamespace(**{'layers_per_block': 2})
    state_dicts = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
    state_dicts_trans, state_dicts_alpha = _convert_kohya_lora_to_diffusers(
        state_dicts)
    keys = list(state_dicts_trans.keys())
    for k in keys:
        key = k.replace('processor.', '')
        for x in ['.lora_linear_layer.', '_lora.', '.lora.']:
            key = key.replace(x, '.lora_')
        if key.find('text_encoder') >= 0:
            for x in ['q', 'k', 'v', 'out']:
                key = key.replace(f'.to_{x}.', f'.{x}_proj.')
        key = key.replace('to_out.', 'to_out.0.')
        if key != k:
            state_dicts_trans[key] = state_dicts_trans.pop(k)
    alpha = torch.Tensor(list(set(list(state_dicts_alpha.values()))))
    state_dicts_trans.update({'lora.alpha': alpha})

    return state_dicts_trans


def load_state_dict(filename, need_trans=True):
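    """Load a LoRA .safetensors file onto CPU and, when `need_trans` is True,
    normalise its keys with `lora_trans`.
    """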
    state_dict = load_file(os.path.abspath(filename), device="cpu")
    if need_trans:
        state_dict = lora_trans(state_dict)
    return state_dict


def move_state_dict_to_cuda(state_dict):
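    """Return a copy of `state_dict` with every tensor moved to the current CUDA device."""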
    ret_state_dict = {}
    for item in state_dict:
        ret_state_dict[item] = state_dict[item].cuda()
    return ret_state_dict


def add_lora_to_opt_model(state_dict, unet, clip_model, clip_model_2, alpha=1.0, need_trans=False):
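    """Merge a converted LoRA state dict into the runtime models.

    Text-encoder deltas (weight_up @ weight_down, scaled by `alpha` and the
    LoRA alpha/rank ratio) are added directly to the CLIP module weights,
    while UNet deltas are handed to `unet.load_lora_by_name`, so `unet` is
    expected to be an optimized engine exposing that method rather than a
    stock diffusers UNet.
    """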
    # directly update weight in diffusers model
    state_dict = move_state_dict_to_cuda(state_dict)

    alpha_ks = list(filter(lambda x: x.find('.alpha') >= 0, state_dict))
    lora_alpha = state_dict[alpha_ks[0]].item() if len(alpha_ks) > 0 else -1

    visited = set()
    for key in state_dict:
        # print(key)
        # it is suggested to print out the key, it usually will be something like below
        # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"

        # as we have set the alpha beforehand, so just skip
        if '.alpha' in key or key in visited:
            continue

        if "text" in key:
            curr_layer = clip_model_2 if key.find(
                'text_encoder_2') >= 0 else clip_model

            # if is_sdxl:
            layer_infos = key.split('.')[1:]
            
            for x in layer_infos:
                try:
                    curr_layer = curr_layer.__getattr__(x)
                except Exception:
                    break
                
            # update weight
            pair_keys = [key.replace("lora_down", "lora_up"),
                         key.replace("lora_up", "lora_down")]
            weight_up, weight_down = state_dict[pair_keys[0]
                                                ], state_dict[pair_keys[1]]

            weight_scale = lora_alpha/weight_up.shape[1] if lora_alpha != -1 else 1.0

            if len(weight_up.shape) == 4:
                weight_up = weight_up.squeeze([2, 3])
                weight_down = weight_down.squeeze([2, 3])
                if len(weight_down.shape) == 4:
                    adding_weight = torch.einsum(
                        'a b, b c h w -> a c h w', weight_up, weight_down)
                else:
                    adding_weight = torch.mm(
                        weight_up, weight_down).unsqueeze(2).unsqueeze(3)
            else:
                adding_weight = torch.mm(weight_up, weight_down)
            adding_weight = alpha * weight_scale * adding_weight

            curr_layer.weight.data += adding_weight.to(torch.float16)
            # update visited list
            for item in pair_keys:
                visited.add(item)

        elif "unet" in key:
            layer_infos = key
            layer_infos = layer_infos.replace(".lora_up.weight", "")
            layer_infos = layer_infos.replace(".lora_down.weight", "")

            layer_infos = layer_infos[5:]
            layer_names = layer_infos.split(".")

            layers = []
            i = 0
            while i < len(layer_names):

                if len(layers) >= 4:
                    layers[-1] += "_" + layer_names[i]
                elif i + 1 < len(layer_names) and layer_names[i+1].isdigit():
                    layers.append(layer_names[i] + "_" + layer_names[i+1])
                    i += 1
                elif len(layers) > 0 and "samplers" in layers[-1]:
                    layers[-1] += "_" + layer_names[i]
                else:
                    layers.append(layer_names[i])
                i += 1
            layer_infos = ".".join(layers)

            pair_keys = [key.replace("lora_down", "lora_up"),
                         key.replace("lora_up", "lora_down")]
            
            # update weight: compose the up/down factors into the full delta
            if len(state_dict[pair_keys[0]].shape) == 4:
                # conv-style LoRA factors: drop the trailing 1x1 kernel dims where present
                weight_up = state_dict[pair_keys[0]].squeeze([2, 3]).to(torch.float32)
                weight_down = state_dict[pair_keys[1]].squeeze([2, 3]).to(torch.float32)
                weight_scale = lora_alpha/weight_up.shape[1] if lora_alpha != -1 else 1.0
                if len(weight_down.shape) == 4:
                    curr_layer_weight = weight_scale * \
                        torch.einsum('a b, b c h w -> a c h w',
                                     weight_up, weight_down)
                else:
                    curr_layer_weight = weight_scale * \
                        torch.mm(weight_up, weight_down).unsqueeze(
                            2).unsqueeze(3)

                curr_layer_weight = curr_layer_weight.permute(0, 2, 3, 1)

            else:
                weight_up = state_dict[pair_keys[0]].to(torch.float32)
                weight_down = state_dict[pair_keys[1]].to(torch.float32)
                weight_scale = lora_alpha/weight_up.shape[1] if lora_alpha != -1 else 1.0

                curr_layer_weight = weight_scale * \
                    torch.mm(weight_up, weight_down)

            curr_layer_weight = curr_layer_weight.to(torch.float16)

            unet.load_lora_by_name(layers, curr_layer_weight, alpha)

            for item in pair_keys:
                visited.add(item)
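

# Example usage (a minimal sketch; the pipeline objects and the path below are
# hypothetical, and `unet` must provide the `load_lora_by_name` method used above):
#
#   state_dict = load_state_dict("loras/example_lora.safetensors")
#   add_lora_to_opt_model(state_dict, unet,
#                         pipe.text_encoder, pipe.text_encoder_2, alpha=0.8)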