diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..f0ad0a1f3551d190d15a04ce63ae02f2bfcc485e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +asset/chinese[[:space:]]elements[[:space:]]understanding.png filter=lfs diff=lfs merge=lfs -text +asset/Hunyuan_DiT_Tech_Report_05140553.pdf filter=lfs diff=lfs merge=lfs -text +asset/long[[:space:]]text[[:space:]]understanding.png filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 064e6f05750670cbdcef43fea0ed58269a2536d9..1e0000dc9bc1c6a19671505556a48834659887be 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ colorFrom: indigo colorTo: red sdk: gradio sdk_version: 4.31.1 -app_file: app.py +app_file: app/hydit_app.py pinned: false --- diff --git a/app/hydit_app.py b/app/hydit_app.py new file mode 100644 index 0000000000000000000000000000000000000000..2d115ec7f404b4d80cbf43f7956b977b23594685 --- /dev/null +++ b/app/hydit_app.py @@ -0,0 +1,170 @@ +import gradio as gr +import pandas as pd +from pathlib import Path +from PIL import Image +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from hydit.constants import SAMPLER_FACTORY +from sample_t2i import inferencer + +ROOT = Path(__file__).parent.parent +SAMPLERS = list(SAMPLER_FACTORY.keys()) +SIZES = { + "square": (1024, 1024), + "landscape": (768, 1280), + "portrait": (1280, 768), +} + +def get_strings(lang): + lang_file = Path(f"app/lang/{lang}.csv") + strings = pd.read_csv(lang_file, header=0) + strings = strings.set_index("key")['value'].to_dict() + return strings + + +args, gen, enhancer = inferencer() +strings = get_strings("en") + + +def infer( + prompt, + negative_prompt, + seed, + cfg_scale, + infer_steps, + oriW, oriH, + sampler, + size, + enhance +): + if enhance and enhancer is not None: + success, enhanced_prompt = enhancer(prompt) + if not success: + fail_image = Image.open(ROOT / 'app/fail.png') + return fail_image + else: + enhanced_prompt = None + + height, width = SIZES[size] + results = gen.predict(prompt, + height=height, + width=width, + seed=seed, + enhanced_prompt=enhanced_prompt, + negative_prompt=negative_prompt, + infer_steps=infer_steps, + guidance_scale=cfg_scale, + batch_size=1, + src_size_cond=(oriW, oriH), + sampler=sampler, + ) + image = results['images'][0] + return image + + +def ui(): + block = gr.Blocks() + + description = f""" + # {strings['title']} + + ## {strings['desc']} + + """ + + with block: + with gr.Row(): + gr.Markdown(description) + with gr.Row(): + with gr.Column(): + with gr.Row(): + size = gr.Radio( + label=strings['size'], choices=[ + (strings['square'], 'square'), + (strings['landscape'], 'landscape'), + (strings['portrait'], 'portrait'), + ], + value="square" + ) + prompt = gr.Textbox(label=strings['prompt'], value=strings['default prompt'], lines=3) + with gr.Row(): + infer_steps = gr.Slider( + label=strings['infer steps'], minimum=1, maximum=200, value=100, step=1, + ) + seed = gr.Number( + label=strings['seed'], minimum=-1, maximum=1_000_000_000, value=1, step=1, precision=0, + ) + enhance = gr.Checkbox( + label=strings['enhance'], value=enhancer is not None, interactive=True, + ) + + with gr.Accordion( + strings['accordion'], open=False + ): + with gr.Row(): + negative_prompt = gr.Textbox(label=strings['negative_prompt'], + 
value=gen.default_negative_prompt, + lines=2, + ) + with gr.Row(): + sampler = gr.Dropdown(SAMPLERS, label=strings['sampler'], value="ddpm") + cfg_scale = gr.Slider( + label=strings['cfg'], minimum=1.0, maximum=16.0, value=6.0, step=1 + ) + oriW = gr.Number( + label=strings['width cond'], minimum=1024, maximum=4096, value=1024, step=64, precision=0, + min_width=80, + ) + oriH = gr.Number( + label=strings['height cond'], minimum=1024, maximum=4096, value=1024, step=64, precision=0, + min_width=80, + ) + with gr.Row(): + advanced_button = gr.Button(strings['run']) + with gr.Column(): + default_img = Image.open(ROOT / 'app/default.png') + output_img = gr.Image( + label=strings['generated image'], + interactive=False, + format='png', + value=default_img, + ) + advanced_button.click( + fn=infer, + inputs=[ + prompt, negative_prompt, seed, cfg_scale, infer_steps, + oriW, oriH, sampler, size, enhance, + ], + outputs=output_img, + ) + + with gr.Row(): + gr.Examples([ + ['一只小猫'], + ['现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景'], + ['一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影'], + ['飞流直下三千尺,疑是银河落九天'], + ['一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。'], + ['麻婆豆腐'], + ['苏州园林'], + ['一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子'], + ['请画出“忽如一夜春风来 千树万树梨花开”'], + ['请将“杞人忧天”的样子画出来'], + ['枯藤老树昏鸦,小桥流水人家'], + ['湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。'], + ['一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头'], + ['臭豆腐'], + ['九寨沟'], + ['俗语“鲤鱼跃龙门”'], + ['风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景'], + ], + [prompt], + label=strings['examples'] + ) + return block + + +if __name__ == "__main__": + interface = ui() + interface.launch() diff --git a/app/lang/en.csv b/app/lang/en.csv new file mode 100644 index 0000000000000000000000000000000000000000..f70f662bbb33b661b9260c337cc494711fb617fc --- /dev/null +++ b/app/lang/en.csv @@ -0,0 +1,22 @@ +key,value +size,Size +sampler,Sampler +prompt,Prompt +default prompt,"A cute cat" +negative_prompt,Negative Prompt +seed,Seed +cfg,CFG Scale +infer steps,Sampling Steps +batch size,Batch Size +width cond,Width Cond +height cond,Height Cond +enhance,Prompt Enhancement +run,Submit +square,Square(1024x1024) +landscape,Landscape(1280x768) +portrait,Portrait(768x1280) +accordion,Advanced Options +generated image,HunYuanDiT Generated Image +examples,More Examples +title,Hunyuan-DiT +desc,A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding \ No newline at end of file diff --git a/app/lang/zh.csv b/app/lang/zh.csv new file mode 100644 index 0000000000000000000000000000000000000000..62e7b4f56e33a42cc36d93b2adf3f153e58e3551 --- /dev/null +++ b/app/lang/zh.csv @@ -0,0 +1,22 @@ +key,value +size,尺寸 +sampler,采样器 +prompt,文本描述 +default prompt,"一只可爱的猫" +negative_prompt,负向词 +seed,种子 +cfg,CFG系数 +infer steps,采样步数 +batch size,批大小 +width cond,宽度条件 +height cond,高度条件 +enhance,文本增强 +run,提交生成 +square,方形(1024x1024) +portrait,竖屏(1280x768) +landscape,横屏(768x1280) +accordion,高级设置 +generated image,HunYuanDiT 生成 +examples,更多示例 +title,混元-DiT +desc,具有细粒度中文理解的高性能多分辨率 Diffusion Transformer 模型 \ No newline at end of file diff --git a/asset/Hunyuan_DiT_Tech_Report_05140553.pdf b/asset/Hunyuan_DiT_Tech_Report_05140553.pdf new file mode 100644 index 0000000000000000000000000000000000000000..84e21098a150cc84529bbd8ebdd7c7bc34938eb9 --- /dev/null +++ b/asset/Hunyuan_DiT_Tech_Report_05140553.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f8514b002ba3bb4704575096683f65e09df06693a54bf3004f0b351138ab1e5 +size 42132252 diff 
--git a/asset/chinese elements understanding.png b/asset/chinese elements understanding.png new file mode 100644 index 0000000000000000000000000000000000000000..80a12590aacd378fb615d095f49d8a2b8328d395 --- /dev/null +++ b/asset/chinese elements understanding.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5761413a7c2b15adb83dcad04c3b56c6358debd3a354dfd559919b611c9fb52 +size 6060348 diff --git a/asset/cover.png b/asset/cover.png new file mode 100644 index 0000000000000000000000000000000000000000..5a47da501a4503d7b63444f3ebc02be14ba7094c Binary files /dev/null and b/asset/cover.png differ diff --git a/asset/framework.png b/asset/framework.png new file mode 100644 index 0000000000000000000000000000000000000000..ae6d0ff0565464b30048fc45d18c00df4c0c6102 Binary files /dev/null and b/asset/framework.png differ diff --git a/asset/logo.png b/asset/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..500d7aa19ee3ae60ce5c1d5610e83819a31dc5ae Binary files /dev/null and b/asset/logo.png differ diff --git a/asset/long text understanding.png b/asset/long text understanding.png new file mode 100644 index 0000000000000000000000000000000000000000..0a1c7379115c50b6887781597bc0addf53b6d2b8 --- /dev/null +++ b/asset/long text understanding.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8060c105db0cc40a83a89443096c8b95b2838da57fd04d4ddf828328dce8811e +size 5146571 diff --git a/asset/mllm.png b/asset/mllm.png new file mode 100644 index 0000000000000000000000000000000000000000..b5354af5e6c9bac7cc91fb70a5ffe375d7b97b72 Binary files /dev/null and b/asset/mllm.png differ diff --git a/asset/radar.png b/asset/radar.png new file mode 100644 index 0000000000000000000000000000000000000000..8ed651c2e884cd80467f4b9dc855fe2588493a24 Binary files /dev/null and b/asset/radar.png differ diff --git a/dialoggen/dialoggen_demo.py b/dialoggen/dialoggen_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a73bbbffba09812aa878cc910bef87a6473f8506 --- /dev/null +++ b/dialoggen/dialoggen_demo.py @@ -0,0 +1,172 @@ +import argparse +import torch +import sys +import os +# 添加当前命令行运行的目录到 sys.path +sys.path.append(os.getcwd()+"/dialoggen") + + +from llava.constants import ( + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + IMAGE_PLACEHOLDER, +) +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import ( + process_images, + tokenizer_image_token, + get_model_name_from_path, +) + +import requests +from PIL import Image +from io import BytesIO +import re + + +def image_parser(image_file, sep=','): + out = image_file.split(sep) + return out + + +def load_image(image_file): + if image_file.startswith("http") or image_file.startswith("https"): + response = requests.get(image_file) + image = Image.open(BytesIO(response.content)).convert("RGB") + else: + image = Image.open(image_file).convert("RGB") + return image + + +def load_images(image_files): + out = [] + for image_file in image_files: + image = load_image(image_file) + out.append(image) + return out + + +def init_dialoggen_model(model_path, model_base=None): + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model( + model_path, model_base, model_name, llava_type_model=True) + return {"tokenizer": tokenizer, + "model": model, + 
"image_processor": image_processor} + + +def eval_model(models, + query='详细描述一下这张图片', + image_file=None, + sep=',', + temperature=0.2, + top_p=None, + num_beams=1, + max_new_tokens=512, + ): + # Model + disable_torch_init() + + qs = query + image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + if IMAGE_PLACEHOLDER in qs: + if models["model"].config.mm_use_im_start_end: + qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) + else: + qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) + else: + if models["model"].config.mm_use_im_start_end: + qs = image_token_se + "\n" + qs + else: + qs = DEFAULT_IMAGE_TOKEN + "\n" + qs + + conv = conv_templates['llava_v1'].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + if image_file is not None: + image_files = image_parser(image_file, sep=sep) + images = load_images(image_files) + image_sizes = [x.size for x in images] + images_tensor = process_images( + images, + models["image_processor"], + models["model"].config + ).to(models["model"].device, dtype=torch.float16) + else: + # fomatted input as training data + image_sizes = [(1024, 1024)] + images_tensor = torch.zeros(1, 5, 3, models["image_processor"].crop_size["height"], models["image_processor"].crop_size["width"]) + images_tensor = images_tensor.to(models["model"].device, dtype=torch.float16) + + input_ids = ( + tokenizer_image_token(prompt, models["tokenizer"], IMAGE_TOKEN_INDEX, return_tensors="pt") + .unsqueeze(0) + .cuda() + ) + with torch.inference_mode(): + output_ids = models["model"].generate( + input_ids, + images=images_tensor, + image_sizes=image_sizes, + do_sample=True if temperature > 0 else False, + temperature=temperature, + top_p=top_p, + num_beams=num_beams, + max_new_tokens=max_new_tokens, + use_cache=True, + ) + + outputs = models["tokenizer"].batch_decode(output_ids, skip_special_tokens=True)[0].strip() + return outputs + + +def remove_prefix(text): + if text.startswith("<画图>"): + return text[len("<画图>"):], True + elif text.startswith("对不起"): + # 拒绝画图 + return "", False + else: + return text, True + + +class DialogGen(object): + def __init__(self, model_path): + self.models = init_dialoggen_model(model_path) + self.query_template = "请先判断用户的意图,若为画图则在输出前加入<画图>:{}" + + def __call__(self, prompt): + enhanced_prompt = eval_model( + models=self.models, + query=self.query_template.format(prompt), + image_file=None, + ) + + enhanced_prompt, compliance = remove_prefix(enhanced_prompt) + if not compliance: + return False, "" + return True, enhanced_prompt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', type=str, default='./ckpts/dialoggen') + parser.add_argument('--prompt', type=str, default='画一只小猫') + parser.add_argument('--image_file', type=str, default=None) # 'images/demo1.jpeg' + args = parser.parse_args() + + query = f"请先判断用户的意图,若为画图则在输出前加入<画图>:{args.prompt}" + + models = init_dialoggen_model(args.model_path) + + res = eval_model(models, + query=query, + image_file=args.image_file, + ) + print(res) diff --git a/dialoggen/images/demo1.jpeg b/dialoggen/images/demo1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4c2d4caef08421824efee343d1c5c252035ece3d Binary files /dev/null and b/dialoggen/images/demo1.jpeg differ diff --git a/dialoggen/images/demo2.jpeg b/dialoggen/images/demo2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b7cdc9a7d204dfd01ccd0a14a8793e055f06f3a0 
Binary files /dev/null and b/dialoggen/images/demo2.jpeg differ diff --git a/dialoggen/llava/__init__.py b/dialoggen/llava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d1f016db1028101d45ba7d68cb3f0bcb558c2bb --- /dev/null +++ b/dialoggen/llava/__init__.py @@ -0,0 +1 @@ +from .model import LlavaLlamaForCausalLM diff --git a/dialoggen/llava/constants.py b/dialoggen/llava/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..374be090510b302de9882d880c755787a8eafe11 --- /dev/null +++ b/dialoggen/llava/constants.py @@ -0,0 +1,13 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." + +# Model Constants +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "<image>" +DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" +DEFAULT_IM_START_TOKEN = "<im_start>" +DEFAULT_IM_END_TOKEN = "<im_end>" +IMAGE_PLACEHOLDER = "<image-placeholder>" diff --git a/dialoggen/llava/conversation.py b/dialoggen/llava/conversation.py new file mode 100644 index 0000000000000000000000000000000000000000..00c56867dd1fd88094df9556f3d1c57e71a7ada8 --- /dev/null +++ b/dialoggen/llava/conversation.py @@ -0,0 +1,396 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Tuple +import base64 +from io import BytesIO +from PIL import Image + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + LLAMA_2 = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self): + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = self.messages.copy() + init_role, init_msg = messages[0].copy() + init_msg = init_msg[0].replace("<image>", "").strip() + if 'mmtag' in self.version: + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "<Image><image></Image>")) + messages.insert(1, (self.roles[1], "Received.")) + else: + messages[0] = (init_role, "<image>\n" + init_msg) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.MPT: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + elif self.sep_style == SeparatorStyle.LLAMA_2: + wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += 
self.sep + message + else: + ret += " " + message + " " + self.sep2 + else: + ret += "" + ret = ret.lstrip(self.sep) + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672): + if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + if max(image.size) > max_len: + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + image = self.process_image(image, image_process_mode, return_pil=return_pil) + images.append(image) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + img_b64_str = self.process_image( + image, "Default", return_pil=False, + image_format='JPEG') + img_str = f'user upload image' + msg = img_str + msg.replace('', '').strip() + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ("Human", "What are the key differences between renewable and non-renewable energy sources?"), + ("Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llama_2 = Conversation( + system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_llava_llama_2 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_mpt = Conversation( + system="""<|im_start|>system +A conversation between a user and an LLM-based AI assistant. 
The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", +) + +conv_llava_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v0_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: <Image>visual content</Image>.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="</s>", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+ "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +conv_mistral_instruct = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +default_conversation = conv_vicuna_v1 +conv_templates = { + "default": conv_vicuna_v0, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "chatml_direct": conv_chatml_direct, + "mistral_direct": conv_chatml_direct, + + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "llava_v0": conv_llava_v0, + "v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + + "mpt": conv_mpt, +} + + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/dialoggen/llava/mm_utils.py b/dialoggen/llava/mm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9717a81f8e481a452bbd99f4aa0baad95d0306df --- /dev/null +++ b/dialoggen/llava/mm_utils.py @@ -0,0 +1,247 @@ +from PIL import Image +from io import BytesIO +import base64 +import torch +import math +import ast + +from transformers import StoppingCriteria +from llava.constants import IMAGE_TOKEN_INDEX + + +def select_best_resolution(original_size, possible_resolutions): + """ + Selects the best resolution from a list of possible resolutions based on the original size. + + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). + """ + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + + for width, height in possible_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + +def resize_and_pad_image(image, target_resolution): + """ + Resize and pad an image to a target resolution while maintaining aspect ratio. + + Args: + image (PIL.Image.Image): The input image. + target_resolution (tuple): The target resolution (width, height) of the image. + + Returns: + PIL.Image.Image: The resized and padded image. 
+ """ + original_width, original_height = image.size + target_width, target_height = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + # Resize the image + resized_image = image.resize((new_width, new_height)) + + new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + new_image.paste(resized_image, (paste_x, paste_y)) + + return new_image + + +def divide_to_patches(image, patch_size): + """ + Divides an image into patches of a specified size. + + Args: + image (PIL.Image.Image): The input image. + patch_size (int): The size of each patch. + + Returns: + list: A list of PIL.Image.Image objects representing the patches. + """ + patches = [] + width, height = image.size + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + box = (j, i, j + patch_size, i + patch_size) + patch = image.crop(box) + patches.append(patch) + + return patches + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (tuple): The size of the input image in the format (width, height). + grid_pinpoints (str): A string representation of a list of possible resolutions. + patch_size (int): The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). + """ + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + width, height = select_best_resolution(image_size, possible_resolutions) + return width // patch_size, height // patch_size + + +def process_anyres_image(image, processor, grid_pinpoints): + """ + Process an image with variable resolutions. + + Args: + image (PIL.Image.Image): The input image to be processed. + processor: The image processor object. + grid_pinpoints (str): A string representation of a list of possible resolutions. + + Returns: + torch.Tensor: A tensor containing the processed image patches. 
+ """ + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + best_resolution = select_best_resolution(image.size, possible_resolutions) + image_padded = resize_and_pad_image(image, best_resolution) + + patches = divide_to_patches(image_padded, processor.crop_size['height']) + + image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge'])) + + image_patches = [image_original_resize] + patches + image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0] + for image_patch in image_patches] + return torch.stack(image_patches, dim=0) + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_images(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == 'pad': + for image in images: + image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) + image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + new_images.append(image) + elif image_aspect_ratio == "anyres": + for image in images: + image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints) + new_images.append(image) + else: + return image_processor(images, return_tensors='pt')['pixel_values'] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + +def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): + prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] + + def insert_separator(X, sep): + return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] + + input_ids = [] + offset = 0 + if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: + offset = 1 + input_ids.append(prompt_chunks[0][0]) + + for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): + input_ids.extend(x[offset:]) + + if return_tensors is not None: + if return_tensors == 'pt': + return torch.tensor(input_ids, dtype=torch.long) + raise ValueError(f'Unsupported tensor type: {return_tensors}') + return input_ids + + +def get_model_name_from_path(model_path): + model_path = model_path.strip("/") + model_paths = model_path.split("/") + if model_paths[-1].startswith('checkpoint-'): + return model_paths[-2] + "_" + model_paths[-1] + else: + return model_paths[-1] + +class KeywordsStoppingCriteria(StoppingCriteria): + def __init__(self, keywords, tokenizer, input_ids): + self.keywords = keywords + self.keyword_ids = [] + self.max_keyword_len = 0 + for keyword in keywords: + cur_keyword_ids = tokenizer(keyword).input_ids + if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: + cur_keyword_ids = cur_keyword_ids[1:] + if len(cur_keyword_ids) > self.max_keyword_len: + self.max_keyword_len = len(cur_keyword_ids) + 
self.keyword_ids.append(torch.tensor(cur_keyword_ids)) + self.tokenizer = tokenizer + self.start_len = input_ids.shape[1] + + def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) + self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] + for keyword_id in self.keyword_ids: + truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] + if torch.equal(truncated_output_ids, keyword_id): + return True + outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] + for keyword in self.keywords: + if keyword in outputs: + return True + return False + + def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + outputs = [] + for i in range(output_ids.shape[0]): + outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) + return all(outputs) diff --git a/dialoggen/llava/model/__init__.py b/dialoggen/llava/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dbd91789f0cde61dd13a7f9a5f7a69488ad07279 --- /dev/null +++ b/dialoggen/llava/model/__init__.py @@ -0,0 +1,6 @@ +try: + from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig + from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig + from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig +except: + pass diff --git a/dialoggen/llava/model/apply_delta.py b/dialoggen/llava/model/apply_delta.py new file mode 100644 index 0000000000000000000000000000000000000000..666dd9691bde7d54ddf2871e311d6f621e29f099 --- /dev/null +++ b/dialoggen/llava/model/apply_delta.py @@ -0,0 +1,48 @@ +""" +Usage: +python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta +""" +import argparse + +import torch +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava import LlavaLlamaForCausalLM + + +def apply_delta(base_model_path, target_model_path, delta_path): + print("Loading base model") + base = AutoModelForCausalLM.from_pretrained( + base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Loading delta") + delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) + + print("Applying delta") + for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): + if name not in base.state_dict(): + assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' + continue + if param.data.shape == base.state_dict()[name].shape: + param.data += base.state_dict()[name] + else: + assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ + f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' + bparam = base.state_dict()[name] + param.data[:bparam.shape[0], :bparam.shape[1]] += bparam + + print("Saving target model") + delta.save_pretrained(target_model_path) + delta_tokenizer.save_pretrained(target_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--target-model-path", type=str, required=True) + parser.add_argument("--delta-path", type=str, required=True) + + args = parser.parse_args() 
+ + apply_delta(args.base_model_path, args.target_model_path, args.delta_path) diff --git a/dialoggen/llava/model/builder.py b/dialoggen/llava/model/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..2a775132dc97564278b1c1342cf9e5054fb66e0a --- /dev/null +++ b/dialoggen/llava/model/builder.py @@ -0,0 +1,167 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import warnings +import shutil + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig +import torch +from llava.model import * +from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + + +def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, llava_type_model=True, **kwargs): + kwargs = {"device_map": device_map, **kwargs} + + if device != "cuda": + kwargs['device_map'] = {"": device} + + if load_8bit: + kwargs['load_in_8bit'] = True + elif load_4bit: + kwargs['load_in_4bit'] = True + kwargs['quantization_config'] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4' + ) + else: + kwargs['torch_dtype'] = torch.float16 + + if use_flash_attn: + kwargs['attn_implementation'] = 'flash_attention_2' + + if 'llava' in model_name.lower(): + # Load LLaVA model + if 'lora' in model_name.lower() and model_base is None: + warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. 
Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') + if 'lora' in model_name.lower() and model_base is not None: + from llava.model.language_model.llava_llama import LlavaConfig + lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + print('Loading LLaVA from base model...') + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) + token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features + if model.lm_head.weight.shape[0] != token_num: + model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + + print('Loading additional LLaVA weights...') + if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): + non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') + else: + # this is probably from HF Hub + from huggingface_hub import hf_hub_download + def load_from_hf(repo_id, filename, subfolder=None): + cache_file = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder) + return torch.load(cache_file, map_location='cpu') + non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') + non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} + if any(k.startswith('model.model.') for k in non_lora_trainables): + non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} + model.load_state_dict(non_lora_trainables, strict=False) + + from peft import PeftModel + print('Loading LoRA weights...') + model = PeftModel.from_pretrained(model, model_path) + print('Merging LoRA weights...') + model = model.merge_and_unload() + print('Model is loaded...') + elif model_base is not None: + # this may be mm projector only + print('Loading LLaVA from base model...') + if 'mpt' in model_name.lower(): + if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): + shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) + cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + cfg_pretrained = AutoConfig.from_pretrained(model_path) + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + + mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') + mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} + model.load_state_dict(mm_projector_weights, strict=False) + else: + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + elif 'mistral' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = LlavaMistralForCausalLM.from_pretrained( + 
model_path, + low_cpu_mem_usage=True, + **kwargs + ) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = LlavaLlamaForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + else: + # Load language model + if model_base is not None: + # PEFT model + from peft import PeftModel + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs) + print(f"Loading LoRA weights from {model_path}") + model = PeftModel.from_pretrained(model, model_path) + print(f"Merging weights") + model = model.merge_and_unload() + print('Convert to FP16...') + model.to(torch.float16) + else: + use_fast = False + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + + image_processor = None + + if llava_type_model: + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device_map) + if device_map != 'auto': + vision_tower.to(device=device_map, dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return tokenizer, model, image_processor, context_len diff --git a/dialoggen/llava/model/consolidate.py b/dialoggen/llava/model/consolidate.py new file mode 100644 index 0000000000000000000000000000000000000000..1e324210e229eeba23b75791bba82df7c6e639eb --- /dev/null +++ b/dialoggen/llava/model/consolidate.py @@ -0,0 +1,29 @@ +""" +Usage: +python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate +""" +import argparse + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model import * +from llava.model.utils import auto_upgrade + + +def consolidate_ckpt(src_path, dst_path): + print("Loading model") + auto_upgrade(src_path) + src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) + src_model.save_pretrained(dst_path) + src_tokenizer.save_pretrained(dst_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--src", type=str, required=True) + parser.add_argument("--dst", type=str, required=True) + + args = parser.parse_args() + + consolidate_ckpt(args.src, args.dst) diff --git a/dialoggen/llava/model/language_model/llava_llama.py b/dialoggen/llava/model/language_model/llava_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..069d0d1c10da42f5d278598e8534f166d1f9f5ff --- /dev/null +++ 
b/dialoggen/llava/model/language_model/llava_llama.py @@ -0,0 +1,158 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from transformers import AutoConfig, AutoModelForCausalLM, \ + LlamaConfig, LlamaModel, LlamaForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.generation.utils import GenerateOutput + +from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaConfig(LlamaConfig): + model_type = "llava_llama" + + +class LlavaLlamaModel(LlavaMetaModel, LlamaModel): + config_class = LlavaConfig + + def __init__(self, config: LlamaConfig): + super(LlavaLlamaModel, self).__init__(config) + + +class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaConfig + + def __init__(self, config): + super(LlamaForCausalLM, self).__init__(config) + self.model = LlavaLlamaModel(config) + self.pretraining_tp = config.pretraining_tp + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + image_sizes + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + ( + inputs, + position_ids, + attention_mask, + _, + 
inputs_embeds, + _ + ) = self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + image_sizes=image_sizes + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate( + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, + inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + if images is not None: + inputs['images'] = images + if image_sizes is not None: + inputs['image_sizes'] = image_sizes + return inputs + +AutoConfig.register("llava_llama", LlavaConfig) +AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) diff --git a/dialoggen/llava/model/language_model/llava_mistral.py b/dialoggen/llava/model/language_model/llava_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..0def682ea3c497e36aa85f1c53eb2cfab6e2fb87 --- /dev/null +++ b/dialoggen/llava/model/language_model/llava_mistral.py @@ -0,0 +1,158 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss + +from transformers import AutoConfig, AutoModelForCausalLM, \ + MistralConfig, MistralModel, MistralForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.generation.utils import GenerateOutput + +from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaMistralConfig(MistralConfig): + model_type = "llava_mistral" + + +class LlavaMistralModel(LlavaMetaModel, MistralModel): + config_class = LlavaMistralConfig + + def __init__(self, config: MistralConfig): + super(LlavaMistralModel, self).__init__(config) + + +class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaMistralConfig + + def __init__(self, config): + super(MistralForCausalLM, self).__init__(config) + self.model = LlavaMistralModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + image_sizes + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _ + ) = self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + image_sizes=image_sizes + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate( + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + **kwargs + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, + inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, 
inputs_embeds=inputs_embeds, **kwargs + ) + if images is not None: + inputs['images'] = images + if image_sizes is not None: + inputs['image_sizes'] = image_sizes + return inputs + +AutoConfig.register("llava_mistral", LlavaMistralConfig) +AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM) diff --git a/dialoggen/llava/model/language_model/llava_mpt.py b/dialoggen/llava/model/language_model/llava_mpt.py new file mode 100644 index 0000000000000000000000000000000000000000..02e5237ece031af23fcd76b5b4e0d9b0bc5f55cc --- /dev/null +++ b/dialoggen/llava/model/language_model/llava_mpt.py @@ -0,0 +1,97 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional, Tuple + +import torch + +from transformers import AutoConfig, AutoModelForCausalLM, \ + MptConfig, MptForCausalLM, MptModel +from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM + + +class LlavaMptConfig(MptConfig): + model_type = "llava_mpt" + + +class LlavaMptModel(LlavaMetaModel, MptModel): + config_class = LlavaMptConfig + + def __init__(self, config: MptConfig): + config.hidden_size = config.d_model + super(LlavaMptModel, self).__init__(config) + + def embed_tokens(self, x): + return self.wte(x) + + +class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): + config_class = LlavaMptConfig + supports_gradient_checkpointing = True + + def __init__(self, config): + super(MptForCausalLM, self).__init__(config) + + self.transformer = LlavaMptModel(config) + self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.transformer + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LlavaMptModel): + module.gradient_checkpointing = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + images=None): + + input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) + + return super().forward( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): + images = kwargs.pop("images", None) + _inputs = super().prepare_inputs_for_generation( + input_ids, 
past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs + ) + _inputs['images'] = images + return _inputs + + +AutoConfig.register("llava_mpt", LlavaMptConfig) +AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) diff --git a/dialoggen/llava/model/llava_arch.py b/dialoggen/llava/model/llava_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..7b299d3c416a0f5ffea3d03d7be5a32b77319533 --- /dev/null +++ b/dialoggen/llava/model/llava_arch.py @@ -0,0 +1,368 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from abc import ABC, abstractmethod + +import torch +import torch.nn as nn + +from .multimodal_encoder.builder import build_vision_tower +from .multimodal_projector.builder import build_vision_projector + +from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + +from llava.mm_utils import get_anyres_image_grid_shape + + +class LlavaMetaModel: + + def __init__(self, config): + super(LlavaMetaModel, self).__init__(config) + + if hasattr(config, "mm_vision_tower"): + self.vision_tower = build_vision_tower(config, delay_load=True) + self.mm_projector = build_vision_projector(config) + + if 'unpad' in getattr(config, 'mm_patch_merge_type', ''): + self.image_newline = nn.Parameter( + torch.empty(config.hidden_size, dtype=self.dtype) + ) + + def get_vision_tower(self): + vision_tower = getattr(self, 'vision_tower', None) + if type(vision_tower) is list: + vision_tower = vision_tower[0] + return vision_tower + + def initialize_vision_modules(self, model_args, fsdp=None): + vision_tower = model_args.vision_tower + mm_vision_select_layer = model_args.mm_vision_select_layer + mm_vision_select_feature = model_args.mm_vision_select_feature + pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter + mm_patch_merge_type = model_args.mm_patch_merge_type + + self.config.mm_vision_tower = vision_tower + + if self.get_vision_tower() is None: + vision_tower = build_vision_tower(model_args) + + if fsdp is not None and len(fsdp) > 0: + self.vision_tower = [vision_tower] + else: + self.vision_tower = vision_tower + else: + if fsdp is not None and len(fsdp) > 0: + vision_tower = self.vision_tower[0] + else: + vision_tower = self.vision_tower + vision_tower.load_model() + + self.config.use_mm_proj = True + self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear') + self.config.mm_hidden_size = vision_tower.hidden_size + self.config.mm_vision_select_layer = mm_vision_select_layer + self.config.mm_vision_select_feature = mm_vision_select_feature + self.config.mm_patch_merge_type = mm_patch_merge_type + + if getattr(self, 'mm_projector', None) is None: + self.mm_projector = build_vision_projector(self.config) + + if 'unpad' in mm_patch_merge_type: + embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype)) + self.image_newline = nn.Parameter( + torch.randn(self.config.hidden_size, dtype=self.dtype) * 
embed_std + ) + else: + # In case it is frozen by LoRA + for p in self.mm_projector.parameters(): + p.requires_grad = True + + if pretrain_mm_mlp_adapter is not None: + mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu') + def get_w(weights, keyword): + return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k} + + self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector')) + + +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format. + original_size (tuple): The original size of the image (height, width). + + Returns: + torch.Tensor: The unpadded image tensor. + """ + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor + + +class LlavaMetaForCausalLM(ABC): + + @abstractmethod + def get_model(self): + pass + + def get_vision_tower(self): + return self.get_model().get_vision_tower() + + def encode_images(self, images): + image_features = self.get_model().get_vision_tower()(images) + image_features = self.get_model().mm_projector(image_features) + return image_features + + def prepare_inputs_labels_for_multimodal( + self, input_ids, position_ids, attention_mask, past_key_values, labels, + images, image_sizes=None + ): + vision_tower = self.get_vision_tower() + if vision_tower is None or images is None or input_ids.shape[1] == 1: + return input_ids, position_ids, attention_mask, past_key_values, None, labels + + if type(images) is list or images.ndim == 5: + if type(images) is list: + images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] + concat_images = torch.cat([image for image in images], dim=0) + image_features = self.encode_images(concat_images) + split_sizes = [image.shape[0] for image in images] + image_features = torch.split(image_features, split_sizes, dim=0) + mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat') + image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square') + if mm_patch_merge_type == 'flat': + image_features = [x.flatten(0, 1) for x in image_features] + elif mm_patch_merge_type.startswith('spatial'): + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.get_vision_tower().num_patches_per_side + assert height * width == base_image_feature.shape[0] + if image_aspect_ratio == 'anyres': + num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + else: + raise NotImplementedError + if 'unpad' in 
mm_patch_merge_type: + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) + ), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if 'unpad' in mm_patch_merge_type: + image_feature = torch.cat(( + image_feature, + self.model.image_newline[None].to(image_feature.device) + ), dim=0) + new_image_features.append(image_feature) + image_features = new_image_features + else: + raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") + else: + image_features = self.encode_images(images) + + # TODO: image start / end is not implemented here to support pretraining. + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + raise NotImplementedError + + # Let's just add dummy tensors if they do not exist, + # it is a headache to deal with None all the time. + # But it is not ideal, and if you have a better idea, + # please open an issue / submit a PR, thanks. + _labels = labels + _position_ids = position_ids + _attention_mask = attention_mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + else: + attention_mask = attention_mask.bool() + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + if labels is None: + labels = torch.full_like(input_ids, IGNORE_INDEX) + + # remove the padding using attention_mask -- FIXME + _input_ids = input_ids + input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)] + labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)] + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum() + if num_images == 0: + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]] + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]]) + cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + 
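+ # Interleave text and vision: after each text chunk, the visual features for the
+ # next image placeholder (IMAGE_TOKEN_INDEX) are spliced in, and their label
+ # positions are filled with IGNORE_INDEX so they never contribute to the LM loss.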
cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + cur_image_features = image_features[cur_image_idx] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype)) + + cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds] + + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + # Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None) + if tokenizer_model_max_length is not None: + new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds] + new_labels = [x[:tokenizer_model_max_length] for x in new_labels] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = cur_new_embed.shape[0] + if getattr(self.config, 'tokenizer_padding_side', 'right') == "left": + new_input_embeds_padded.append(torch.cat(( + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), + cur_new_embed + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + else: + new_input_embeds_padded.append(torch.cat(( + cur_new_embed, + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device) + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + + new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) + + if _labels is None: + new_labels = None + else: + new_labels = new_labels_padded + + if _attention_mask is None: + attention_mask = None + else: + attention_mask = attention_mask.to(dtype=_attention_mask.dtype) + + if _position_ids is None: + position_ids = None + + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels + + def initialize_vision_tokenizer(self, model_args, tokenizer): + if model_args.mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if model_args.mm_use_im_start_end: + num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + 
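+ # Embeddings for the newly added special tokens are initialized to the mean of the
+ # existing vocabulary rows; the output (lm_head) embeddings below get the same treatment.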
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = True + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False + + if model_args.pretrain_mm_mlp_adapter: + mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu') + embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight'] + assert num_new_tokens == 2 + if input_embeddings.shape == embed_tokens_weight.shape: + input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:] + elif embed_tokens_weight.shape[0] == num_new_tokens: + input_embeddings[-num_new_tokens:] = embed_tokens_weight + else: + raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.") + elif model_args.mm_use_im_patch_token: + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = False + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False diff --git a/dialoggen/llava/model/make_delta.py b/dialoggen/llava/model/make_delta.py new file mode 100644 index 0000000000000000000000000000000000000000..4ae55d59c2c8bab80299272314a41bbeb959d8ed --- /dev/null +++ b/dialoggen/llava/model/make_delta.py @@ -0,0 +1,52 @@ +""" +Usage: +python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta +""" +import argparse + +import torch +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model.utils import auto_upgrade + + +def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): + print("Loading base model") + base = AutoModelForCausalLM.from_pretrained( + base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Loading target model") + auto_upgrade(target_model_path) + target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Calculating delta") + for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): + if name not in base.state_dict(): + assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' + continue + if param.data.shape == base.state_dict()[name].shape: + param.data -= base.state_dict()[name] + else: + assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' + bparam = base.state_dict()[name] + param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam + + print("Saving delta") + if hub_repo_id: + kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} + else: + kwargs = {} + target.save_pretrained(delta_path, **kwargs) + target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) + target_tokenizer.save_pretrained(delta_path, **kwargs) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--target-model-path", type=str, required=True) + parser.add_argument("--delta-path", type=str, required=True) + 
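+ # --hub-repo-id is optional; when provided, save_pretrained() is called with
+ # push_to_hub=True so the delta weights and tokenizer are uploaded to that repo.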
parser.add_argument("--hub-repo-id", type=str, default=None) + args = parser.parse_args() + + make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) diff --git a/dialoggen/llava/model/multimodal_encoder/builder.py b/dialoggen/llava/model/multimodal_encoder/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..e89507c49df413945453959d48b51a71b9031ef7 --- /dev/null +++ b/dialoggen/llava/model/multimodal_encoder/builder.py @@ -0,0 +1,11 @@ +import os +from .clip_encoder import CLIPVisionTower + + +def build_vision_tower(vision_tower_cfg, **kwargs): + vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) + is_absolute_path_exists = os.path.exists(vision_tower) + if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: + return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + + raise ValueError(f'Unknown vision tower: {vision_tower}') diff --git a/dialoggen/llava/model/multimodal_encoder/clip_encoder.py b/dialoggen/llava/model/multimodal_encoder/clip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..89ce2d9e084a2b324dc5e19fb2d2d889f3a60602 --- /dev/null +++ b/dialoggen/llava/model/multimodal_encoder/clip_encoder.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn + +from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig + + +class CLIPVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = args.mm_vision_select_layer + self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') + + if not delay_load: + self.load_model() + elif getattr(args, 'unfreeze_mm_vision_tower', False): + self.load_model() + else: + self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) + + def load_model(self, device_map=None): + if self.is_loaded: + print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name)) + return + + self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) + self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def feature_select(self, image_forward_outs): + image_features = image_forward_outs.hidden_states[self.select_layer] + if self.select_feature == 'patch': + image_features = image_features[:, 1:] + elif self.select_feature == 'cls_patch': + image_features = image_features + else: + raise ValueError(f'Unexpected select feature: {self.select_feature}') + return image_features + + @torch.no_grad() + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property 
+ def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def config(self): + if self.is_loaded: + return self.vision_tower.config + else: + return self.cfg_only + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches_per_side(self): + return self.config.image_size // self.config.patch_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 diff --git a/dialoggen/llava/model/multimodal_projector/builder.py b/dialoggen/llava/model/multimodal_projector/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..31cd4f48e6055cd6d00a162af30b1c8139e26b57 --- /dev/null +++ b/dialoggen/llava/model/multimodal_projector/builder.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn +import re + + +class IdentityMap(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + @property + def config(self): + return {"mm_projector_type": 'identity'} + + +class SimpleResBlock(nn.Module): + def __init__(self, channels): + super().__init__() + self.pre_norm = nn.LayerNorm(channels) + + self.proj = nn.Sequential( + nn.Linear(channels, channels), + nn.GELU(), + nn.Linear(channels, channels) + ) + def forward(self, x): + x = self.pre_norm(x) + return x + self.proj(x) + + +def build_vision_projector(config, delay_load=False, **kwargs): + projector_type = getattr(config, 'mm_projector_type', 'linear') + + if projector_type == 'linear': + return nn.Linear(config.mm_hidden_size, config.hidden_size) + + mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) + if mlp_gelu_match: + mlp_depth = int(mlp_gelu_match.group(1)) + modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.hidden_size, config.hidden_size)) + return nn.Sequential(*modules) + + if projector_type == 'identity': + return IdentityMap() + + raise ValueError(f'Unknown projector type: {projector_type}') diff --git a/dialoggen/llava/model/utils.py b/dialoggen/llava/model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2563f89c6cedf5e73508afec8f9979105df9b745 --- /dev/null +++ b/dialoggen/llava/model/utils.py @@ -0,0 +1,20 @@ +from transformers import AutoConfig + + +def auto_upgrade(config): + cfg = AutoConfig.from_pretrained(config) + if 'llava' in config and 'llava' not in cfg.model_type: + assert cfg.model_type == 'llama' + print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") + print("You must upgrade the checkpoint to the new code base (this can be done automatically).") + confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") + if confirm.lower() in ["y", "yes"]: + print("Upgrading checkpoint...") + assert len(cfg.architectures) == 1 + setattr(cfg.__class__, "model_type", "llava") + cfg.architectures[0] = 'LlavaLlamaForCausalLM' + cfg.save_pretrained(config) + print("Checkpoint upgraded.") + else: + print("Checkpoint upgrade aborted.") + exit(1) diff --git a/dialoggen/llava/utils.py b/dialoggen/llava/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4006cf917e26c365080b0844c56fab78c48457c0 --- /dev/null +++ b/dialoggen/llava/utils.py @@ -0,0 +1,126 @@ +import datetime +import logging +import logging.handlers +import os +import sys + +import requests + +from llava.constants import LOGDIR + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." + +handler = None + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True, encoding='UTF-8') + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. + """ + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. + if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. + """ + import torch + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + +def violates_moderation(text): + """ + Check whether the text violates OpenAI moderation API. 
+ """ + url = "https://api.openai.com/v1/moderations" + headers = {"Content-Type": "application/json", + "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} + text = text.replace("\n", "") + data = "{" + '"input": ' + f'"{text}"' + "}" + data = data.encode("utf-8") + try: + ret = requests.post(url, headers=headers, data=data, timeout=5) + flagged = ret.json()["results"][0]["flagged"] + except requests.exceptions.RequestException as e: + flagged = False + except KeyError as e: + flagged = False + + return flagged + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" diff --git a/en.csv b/en.csv new file mode 100644 index 0000000000000000000000000000000000000000..f70f662bbb33b661b9260c337cc494711fb617fc --- /dev/null +++ b/en.csv @@ -0,0 +1,22 @@ +key,value +size,Size +sampler,Sampler +prompt,Prompt +default prompt,"A cute cat" +negative_prompt,Negative Prompt +seed,Seed +cfg,CFG Scale +infer steps,Sampling Steps +batch size,Batch Size +width cond,Width Cond +height cond,Height Cond +enhance,Prompt Enhancement +run,Submit +square,Square(1024x1024) +landscape,Landscape(1280x768) +portrait,Portrait(768x1280) +accordion,Advanced Options +generated image,HunYuanDiT Generated Image +examples,More Examples +title,Hunyuan-DiT +desc,A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..f43b45b7fbd182dc2cb6c82dec2cbecc562b1a17 --- /dev/null +++ b/environment.yml @@ -0,0 +1,8 @@ +name: HunyuanDiT +channels: + - pytorch + - nvidia +dependencies: + - python=3.8.12 + - pytorch=1.13.1 + - pip diff --git a/example_prompts.txt b/example_prompts.txt new file mode 100644 index 0000000000000000000000000000000000000000..f590be43c46e4c50f91a28f84821047241066df2 --- /dev/null +++ b/example_prompts.txt @@ -0,0 +1,28 @@ +一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影 +湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。 +太阳微微升起,花园里的玫瑰花瓣上露珠晶莹剔透,一只瓢虫正在爬向露珠,背景是清晨的花园,微距镜头 +一位女明星,中国人,头发是黑色,衣服是纯白色短袖,人物风格清新,城市背景 +后印象主义风格,一条古老的石板路上面散落着金黄色的树叶。路旁的风车在静谧地转动,后面竖着两个风车。背景是一片向日葵田,蓝天上飘着几朵白云 +一幅细致的油画描绘了一只年轻獾轻轻嗅着一朵明亮的黄色玫瑰时错综复杂的皮毛。背景是一棵大树干的粗糙纹理,獾的爪子轻轻地挖进树皮。在柔和的背景中,一个宁静的瀑布倾泻而下,它的水在绿色植物中闪烁着蓝色。 +渔舟唱晚 +请将杞人忧天的样子画出来 +一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。 +插画风格,一只狐狸和一只刺猬坐在水边的石头上,刺猬手里拿着一杯茶,狐狸旁边放着一个玻璃杯。周围是茂密的绿色植物和树木,阳光透过树叶洒在水面上,画面宁静温馨。 +泥塑风格,一座五彩斑斓的花园在画面中展现,各种各样的花朵,绿色的叶子和一只正在嬉戏的小猫形成了一幅生动的图像,背景是蓝天和白云 +枯藤老树昏鸦,小桥流水人家 +一张细致的照片捕捉到了一尊雕像的形象,这尊雕像酷似一位古代法老,头上出人意料地戴着一副青铜蒸汽朋克护目镜。这座雕像穿着复古时髦,一件清爽的白色T恤和一件合身的黑色皮夹克,与传统的头饰形成鲜明对比。背景是简单的纯色,突出了雕像的非传统服装和蒸汽朋克眼镜的复杂细节。 +一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头, +一只可爱的猫, 细节真实, 摄影 +飞流直下三千尺,疑是银河落九天 +成语“鲤鱼跃龙门” +一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子 +九寨沟 +摄影风格,在画面中心是一盘热气腾腾的麻婆豆腐,豆腐呈白色,上面撒着一层红色的辣酱,有些许绿色的葱花点缀,背景是深色木质餐桌,桌子上放有辣椒和葱花作为点缀。 +一位年轻女子站在春季的火车站月台上。她身着蓝灰色长风衣,白色衬衫。她的深棕色头发扎成低马尾,几缕碎发随风飘扬。她的眼神充满期待,阳光洒在她温暖的脸庞上。 +一只优雅的白鹤在湖边静静地站立,它的身体纯白色,翅膀轻轻展开,背景是湖面和远处的山脉 +国画风格,苏州园林中的小桥流水,周围是郁郁葱葱的树,池塘里有几朵绽放的荷花,背景是宁静的江南水乡 +现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景 +醉后不知天在水,满船清梦压星河 +长城 +一个亚洲中年男士在夕阳下的公园长椅上静坐。他穿着一件深蓝色的针织毛衣和灰色裤子。他的头发略显花白,手中拿着一本敞开的书。面带微笑,眼神温和,周围是落日余晖和四周的绿树。 +风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景 \ No newline at end of file diff --git a/hydit/__init__.py b/hydit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/hydit/config.py b/hydit/config.py new file mode 100644 index 0000000000000000000000000000000000000000..153714ac5f3b972f753ca307e2baed2aaf3f0d18 --- /dev/null +++ b/hydit/config.py @@ -0,0 +1,67 @@ +import argparse + +from .constants import * +from .modules.models import HUNYUAN_DIT_CONFIG + + +def get_args(default_args=None): + parser = argparse.ArgumentParser() + + # Basic + parser.add_argument("--prompt", type=str, default="一只小猫", help="The prompt for generating images.") + parser.add_argument("--model-root", type=str, default="ckpts", help="Model root path.") + parser.add_argument("--image-size", type=int, nargs='+', default=[1024, 1024], + help='Image size (h, w). If a single value is provided, the image will be treated to ' + '(value, value).') + parser.add_argument("--infer-mode", type=str, choices=["fa", "torch", "trt"], default="torch", + help="Inference mode") + + # HunYuan-DiT + parser.add_argument("--model", type=str, choices=list(HUNYUAN_DIT_CONFIG.keys()), default='DiT-g/2') + parser.add_argument("--norm", type=str, default="layer", help="Normalization layer type") + parser.add_argument("--load-key", type=str, choices=["ema", "module"], default="ema", help="Load model key for HunYuanDiT checkpoint.") + parser.add_argument('--size-cond', type=int, nargs='+', default=[1024, 1024], + help="Size condition used in sampling. 2 values are required for height and width. " + "If a single value is provided, the image will be treated to (value, value).") + parser.add_argument("--cfg-scale", type=float, default=6.0, help="Guidance scale for classifier-free.") + + # Prompt enhancement + parser.add_argument("--enhance", action="store_true", help="Enhance prompt with dialoggen.") + parser.add_argument("--no-enhance", dest="enhance", action="store_false") + parser.set_defaults(enhance=True) + + # Diffusion + parser.add_argument("--learn-sigma", action="store_true", help="Learn extra channels for sigma.") + parser.add_argument("--no-learn-sigma", dest="learn_sigma", action="store_false") + parser.set_defaults(learn_sigma=True) + parser.add_argument("--predict-type", type=str, choices=list(PREDICT_TYPE), default="v_prediction", + help="Diffusion predict type") + parser.add_argument("--noise-schedule", type=str, choices=list(NOISE_SCHEDULES), default="scaled_linear", + help="Noise schedule") + parser.add_argument("--beta-start", type=float, default=0.00085, help="Beta start value") + parser.add_argument("--beta-end", type=float, default=0.03, help="Beta end value") + + # Text condition + parser.add_argument("--text-states-dim", type=int, default=1024, help="Hidden size of CLIP text encoder.") + parser.add_argument("--text-len", type=int, default=77, help="Token length of CLIP text encoder output.") + parser.add_argument("--text-states-dim-t5", type=int, default=2048, help="Hidden size of CLIP text encoder.") + parser.add_argument("--text-len-t5", type=int, default=256, help="Token length of T5 text encoder output.") + parser.add_argument("--negative", type=str, default=None, help="Negative prompt.") + + # Acceleration + parser.add_argument("--use_fp16", action="store_true", help="Use FP16 precision.") + parser.add_argument("--no-fp16", dest="use_fp16", action="store_false") + parser.set_defaults(use_fp16=True) + + # Sampling + parser.add_argument("--batch-size", type=int, default=1, help="Per-GPU batch size") + parser.add_argument("--sampler", type=str, choices=SAMPLER_FACTORY, default="ddpm", help="Diffusion sampler") + parser.add_argument("--infer-steps", type=int, default=100, 
help="Inference steps") + parser.add_argument('--seed', type=int, default=42, help="A seed for all the prompts.") + + # App + parser.add_argument("--lang", type=str, default="zh", choices=["zh", "en"], help="Language") + + args = parser.parse_args(default_args) + + return args diff --git a/hydit/constants.py b/hydit/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..f2cdf81c06ad8f5b995577abbc9c70b845718f95 --- /dev/null +++ b/hydit/constants.py @@ -0,0 +1,62 @@ +# ======================================================= +NOISE_SCHEDULES = { + "linear", + "scaled_linear", + "squaredcos_cap_v2", +} + +PREDICT_TYPE = { + "epsilon", + "sample", + "v_prediction", +} + +# ======================================================= +NEGATIVE_PROMPT = '错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,' + + +# ======================================================= +# Constants about models +# ======================================================= + +SAMPLER_FACTORY = { + 'ddpm': { + 'scheduler': 'DDPMScheduler', + 'name': 'DDPM', + 'kwargs': { + 'steps_offset': 1, + 'clip_sample': False, + 'clip_sample_range': 1.0, + 'beta_schedule': 'scaled_linear', + 'beta_start': 0.00085, + 'beta_end': 0.03, + 'prediction_type': 'v_prediction', + } + }, + 'ddim': { + 'scheduler': 'DDIMScheduler', + 'name': 'DDIM', + 'kwargs': { + 'steps_offset': 1, + 'clip_sample': False, + 'clip_sample_range': 1.0, + 'beta_schedule': 'scaled_linear', + 'beta_start': 0.00085, + 'beta_end': 0.03, + 'prediction_type': 'v_prediction', + } + }, + 'dpmms': { + 'scheduler': 'DPMSolverMultistepScheduler', + 'name': 'DPMMS', + 'kwargs': { + 'beta_schedule': 'scaled_linear', + 'beta_start': 0.00085, + 'beta_end': 0.03, + 'prediction_type': 'v_prediction', + 'trained_betas': None, + 'solver_order': 2, + 'algorithm_type': 'dpmsolver++', + } + }, +} diff --git a/hydit/diffusion/__init__.py b/hydit/diffusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hydit/diffusion/pipeline.py b/hydit/diffusion/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..d030471a3b25d5871a671510734430f42c986e1a --- /dev/null +++ b/hydit/diffusion/pipeline.py @@ -0,0 +1,830 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import PIL +import numpy as np +import torch +import torchvision.transforms as T +from diffusers.configuration_utils import FrozenDict +from diffusers.image_processor import VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + PIL_INTERPOLATION, + deprecate, + logging, + replace_example_docstring, +) +from diffusers.utils.torch_utils import randn_tensor +from transformers import BertModel, BertTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ..modules.models import HunYuanDiT + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionImg2ImgPipeline + + >>> device = "cuda" + >>> model_id_or_path = "runwayml/stable-diffusion-v1-5" + >>> pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) + >>> pipe = pipe.to(device) + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + + >>> response = requests.get(url) + >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> init_image = init_image.resize((768, 512)) + + >>> prompt = "A fantasy landscape, trending on artstation" + + >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images + >>> images[0].save("fantasy_landscape.png") + ``` +""" +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) 
instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +class StableDiffusionPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): + r""" + Pipeline for text-guided image-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder (Optional[`~transformers.BertModel`, `~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (Optional[`~transformers.BertTokenizer`, `~transformers.CLIPTokenizer`]): + A `BertTokenizer` or `CLIPTokenizer` to tokenize text. + unet (Optional[`HunYuanDiT`, `UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ """ + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: Union[BertModel, CLIPTextModel], + tokenizer: Union[BertTokenizer, CLIPTokenizer], + unet: Union[HunYuanDiT, UNet2DConditionModel], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + progress_bar_config: Dict[str, Any] = None, + embedder_t5=None, + infer_mode='torch', + ): + super().__init__() + + # ======================================================== + self.embedder_t5 = embedder_t5 + self.infer_mode = infer_mode + + # ======================================================== + if progress_bar_config is None: + progress_bar_config = {} + if not hasattr(self, '_progress_bar_config'): + self._progress_bar_config = {} + self._progress_bar_config.update(progress_bar_config) + # ======================================================== + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
+ ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + embedder=None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + embedder: + T5 embedder (including text encoder and tokenizer) + """ + if embedder is None: + text_encoder = self.text_encoder + tokenizer = self.tokenizer + max_length = self.tokenizer.model_max_length + else: + text_encoder = embedder.model + tokenizer = embedder.tokenizer + max_length = embedder.max_length + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode( + untruncated_ids[:, tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + prompt_embeds = text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + attention_mask = attention_mask.repeat(num_images_per_prompt, 1) + else: + attention_mask = None + + if text_encoder is not None: + prompt_embeds_dtype = text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + uncond_attention_mask = uncond_input.attention_mask.to(device) + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + attention_mask=uncond_attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + uncond_attention_mask = uncond_attention_mask.repeat(num_images_per_prompt, 1) + else: + uncond_attention_mask = None + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds, negative_prompt_embeds, attention_mask, uncond_attention_mask + + def _convert_to_rgb(self, image): + return image.convert('RGB') + + def image_transform(self, image_size=224): + transform = T.Compose([ + T.Resize((image_size, image_size), interpolation=T.InterpolationMode.BICUBIC), + self._convert_to_rgb, + T.ToTensor(), + T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ]) + return transform + + def encode_img(self, img, device, do_classifier_free_guidance): + # print('len', len(img)) + # print('img', img.size) + img = img[0] # TODO: support batch processing + image_preprocess = self.image_transform(224) + img_for_clip = image_preprocess(img) + # print('img_for_clip', img_for_clip.shape) + img_for_clip = img_for_clip.unsqueeze(0) + img_clip_embedding = self.img_encoder(img_for_clip.to(device)).to(dtype=torch.float16) + # print('img_clip_embedding_1_type', img_clip_embedding.dtype) + if do_classifier_free_guidance: + negative_img_clip_embedding = torch.zeros_like(img_clip_embedding) + return img_clip_embedding, negative_img_clip_embedding + + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + height: int, + width: int, + prompt: Union[str, List[str]] = None, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_t5: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds_t5: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + image_meta_size: Optional[torch.LongTensor] = None, + style: Optional[torch.LongTensor] = None, + progress: bool = True, + use_fp16: bool = False, + freqs_cis_img: Optional[tuple] = None, + learn_sigma: bool = True, + ): + r""" + The call function to the pipeline for generation. + + Args: + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 1.0): + Indicates extent to transform the reference `image`. Must be between 0 and 1. 
`image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor, + pred_x0: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
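+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are Flawed](
+                https://arxiv.org/pdf/2305.08891.pdf). Only applied when classifier-free guidance is enabled
+                and `guidance_rescale > 0.0`.
+            image_meta_size (`torch.LongTensor`, *optional*):
+                SDXL-style size-condition tensor forwarded to the model (source size, target size and crop
+                coordinates), already duplicated for classifier-free guidance. See `End2End.predict` for how
+                it is built.
+            style (`torch.LongTensor`, *optional*):
+                Style indices forwarded to the model's style embedder.
+            use_fp16 (`bool`, *optional*, defaults to `False`):
+                Whether to cast latents, timesteps and prompt embeddings to half precision before the model
+                forward pass.
+            freqs_cis_img (`tuple`, *optional*):
+                Precomputed (cos, sin) rotary position embeddings for the target resolution.
+            learn_sigma (`bool`, *optional*, defaults to `True`):
+                Whether the model predicts both noise and variance; when `True`, only the noise half of the
+                output channels is used for the scheduler step.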
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds, attention_mask, uncond_attention_mask = \ + self.encode_prompt(prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + prompt_embeds_t5, negative_prompt_embeds_t5, attention_mask_t5, uncond_attention_mask_t5 = \ + self.encode_prompt(prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds_t5, + negative_prompt_embeds=negative_prompt_embeds_t5, + lora_scale=text_encoder_lora_scale, + embedder=self.embedder_t5, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + attention_mask = torch.cat([uncond_attention_mask, attention_mask]) + prompt_embeds_t5 = torch.cat([negative_prompt_embeds_t5, prompt_embeds_t5]) + attention_mask_t5 = torch.cat([uncond_attention_mask_t5, attention_mask_t5]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents(batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # expand scalar t to 1-D tensor to match the 1st dim of latent_model_input + t_expand = torch.tensor([t] * latent_model_input.shape[0], device=latent_model_input.device) + + if use_fp16: + latent_model_input = latent_model_input.half() + t_expand = t_expand.half() + prompt_embeds = prompt_embeds.half() + ims = image_meta_size.half() if image_meta_size is not None else None + else: + ims = image_meta_size if image_meta_size is not None else None + + # predict the noise residual + if self.infer_mode in ["fa", "torch"]: + noise_pred = self.unet( + latent_model_input, + t_expand, + encoder_hidden_states=prompt_embeds, + text_embedding_mask=attention_mask, + encoder_hidden_states_t5=prompt_embeds_t5, + text_embedding_mask_t5=attention_mask_t5, + image_meta_size=ims, + style=style, + cos_cis_img=freqs_cis_img[0], + sin_cis_img=freqs_cis_img[1], + return_dict=False, + ) + elif self.infer_mode == "trt": + raise NotImplementedError("TensorRT model is not supported yet.") + else: + raise ValueError("[ERROR] invalid inference mode! please check your config file") + if learn_sigma: + noise_pred, _ = noise_pred.chunk(2, dim=1) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + results = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=True) + latents = results.prev_sample + pred_x0 = results.pred_original_sample if hasattr(results, 'pred_original_sample') else None + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents, pred_x0) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/hydit/inference.py b/hydit/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..85d4f2c52157f41b9000e585bd8deebb788d89c4 --- /dev/null +++ b/hydit/inference.py @@ -0,0 +1,389 @@ +import random +import time +from pathlib import Path + +import numpy as np +import torch + +# For reproducibility +# torch.backends.cudnn.benchmark = False +# torch.backends.cudnn.deterministic = True + +from diffusers import schedulers +from diffusers.models import AutoencoderKL +from loguru import logger +from transformers import BertModel, BertTokenizer +from transformers.modeling_utils import logger as tf_logger + +from .constants import SAMPLER_FACTORY, NEGATIVE_PROMPT +from .diffusion.pipeline import StableDiffusionPipeline +from .modules.models import HunYuanDiT, HUNYUAN_DIT_CONFIG +from .modules.posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop +from .modules.text_encoder import MT5Embedder +from .utils.tools import set_seeds + + +class Resolution: + def __init__(self, width, height): + self.width = width + self.height = height + + def __str__(self): + return f'{self.height}x{self.width}' + + +class ResolutionGroup: + def __init__(self): + self.data = [ + Resolution(768, 768), # 1:1 + Resolution(1024, 1024), # 1:1 + Resolution(1280, 1280), # 1:1 + Resolution(1024, 768), # 4:3 + Resolution(1152, 864), # 4:3 + Resolution(1280, 960), # 4:3 + Resolution(768, 1024), # 3:4 + Resolution(864, 1152), # 3:4 + Resolution(960, 1280), # 3:4 + Resolution(1280, 768), # 16:9 + Resolution(768, 1280), # 9:16 + ] + self.supported_sizes = set([(r.width, r.height) for r in self.data]) + + def is_valid(self, width, height): + return (width, height) in self.supported_sizes + + +STANDARD_RATIO = np.array([ + 1.0, # 1:1 + 4.0 / 3.0, # 4:3 + 3.0 / 4.0, # 3:4 + 16.0 / 9.0, # 16:9 + 9.0 / 16.0, # 9:16 +]) +STANDARD_SHAPE = [ + [(768, 768), (1024, 1024), (1280, 1280)], # 1:1 + [(1024, 768), (1152, 864), (1280, 960)], # 4:3 + [(768, 1024), (864, 1152), (960, 1280)], # 3:4 + [(1280, 768)], # 16:9 + [(768, 1280)], # 9:16 +] +STANDARD_AREA = [ + np.array([w * h for w, h in shapes]) + for shapes in 
STANDARD_SHAPE +] + + +def get_standard_shape(target_width, target_height): + """ + Map image size to standard size. + """ + target_ratio = target_width / target_height + closest_ratio_idx = np.argmin(np.abs(STANDARD_RATIO - target_ratio)) + closest_area_idx = np.argmin(np.abs(STANDARD_AREA[closest_ratio_idx] - target_width * target_height)) + width, height = STANDARD_SHAPE[closest_ratio_idx][closest_area_idx] + return width, height + + +def _to_tuple(val): + if isinstance(val, (list, tuple)): + if len(val) == 1: + val = [val[0], val[0]] + elif len(val) == 2: + val = tuple(val) + else: + raise ValueError(f"Invalid value: {val}") + elif isinstance(val, (int, float)): + val = (val, val) + else: + raise ValueError(f"Invalid value: {val}") + return val + + +def get_pipeline(args, vae, text_encoder, tokenizer, model, device, rank, + embedder_t5, infer_mode, sampler=None): + """ + Get scheduler and pipeline for sampling. The sampler and pipeline are both + based on diffusers and make some modifications. + + Returns + ------- + pipeline: StableDiffusionPipeline + sampler_name: str + """ + sampler = sampler or args.sampler + + # Load sampler from factory + kwargs = SAMPLER_FACTORY[sampler]['kwargs'] + scheduler = SAMPLER_FACTORY[sampler]['scheduler'] + + # Update sampler according to the arguments + kwargs['beta_schedule'] = args.noise_schedule + kwargs['beta_start'] = args.beta_start + kwargs['beta_end'] = args.beta_end + kwargs['prediction_type'] = args.predict_type + + # Build scheduler according to the sampler. + scheduler_class = getattr(schedulers, scheduler) + scheduler = scheduler_class(**kwargs) + + # Set timesteps for inference steps. + scheduler.set_timesteps(args.infer_steps, device) + + # Only enable progress bar for rank 0 + progress_bar_config = {} if rank == 0 else {'disable': True} + + pipeline = StableDiffusionPipeline(vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=model, + scheduler=scheduler, + feature_extractor=None, + safety_checker=None, + requires_safety_checker=False, + progress_bar_config=progress_bar_config, + embedder_t5=embedder_t5, + infer_mode=infer_mode, + ) + + pipeline = pipeline.to(device) + + return pipeline, sampler + + +class End2End(object): + def __init__(self, args, models_root_path): + self.args = args + + # Check arguments + t2i_root_path = Path(models_root_path) / "t2i" + self.root = t2i_root_path + logger.info(f"Got text-to-image model root path: {t2i_root_path}") + + # Set device and disable gradient + self.device = "cuda" if torch.cuda.is_available() else "cpu" + torch.set_grad_enabled(False) + # Disable BertModel logging checkpoint info + tf_logger.setLevel('ERROR') + + # ======================================================================== + model_dir = self.root / "model" + + # ======================================================================== + logger.info(f"Loading CLIP Text Encoder...") + text_encoder_path = self.root / "clip_text_encoder" + self.clip_text_encoder = BertModel.from_pretrained(str(text_encoder_path), False, revision=None).to(self.device) + logger.info(f"Loading CLIP Text Encoder finished") + + # ======================================================================== + logger.info(f"Loading CLIP Tokenizer...") + tokenizer_path = self.root / "tokenizer" + self.tokenizer = BertTokenizer.from_pretrained(str(tokenizer_path)) + logger.info(f"Loading CLIP Tokenizer finished") + + # ======================================================================== + logger.info(f"Loading T5 Text Encoder and T5 
Tokenizer...") + t5_text_encoder_path = self.root / 'mt5' + embedder_t5 = MT5Embedder(t5_text_encoder_path, torch_dtype=torch.float16, max_length=256) + self.embedder_t5 = embedder_t5 + logger.info(f"Loading t5_text_encoder and t5_tokenizer finished") + + # ======================================================================== + logger.info(f"Loading VAE...") + vae_path = self.root / "sdxl-vae-fp16-fix" + self.vae = AutoencoderKL.from_pretrained(str(vae_path)).to(self.device) + logger.info(f"Loading VAE finished") + + # ======================================================================== + # Create model structure and load the checkpoint + logger.info(f"Building HunYuan-DiT model...") + model_config = HUNYUAN_DIT_CONFIG[self.args.model] + self.patch_size = model_config['patch_size'] + self.head_size = model_config['hidden_size'] // model_config['num_heads'] + self.resolutions, self.freqs_cis_img = self.standard_shapes() # Used for TensorRT models + self.image_size = _to_tuple(self.args.image_size) + latent_size = (self.image_size[0] // 8, self.image_size[1] // 8) + + self.infer_mode = self.args.infer_mode + if self.infer_mode in ['fa', 'torch']: + model_path = model_dir / f"pytorch_model_{self.args.load_key}.pt" + if not model_path.exists(): + raise ValueError(f"model_path not exists: {model_path}") + # Build model structure + self.model = HunYuanDiT(self.args, + input_size=latent_size, + **model_config, + log_fn=logger.info, + ).half().to(self.device) # Force to use fp16 + # Load model checkpoint + logger.info(f"Loading model checkpoint {model_path}...") + state_dict = torch.load(model_path, map_location=lambda storage, loc: storage) + self.model.load_state_dict(state_dict) + self.model.eval() + elif self.infer_mode == 'trt': + raise NotImplementedError("TensorRT model is not supported yet.") + else: + raise ValueError(f"Unknown infer_mode: {self.infer_mode}") + + # ======================================================================== + # Build inference pipeline. We use a customized StableDiffusionPipeline. + logger.info(f"Loading inference pipeline...") + self.pipeline, self.sampler = self.load_sampler() + logger.info(f'Loading pipeline finished') + + # ======================================================================== + self.default_negative_prompt = NEGATIVE_PROMPT + logger.info("==================================================") + logger.info(f" Model is ready. 
") + logger.info("==================================================") + + def load_sampler(self, sampler=None): + pipeline, sampler = get_pipeline(self.args, + self.vae, + self.clip_text_encoder, + self.tokenizer, + self.model, + device=self.device, + rank=0, + embedder_t5=self.embedder_t5, + infer_mode=self.infer_mode, + sampler=sampler, + ) + return pipeline, sampler + + def calc_rope(self, height, width): + th = height // 8 // self.patch_size + tw = width // 8 // self.patch_size + base_size = 512 // 8 // self.patch_size + start, stop = get_fill_resize_and_crop((th, tw), base_size) + sub_args = [start, stop, (th, tw)] + rope = get_2d_rotary_pos_embed(self.head_size, *sub_args) + return rope + + def standard_shapes(self): + resolutions = ResolutionGroup() + freqs_cis_img = {} + for reso in resolutions.data: + freqs_cis_img[str(reso)] = self.calc_rope(reso.height, reso.width) + return resolutions, freqs_cis_img + + def predict(self, + user_prompt, + height=1024, + width=1024, + seed=None, + enhanced_prompt=None, + negative_prompt=None, + infer_steps=100, + guidance_scale=6, + batch_size=1, + src_size_cond=(1024, 1024), + sampler=None, + ): + # ======================================================================== + # Arguments: seed + # ======================================================================== + if seed is None: + seed = random.randint(0, 1_000_000) + if not isinstance(seed, int): + raise TypeError(f"`seed` must be an integer, but got {type(seed)}") + generator = set_seeds(seed) + + # ======================================================================== + # Arguments: target_width, target_height + # ======================================================================== + if width <= 0 or height <= 0: + raise ValueError(f"`height` and `width` must be positive integers, got height={height}, width={width}") + logger.info(f"Input (height, width) = ({height}, {width})") + if self.infer_mode in ['fa', 'torch']: + # We must force height and width to align to 16 and to be an integer. + target_height = int((height // 16) * 16) + target_width = int((width // 16) * 16) + logger.info(f"Align to 16: (height, width) = ({target_height}, {target_width})") + elif self.infer_mode == 'trt': + target_width, target_height = get_standard_shape(width, height) + logger.info(f"Align to standard shape: (height, width) = ({target_height}, {target_width})") + else: + raise ValueError(f"Unknown infer_mode: {self.infer_mode}") + + # ======================================================================== + # Arguments: prompt, new_prompt, negative_prompt + # ======================================================================== + if not isinstance(user_prompt, str): + raise TypeError(f"`user_prompt` must be a string, but got {type(user_prompt)}") + user_prompt = user_prompt.strip() + prompt = user_prompt + + if enhanced_prompt is not None: + if not isinstance(enhanced_prompt, str): + raise TypeError(f"`enhanced_prompt` must be a string, but got {type(enhanced_prompt)}") + enhanced_prompt = enhanced_prompt.strip() + prompt = enhanced_prompt + + # negative prompt + if negative_prompt is None or negative_prompt == '': + negative_prompt = self.default_negative_prompt + if not isinstance(negative_prompt, str): + raise TypeError(f"`negative_prompt` must be a string, but got {type(negative_prompt)}") + + # ======================================================================== + # Arguments: style. (A fixed argument. Don't Change it.) 
+ # ======================================================================== + style = torch.as_tensor([0, 0] * batch_size, device=self.device) + + # ======================================================================== + # Inner arguments: image_meta_size (Please refer to SDXL.) + # ======================================================================== + if isinstance(src_size_cond, int): + src_size_cond = [src_size_cond, src_size_cond] + if not isinstance(src_size_cond, (list, tuple)): + raise TypeError(f"`src_size_cond` must be a list or tuple, but got {type(src_size_cond)}") + if len(src_size_cond) != 2: + raise ValueError(f"`src_size_cond` must be a tuple of 2 integers, but got {len(src_size_cond)}") + size_cond = list(src_size_cond) + [target_width, target_height, 0, 0] + image_meta_size = torch.as_tensor([size_cond] * 2 * batch_size, device=self.device) + + # ======================================================================== + start_time = time.time() + logger.debug(f""" + prompt: {user_prompt} + enhanced prompt: {enhanced_prompt} + seed: {seed} + (height, width): {(target_height, target_width)} + negative_prompt: {negative_prompt} + batch_size: {batch_size} + guidance_scale: {guidance_scale} + infer_steps: {infer_steps} + image_meta_size: {size_cond} + """) + reso = f'{target_height}x{target_width}' + if reso in self.freqs_cis_img: + freqs_cis_img = self.freqs_cis_img[reso] + else: + freqs_cis_img = self.calc_rope(target_height, target_width) + + if sampler is not None and sampler != self.sampler: + self.pipeline, self.sampler = self.load_sampler(sampler) + + samples = self.pipeline( + height=target_height, + width=target_width, + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=batch_size, + guidance_scale=guidance_scale, + num_inference_steps=infer_steps, + image_meta_size=image_meta_size, + style=style, + return_dict=False, + generator=generator, + freqs_cis_img=freqs_cis_img, + use_fp16=self.args.use_fp16, + learn_sigma=self.args.learn_sigma, + )[0] + gen_time = time.time() - start_time + logger.debug(f"Success, time: {gen_time}") + + return { + 'images': samples, + 'seed': seed, + } diff --git a/hydit/modules/__init__.py b/hydit/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hydit/modules/attn_layers.py b/hydit/modules/attn_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..4308af93293c87d101519ef6b1e07663d91cd2ef --- /dev/null +++ b/hydit/modules/attn_layers.py @@ -0,0 +1,377 @@ +import torch +import torch.nn as nn +from typing import Tuple, Union, Optional + +try: + import flash_attn + if hasattr(flash_attn, '__version__') and int(flash_attn.__version__[0]) == 2: + from flash_attn.flash_attn_interface import flash_attn_kvpacked_func + from flash_attn.modules.mha import FlashSelfAttention, FlashCrossAttention + else: + from flash_attn.flash_attn_interface import flash_attn_unpadded_kvpacked_func + from flash_attn.modules.mha import FlashSelfAttention, FlashCrossAttention +except Exception as e: + print(f'flash_attn import failed: {e}') + + +def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False): + """ + Reshape frequency tensor for broadcasting it with another tensor. + + This function reshapes the frequency tensor to have the same shape as the target tensor 'x' + for the purpose of broadcasting the frequency tensor during element-wise operations. 
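+    When `freqs_cis` is a (cos, sin) tuple, both tensors are reshaped and a tuple of two tensors is returned.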
+ + Args: + freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped. + x (torch.Tensor): Target tensor for broadcasting compatibility. + head_first (bool): head dimension first (except batch dim) or not. + + Returns: + torch.Tensor: Reshaped frequency tensor. + + Raises: + AssertionError: If the frequency tensor doesn't match the expected shape. + AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions. + """ + ndim = x.ndim + assert 0 <= 1 < ndim + + if isinstance(freqs_cis, tuple): + # freqs_cis: (cos, sin) in real space + if head_first: + assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}' + shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + else: + assert freqs_cis[0].shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}' + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape) + else: + # freqs_cis: values in complex space + if head_first: + assert freqs_cis.shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}' + shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + else: + assert freqs_cis.shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}' + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def rotate_half(x): + x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2] + return torch.stack([-x_imag, x_real], dim=-1).flatten(3) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: Optional[torch.Tensor], + freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], + head_first: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary embeddings to input tensors using the given frequency tensor. + + This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided + frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are + returned as real tensors. + + Args: + xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D] + xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D] + freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Precomputed frequency tensor for complex exponentials. + head_first (bool): head dimension first (except batch dim) or not. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. 
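+    Note: if `xk` is None, only the query is rotated and the second element of the returned tuple is None.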
+ + """ + xk_out = None + if isinstance(freqs_cis, tuple): + cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D] + cos, sin = cos.to(xq.device), sin.to(xq.device) + xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq) + if xk is not None: + xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk) + else: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) # [B, S, H, D//2] + freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device) # [S, D//2] --> [1, S, 1, D//2] + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) + if xk is not None: + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) # [B, S, H, D//2] + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) + + return xq_out, xk_out + + +class FlashSelfMHAModified(nn.Module): + """ + Use QK Normalization. + """ + def __init__(self, + dim, + num_heads, + qkv_bias=True, + qk_norm=False, + attn_drop=0.0, + proj_drop=0.0, + device=None, + dtype=None, + norm_layer=nn.LayerNorm, + ): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.dim = dim + self.num_heads = num_heads + assert self.dim % num_heads == 0, "self.kdim must be divisible by num_heads" + self.head_dim = self.dim // num_heads + assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" + + self.Wqkv = nn.Linear(dim, 3 * dim, bias=qkv_bias, **factory_kwargs) + # TODO: eps should be 1 / 65530 if using fp16 + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.inner_attn = FlashSelfAttention(attention_dropout=attn_drop) + self.out_proj = nn.Linear(dim, dim, bias=qkv_bias, **factory_kwargs) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, freqs_cis_img=None): + """ + Parameters + ---------- + x: torch.Tensor + (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) + freqs_cis_img: torch.Tensor + (batch, hidden_dim // 2), RoPE for image + """ + b, s, d = x.shape + + qkv = self.Wqkv(x) + qkv = qkv.view(b, s, 3, self.num_heads, self.head_dim) # [b, s, 3, h, d] + q, k, v = qkv.unbind(dim=2) # [b, s, h, d] + q = self.q_norm(q).half() # [b, s, h, d] + k = self.k_norm(k).half() + + # Apply RoPE if needed + if freqs_cis_img is not None: + qq, kk = apply_rotary_emb(q, k, freqs_cis_img) + assert qq.shape == q.shape and kk.shape == k.shape, f'qq: {qq.shape}, q: {q.shape}, kk: {kk.shape}, k: {k.shape}' + q, k = qq, kk + + qkv = torch.stack([q, k, v], dim=2) # [b, s, 3, h, d] + context = self.inner_attn(qkv) + out = self.out_proj(context.view(b, s, d)) + out = self.proj_drop(out) + + out_tuple = (out,) + + return out_tuple + + +class FlashCrossMHAModified(nn.Module): + """ + Use QK Normalization. 
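+    Cross-attention between image tokens (query) and text states (key/value) implemented with
+    FlashCrossAttention; q and k are cast to fp16 before the attention kernel.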
+ """ + def __init__(self, + qdim, + kdim, + num_heads, + qkv_bias=True, + qk_norm=False, + attn_drop=0.0, + proj_drop=0.0, + device=None, + dtype=None, + norm_layer=nn.LayerNorm, + ): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.qdim = qdim + self.kdim = kdim + self.num_heads = num_heads + assert self.qdim % num_heads == 0, "self.qdim must be divisible by num_heads" + self.head_dim = self.qdim // num_heads + assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" + + self.scale = self.head_dim ** -0.5 + + self.q_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs) + self.kv_proj = nn.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs) + + # TODO: eps should be 1 / 65530 if using fp16 + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + self.inner_attn = FlashCrossAttention(attention_dropout=attn_drop) + self.out_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, y, freqs_cis_img=None): + """ + Parameters + ---------- + x: torch.Tensor + (batch, seqlen1, hidden_dim) (where hidden_dim = num_heads * head_dim) + y: torch.Tensor + (batch, seqlen2, hidden_dim2) + freqs_cis_img: torch.Tensor + (batch, hidden_dim // num_heads), RoPE for image + """ + b, s1, _ = x.shape # [b, s1, D] + _, s2, _ = y.shape # [b, s2, 1024] + + q = self.q_proj(x).view(b, s1, self.num_heads, self.head_dim) # [b, s1, h, d] + kv = self.kv_proj(y).view(b, s2, 2, self.num_heads, self.head_dim) # [b, s2, 2, h, d] + k, v = kv.unbind(dim=2) # [b, s2, h, d] + q = self.q_norm(q).half() # [b, s1, h, d] + k = self.k_norm(k).half() # [b, s2, h, d] + + # Apply RoPE if needed + if freqs_cis_img is not None: + qq, _ = apply_rotary_emb(q, None, freqs_cis_img) + assert qq.shape == q.shape, f'qq: {qq.shape}, q: {q.shape}' + q = qq # [b, s1, h, d] + kv = torch.stack([k, v], dim=2) # [b, s1, 2, h, d] + context = self.inner_attn(q, kv) # [b, s1, h, d] + context = context.view(b, s1, -1) # [b, s1, D] + + out = self.out_proj(context) + out = self.proj_drop(out) + + out_tuple = (out,) + + return out_tuple + + +class CrossAttention(nn.Module): + """ + Use QK Normalization. 
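+    Plain PyTorch (matmul + softmax) cross-attention, used as a fallback when FlashAttention is not available.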
+ """ + def __init__(self, + qdim, + kdim, + num_heads, + qkv_bias=True, + qk_norm=False, + attn_drop=0.0, + proj_drop=0.0, + device=None, + dtype=None, + norm_layer=nn.LayerNorm, + ): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.qdim = qdim + self.kdim = kdim + self.num_heads = num_heads + assert self.qdim % num_heads == 0, "self.qdim must be divisible by num_heads" + self.head_dim = self.qdim // num_heads + assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" + self.scale = self.head_dim ** -0.5 + + self.q_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs) + self.kv_proj = nn.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs) + + # TODO: eps should be 1 / 65530 if using fp16 + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.out_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, y, freqs_cis_img=None): + """ + Parameters + ---------- + x: torch.Tensor + (batch, seqlen1, hidden_dim) (where hidden_dim = num heads * head dim) + y: torch.Tensor + (batch, seqlen2, hidden_dim2) + freqs_cis_img: torch.Tensor + (batch, hidden_dim // 2), RoPE for image + """ + b, s1, c = x.shape # [b, s1, D] + _, s2, c = y.shape # [b, s2, 1024] + + q = self.q_proj(x).view(b, s1, self.num_heads, self.head_dim) # [b, s1, h, d] + kv = self.kv_proj(y).view(b, s2, 2, self.num_heads, self.head_dim) # [b, s2, 2, h, d] + k, v = kv.unbind(dim=2) # [b, s, h, d] + q = self.q_norm(q) + k = self.k_norm(k) + + # Apply RoPE if needed + if freqs_cis_img is not None: + qq, _ = apply_rotary_emb(q, None, freqs_cis_img) + assert qq.shape == q.shape, f'qq: {qq.shape}, q: {q.shape}' + q = qq + + q = q * self.scale + q = q.transpose(-2, -3).contiguous() # q -> B, L1, H, C - B, H, L1, C + k = k.permute(0, 2, 3, 1).contiguous() # k -> B, L2, H, C - B, H, C, L2 + attn = q @ k # attn -> B, H, L1, L2 + attn = attn.softmax(dim=-1) # attn -> B, H, L1, L2 + attn = self.attn_drop(attn) + x = attn @ v.transpose(-2, -3) # v -> B, L2, H, C - B, H, L2, C x-> B, H, L1, C + context = x.transpose(1, 2) # context -> B, H, L1, C - B, L1, H, C + + context = context.contiguous().view(b, s1, -1) + + out = self.out_proj(context) # context.reshape - B, L1, -1 + out = self.proj_drop(out) + + out_tuple = (out,) + + return out_tuple + + +class Attention(nn.Module): + """ + We rename some layer names to align with flash attention + """ + def __init__(self, dim, num_heads, qkv_bias=True, qk_norm=False, attn_drop=0., proj_drop=0., + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + assert self.dim % num_heads == 0, 'dim should be divisible by num_heads' + self.head_dim = self.dim // num_heads + # This assertion is aligned with flash attention + assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" + self.scale = self.head_dim ** -0.5 + + # qkv --> Wqkv + self.Wqkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + # TODO: eps should be 1 / 65530 if using fp16 + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.attn_drop 
= nn.Dropout(attn_drop) + self.out_proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, freqs_cis_img=None): + B, N, C = x.shape + qkv = self.Wqkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) # [3, b, h, s, d] + q, k, v = qkv.unbind(0) # [b, h, s, d] + q = self.q_norm(q) # [b, h, s, d] + k = self.k_norm(k) # [b, h, s, d] + + # Apply RoPE if needed + if freqs_cis_img is not None: + qq, kk = apply_rotary_emb(q, k, freqs_cis_img, head_first=True) + assert qq.shape == q.shape and kk.shape == k.shape, \ + f'qq: {qq.shape}, q: {q.shape}, kk: {kk.shape}, k: {k.shape}' + q, k = qq, kk + + q = q * self.scale + attn = q @ k.transpose(-2, -1) # [b, h, s, d] @ [b, h, d, s] + attn = attn.softmax(dim=-1) # [b, h, s, s] + attn = self.attn_drop(attn) + x = attn @ v # [b, h, s, d] + + x = x.transpose(1, 2).reshape(B, N, C) # [b, s, h, d] + x = self.out_proj(x) + x = self.proj_drop(x) + + out_tuple = (x,) + + return out_tuple diff --git a/hydit/modules/embedders.py b/hydit/modules/embedders.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe08cba22eef41ca9fd9f70fe6f062a4dd606c8 --- /dev/null +++ b/hydit/modules/embedders.py @@ -0,0 +1,111 @@ +import math +import torch +import torch.nn as nn +from einops import repeat + +from timm.models.layers import to_2tuple + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + + Image to Patch Embedding using Conv2d + + A convolution based approach to patchifying a 2D image w/ embedding projection. + + Based on the impl in https://github.com/google-research/vision_transformer + + Hacked together by / Copyright 2020 Ross Wightman + + Remove the _assert function in forward function to be compatible with multi-resolution images. + """ + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + bias=True, + ): + super().__init__() + if isinstance(img_size, int): + img_size = to_2tuple(img_size) + elif isinstance(img_size, (tuple, list)) and len(img_size) == 2: + img_size = tuple(img_size) + else: + raise ValueError(f"img_size must be int or tuple/list of length 2. Got {img_size}") + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def update_image_size(self, img_size): + self.img_size = img_size + self.grid_size = (img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + + def forward(self, x): + # B, C, H, W = x.shape + # _assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") + # _assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x + + +def timestep_embedding(t, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. 
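+    :param repeat_only: if True, skip the sinusoidal projection and simply repeat `t` along the embedding dimension.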
+ :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. + """ + # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py + if not repeat_only: + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half + ).to(device=t.device) # size: [dim/2], 一个指数衰减的曲线 + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1 + ) + else: + embedding = repeat(t, "b -> b d", d=dim) + return embedding + + +class TimestepEmbedder(nn.Module): + """ + Embeds scalar timesteps into vector representations. + """ + def __init__(self, hidden_size, frequency_embedding_size=256, out_size=None): + super().__init__() + if out_size is None: + out_size = hidden_size + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_size, bias=True), + nn.SiLU(), + nn.Linear(hidden_size, out_size, bias=True), + ) + self.frequency_embedding_size = frequency_embedding_size + + def forward(self, t): + t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype) + t_emb = self.mlp(t_freq) + return t_emb diff --git a/hydit/modules/models.py b/hydit/modules/models.py new file mode 100644 index 0000000000000000000000000000000000000000..d125aa9f7b1605e1fd6658620bcfdd7eb1ae06dd --- /dev/null +++ b/hydit/modules/models.py @@ -0,0 +1,409 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models import ModelMixin +from timm.models.vision_transformer import Mlp + +from .attn_layers import Attention, FlashCrossMHAModified, FlashSelfMHAModified, CrossAttention +from .embedders import TimestepEmbedder, PatchEmbed, timestep_embedding +from .norm_layers import RMSNorm +from .poolers import AttentionPool + + +def modulate(x, shift, scale): + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +class FP32_Layernorm(nn.LayerNorm): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + origin_dtype = inputs.dtype + return F.layer_norm(inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), + self.eps).to(origin_dtype) + + +class FP32_SiLU(nn.SiLU): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.silu(inputs.float(), inplace=False).to(inputs.dtype) + + +class HunYuanDiTBlock(nn.Module): + """ + A HunYuanDiT block with `add` conditioning. 
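+    Each block applies self-attention (with image RoPE), cross-attention over the concatenated CLIP/T5
+    text states, and an MLP; blocks in the second half of the network additionally receive a U-Net-style
+    long skip connection from the first half.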
+ """ + def __init__(self, + hidden_size, + c_emb_size, + num_heads, + mlp_ratio=4.0, + text_states_dim=1024, + use_flash_attn=False, + qk_norm=False, + norm_type="layer", + skip=False, + ): + super().__init__() + self.use_flash_attn = use_flash_attn + use_ele_affine = True + + if norm_type == "layer": + norm_layer = FP32_Layernorm + elif norm_type == "rms": + norm_layer = RMSNorm + else: + raise ValueError(f"Unknown norm_type: {norm_type}") + + # ========================= Self-Attention ========================= + self.norm1 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6) + if use_flash_attn: + self.attn1 = FlashSelfMHAModified(hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=qk_norm) + else: + self.attn1 = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=qk_norm) + + # ========================= FFN ========================= + self.norm2 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6) + mlp_hidden_dim = int(hidden_size * mlp_ratio) + approx_gelu = lambda: nn.GELU(approximate="tanh") + self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0) + + # ========================= Add ========================= + # Simply use add like SDXL. + self.default_modulation = nn.Sequential( + FP32_SiLU(), + nn.Linear(c_emb_size, hidden_size, bias=True) + ) + + # ========================= Cross-Attention ========================= + if use_flash_attn: + self.attn2 = FlashCrossMHAModified(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=True, + qk_norm=qk_norm) + else: + self.attn2 = CrossAttention(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=True, + qk_norm=qk_norm) + self.norm3 = norm_layer(hidden_size, elementwise_affine=True, eps=1e-6) + + # ========================= Skip Connection ========================= + if skip: + self.skip_norm = norm_layer(2 * hidden_size, elementwise_affine=True, eps=1e-6) + self.skip_linear = nn.Linear(2 * hidden_size, hidden_size) + else: + self.skip_linear = None + + def forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None): + # Long Skip Connection + if self.skip_linear is not None: + cat = torch.cat([x, skip], dim=-1) + cat = self.skip_norm(cat) + x = self.skip_linear(cat) + + # Self-Attention + shift_msa = self.default_modulation(c).unsqueeze(dim=1) + attn_inputs = ( + self.norm1(x) + shift_msa, freq_cis_img, + ) + x = x + self.attn1(*attn_inputs)[0] + + # Cross-Attention + cross_inputs = ( + self.norm3(x), text_states, freq_cis_img + ) + x = x + self.attn2(*cross_inputs)[0] + + # FFN Layer + mlp_inputs = self.norm2(x) + x = x + self.mlp(mlp_inputs) + + return x + + +class FinalLayer(nn.Module): + """ + The final layer of HunYuanDiT. + """ + def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential( + FP32_SiLU(), + nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True) + ) + + def forward(self, x, c): + shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) + x = modulate(self.norm_final(x), shift, scale) + x = self.linear(x) + return x + + +class HunYuanDiT(ModelMixin, ConfigMixin): + """ + HunYuanDiT: Diffusion model with a Transformer backbone. 
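+    The model is conditioned on the diffusion timestep, CLIP and T5 text embeddings, an attention-pooled
+    T5 vector, image meta-size embeddings and a style embedding.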
+ + Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers. + + Parameters + ---------- + args: argparse.Namespace + The arguments parsed by argparse. + input_size: tuple + The size of the input image. + patch_size: int + The size of the patch. + in_channels: int + The number of input channels. + hidden_size: int + The hidden size of the transformer backbone. + depth: int + The number of transformer blocks. + num_heads: int + The number of attention heads. + mlp_ratio: float + The ratio of the hidden size of the MLP in the transformer block. + log_fn: callable + The logging function. + """ + @register_to_config + def __init__( + self, args, + input_size=(32, 32), + patch_size=2, + in_channels=4, + hidden_size=1152, + depth=28, + num_heads=16, + mlp_ratio=4.0, + log_fn=print, + ): + super().__init__() + self.args = args + self.log_fn = log_fn + self.depth = depth + self.learn_sigma = args.learn_sigma + self.in_channels = in_channels + self.out_channels = in_channels * 2 if args.learn_sigma else in_channels + self.patch_size = patch_size + self.num_heads = num_heads + self.hidden_size = hidden_size + self.text_states_dim = args.text_states_dim + self.text_states_dim_t5 = args.text_states_dim_t5 + self.text_len = args.text_len + self.text_len_t5 = args.text_len_t5 + self.norm = args.norm + + use_flash_attn = args.infer_mode == 'fa' + if use_flash_attn: + log_fn(f" Enable Flash Attention.") + qk_norm = True # See http://arxiv.org/abs/2302.05442 for details. + + self.mlp_t5 = nn.Sequential( + nn.Linear(self.text_states_dim_t5, self.text_states_dim_t5 * 4, bias=True), + FP32_SiLU(), + nn.Linear(self.text_states_dim_t5 * 4, self.text_states_dim, bias=True), + ) + # learnable replace + self.text_embedding_padding = nn.Parameter( + torch.randn(self.text_len + self.text_len_t5, self.text_states_dim, dtype=torch.float32)) + + # Attention pooling + self.pooler = AttentionPool(self.text_len_t5, self.text_states_dim_t5, num_heads=8, output_dim=1024) + + # Here we use a default learned embedder layer for future extension. + self.style_embedder = nn.Embedding(1, hidden_size) + + # Image size and crop size conditions + self.extra_in_dim = 256 * 6 + hidden_size + + # Text embedding for `add` + self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size) + self.t_embedder = TimestepEmbedder(hidden_size) + self.extra_in_dim += 1024 + self.extra_embedder = nn.Sequential( + nn.Linear(self.extra_in_dim, hidden_size * 4), + FP32_SiLU(), + nn.Linear(hidden_size * 4, hidden_size, bias=True), + ) + + # Image embedding + num_patches = self.x_embedder.num_patches + log_fn(f" Number of tokens: {num_patches}") + + # HUnYuanDiT Blocks + self.blocks = nn.ModuleList([ + HunYuanDiTBlock(hidden_size=hidden_size, + c_emb_size=hidden_size, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + text_states_dim=self.text_states_dim, + use_flash_attn=use_flash_attn, + qk_norm=qk_norm, + norm_type=self.norm, + skip=layer > depth // 2, + ) + for layer in range(depth) + ]) + + self.final_layer = FinalLayer(hidden_size, hidden_size, patch_size, self.out_channels) + self.unpatchify_channels = self.out_channels + + self.initialize_weights() + + def forward(self, + x, + t, + encoder_hidden_states=None, + text_embedding_mask=None, + encoder_hidden_states_t5=None, + text_embedding_mask_t5=None, + image_meta_size=None, + style=None, + cos_cis_img=None, + sin_cis_img=None, + return_dict=True, + ): + """ + Forward pass of the encoder. 
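+        Predicts the noise (and, when `learn_sigma` is enabled, the variance) for a batch of noisy latents
+        given the text, size and style conditions.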
+ + Parameters + ---------- + x: torch.Tensor + (B, D, H, W) + t: torch.Tensor + (B) + encoder_hidden_states: torch.Tensor + CLIP text embedding, (B, L_clip, D) + text_embedding_mask: torch.Tensor + CLIP text embedding mask, (B, L_clip) + encoder_hidden_states_t5: torch.Tensor + T5 text embedding, (B, L_t5, D) + text_embedding_mask_t5: torch.Tensor + T5 text embedding mask, (B, L_t5) + image_meta_size: torch.Tensor + (B, 6) + style: torch.Tensor + (B) + cos_cis_img: torch.Tensor + sin_cis_img: torch.Tensor + return_dict: bool + Whether to return a dictionary. + """ + + text_states = encoder_hidden_states # 2,77,1024 + text_states_t5 = encoder_hidden_states_t5 # 2,256,2048 + text_states_mask = text_embedding_mask.bool() # 2,77 + text_states_t5_mask = text_embedding_mask_t5.bool() # 2,256 + b_t5, l_t5, c_t5 = text_states_t5.shape + text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)) + text_states = torch.cat([text_states, text_states_t5.view(b_t5, l_t5, -1)], dim=1) # 2,205,1024 + clip_t5_mask = torch.cat([text_states_mask, text_states_t5_mask], dim=-1) + + clip_t5_mask = clip_t5_mask + text_states = torch.where(clip_t5_mask.unsqueeze(2), text_states, self.text_embedding_padding.to(text_states)) + + _, _, oh, ow = x.shape + th, tw = oh // self.patch_size, ow // self.patch_size + + # ========================= Build time and image embedding ========================= + t = self.t_embedder(t) + x = self.x_embedder(x) + + # Get image RoPE embedding according to `reso`lution. + freqs_cis_img = (cos_cis_img, sin_cis_img) + + # ========================= Concatenate all extra vectors ========================= + # Build text tokens with pooling + extra_vec = self.pooler(encoder_hidden_states_t5) + + # Build image meta size tokens + image_meta_size = timestep_embedding(image_meta_size.view(-1), 256) # [B * 6, 256] + if self.args.use_fp16: + image_meta_size = image_meta_size.half() + image_meta_size = image_meta_size.view(-1, 6 * 256) + extra_vec = torch.cat([extra_vec, image_meta_size], dim=1) # [B, D + 6 * 256] + + # Build style tokens + style_embedding = self.style_embedder(style) + extra_vec = torch.cat([extra_vec, style_embedding], dim=1) + + # Concatenate all extra vectors + c = t + self.extra_embedder(extra_vec) # [B, D] + + # ========================= Forward pass through HunYuanDiT blocks ========================= + skips = [] + for layer, block in enumerate(self.blocks): + if layer > self.depth // 2: + skip = skips.pop() + x = block(x, c, text_states, freqs_cis_img, skip) # (N, L, D) + else: + x = block(x, c, text_states, freqs_cis_img) # (N, L, D) + + if layer < (self.depth // 2 - 1): + skips.append(x) + + # ========================= Final layer ========================= + x = self.final_layer(x, c) # (N, L, patch_size ** 2 * out_channels) + x = self.unpatchify(x, th, tw) # (N, out_channels, H, W) + + if return_dict: + return {'x': x} + return x + + def initialize_weights(self): + # Initialize transformer layers: + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + self.apply(_basic_init) + + # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): + w = self.x_embedder.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + nn.init.constant_(self.x_embedder.proj.bias, 0) + + # Initialize label embedding table: + nn.init.normal_(self.extra_embedder[0].weight, std=0.02) + nn.init.normal_(self.extra_embedder[2].weight, std=0.02) + + # 
Initialize timestep embedding MLP: + nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) + nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) + + # Zero-out adaLN modulation layers in HunYuanDiT blocks: + for block in self.blocks: + nn.init.constant_(block.default_modulation[-1].weight, 0) + nn.init.constant_(block.default_modulation[-1].bias, 0) + + # Zero-out output layers: + nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0) + nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0) + nn.init.constant_(self.final_layer.linear.weight, 0) + nn.init.constant_(self.final_layer.linear.bias, 0) + + def unpatchify(self, x, h, w): + """ + x: (N, T, patch_size**2 * C) + imgs: (N, H, W, C) + """ + c = self.unpatchify_channels + p = self.x_embedder.patch_size[0] + # h = w = int(x.shape[1] ** 0.5) + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) + return imgs + + +################################################################################# +# HunYuanDiT Configs # +################################################################################# + +HUNYUAN_DIT_CONFIG = { + 'DiT-g/2': {'depth': 40, 'hidden_size': 1408, 'patch_size': 2, 'num_heads': 16, 'mlp_ratio': 4.3637}, + 'DiT-XL/2': {'depth': 28, 'hidden_size': 1152, 'patch_size': 2, 'num_heads': 16}, + 'DiT-L/2': {'depth': 24, 'hidden_size': 1024, 'patch_size': 2, 'num_heads': 16}, + 'DiT-B/2': {'depth': 12, 'hidden_size': 768, 'patch_size': 2, 'num_heads': 12}, +} diff --git a/hydit/modules/norm_layers.py b/hydit/modules/norm_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..5204ad9e5aabd5dad05a0f84408ff85f96cfa8db --- /dev/null +++ b/hydit/modules/norm_layers.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn + + +class RMSNorm(nn.Module): + def __init__(self, dim: int, elementwise_affine=True, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + super().__init__() + self.eps = eps + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + """ + Forward pass through the RMSNorm layer. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor after applying RMSNorm. + + """ + output = self._norm(x.float()).type_as(x) + if hasattr(self, "weight"): + output = output * self.weight + return output + + +class GroupNorm32(nn.GroupNorm): + def __init__(self, num_groups, num_channels, eps=1e-5, dtype=None): + super().__init__(num_groups=num_groups, num_channels=num_channels, eps=eps, dtype=dtype) + + def forward(self, x): + y = super().forward(x).to(x.dtype) + return y + +def normalization(channels, dtype=None): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. 
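+    The optional dtype argument is forwarded to GroupNorm32.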
+    """
+    return GroupNorm32(num_channels=channels, num_groups=32, dtype=dtype)
diff --git a/hydit/modules/poolers.py b/hydit/modules/poolers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4adcaca51fded2268a644ca4c70d5b33dfcd3b0
--- /dev/null
+++ b/hydit/modules/poolers.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class AttentionPool(nn.Module):
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+
+    def forward(self, x):
+        x = x.permute(1, 0, 2)  # NLC -> LNC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (L+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (L+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1], key=x, value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+        return x.squeeze(0)
diff --git a/hydit/modules/posemb_layers.py b/hydit/modules/posemb_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..62c83df77d85b05710e10dd1464638d611249306
--- /dev/null
+++ b/hydit/modules/posemb_layers.py
@@ -0,0 +1,225 @@
+import torch
+import numpy as np
+from typing import Union
+
+
+def _to_tuple(x):
+    if isinstance(x, int):
+        return x, x
+    else:
+        return x
+
+
+def get_fill_resize_and_crop(src, tgt):  # src: the source resolution; tgt: the base resolution
+    th, tw = _to_tuple(tgt)
+    h, w = _to_tuple(src)
+
+    tr = th / tw  # aspect ratio of the base resolution
+    r = h / w  # aspect ratio of the target resolution
+
+    # resize
+    if r > tr:
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))  # resize the target resolution to fit inside the base resolution
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+def get_meshgrid(start, *args):
+    if len(args) == 0:
+        # start is grid_size
+        num = _to_tuple(start)
+        start = (0, 0)
+        stop = num
+    elif len(args) == 1:
+        # start is start, args[0] is stop, step is 1
+        start = _to_tuple(start)
+        stop = _to_tuple(args[0])
+        num = (stop[0] - start[0], stop[1] - start[1])
+    elif len(args) == 2:
+        # start is start, args[0] is stop, args[1] is num
+        start = _to_tuple(start)  # top-left corner, e.g. (12, 0)
+        stop = _to_tuple(args[0])  # bottom-right corner, e.g. (20, 32)
+        num = _to_tuple(args[1])  # target size, e.g. (32, 124)
+    else:
+        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
+
+    grid_h = np.linspace(start[0], stop[0], num[0], endpoint=False, dtype=np.float32)  # e.g. 32 evenly spaced values between 12 and 20, 124 values between 0 and 32
+    grid_w = np.linspace(start[1], stop[1], num[1], endpoint=False, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)  # [2, H, W]
+    return grid
+
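+# -----------------------------------------------------------------------------
+# Editor's note: illustrative sketch, not part of the original patch. It shows
+# how the two helpers above compose: map an arbitrary token grid onto the fixed
+# base grid, then sample fractional coordinates inside that region. The 32x32
+# base grid and the 48x80 target grid are assumed example values (e.g.
+# rope_img='base512', patch_size=2, and a 768x1280 image).
+def _demo_rope_grid():
+    th, tw = 48, 80  # 768x1280 image -> 96x160 latent -> 48x80 token grid
+    start, stop = get_fill_resize_and_crop((th, tw), 32)
+    # start == (6, 0), stop == (25, 32): a 19x32 region with roughly the same
+    # aspect ratio as 48x80, centered vertically inside the 32x32 base grid.
+    grid = get_meshgrid(start, stop, (th, tw))  # shape (2, 48, 80), fractional coordinates
+    return grid
+# -----------------------------------------------------------------------------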
+################################################################################# +# Sine/Cosine Positional Embedding Functions # +################################################################################# +# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py + +def get_2d_sincos_pos_embed(embed_dim, start, *args, cls_token=False, extra_tokens=0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid = get_meshgrid(start, *args) # [2, H, w] + # grid_h = np.arange(grid_size, dtype=np.float32) + # grid_w = np.arange(grid_size, dtype=np.float32) + # grid = np.meshgrid(grid_w, grid_h) # here w goes first + # grid = np.stack(grid, axis=0) # [2, W, H] + + grid = grid.reshape([2, 1, *grid.shape[1:]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (W,H) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +################################################################################# +# Rotary Positional Embedding Functions # +################################################################################# +# https://github.com/facebookresearch/llama/blob/main/llama/model.py#L443 + +def get_2d_rotary_pos_embed(embed_dim, start, *args, use_real=True): + """ + This is a 2d version of precompute_freqs_cis, which is a RoPE for image tokens with 2d structure. + + Parameters + ---------- + embed_dim: int + embedding dimension size + start: int or tuple of int + If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, step is 1; + If len(args) == 2, start is start, args[0] is stop, args[1] is num. + use_real: bool + If True, return real part and imaginary part separately. Otherwise, return complex numbers. 
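+        In this patch, init_image_posemb calls this with use_real=True and caches
+        the returned (cos, sin) tables per resolution.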
+
+    Returns
+    -------
+    pos_embed: torch.Tensor
+        [HW, D/2]
+    """
+    grid = get_meshgrid(start, *args)   # [2, H, w]
+    grid = grid.reshape([2, 1, *grid.shape[1:]])   # returns a sampling grid whose resolution matches the target resolution
+    pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real)
+    return pos_embed
+
+
+def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
+    assert embed_dim % 4 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_rotary_pos_embed(embed_dim // 2, grid[0].reshape(-1), use_real=use_real)  # (H*W, D/4)
+    emb_w = get_1d_rotary_pos_embed(embed_dim // 2, grid[1].reshape(-1), use_real=use_real)  # (H*W, D/4)
+
+    if use_real:
+        cos = torch.cat([emb_h[0], emb_w[0]], dim=1)    # (H*W, D/2)
+        sin = torch.cat([emb_h[1], emb_w[1]], dim=1)    # (H*W, D/2)
+        return cos, sin
+    else:
+        emb = torch.cat([emb_h, emb_w], dim=1)  # (H*W, D/2)
+        return emb
+
+
+def get_1d_rotary_pos_embed(dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False):
+    """
+    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+
+    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
+    and the position indices 'pos'. The 'theta' parameter scales the frequencies.
+    The returned tensor contains complex values in complex64 data type.
+
+    Args:
+        dim (int): Dimension of the frequency tensor.
+        pos (np.ndarray, int): Position indices for the frequency tensor. [S] or scalar
+        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (bool, optional): If True, return real part and imaginary part separately.
+                                   Otherwise, return complex numbers.
+
+    Returns:
+        torch.Tensor: Precomputed frequency tensor with complex exponentials. [S, D/2]
+
+    """
+    if isinstance(pos, int):
+        pos = np.arange(pos)
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))  # [D/2]
+    t = torch.from_numpy(pos).to(freqs.device)  # type: ignore  # [S]
+    freqs = torch.outer(t, freqs).float()  # type: ignore  # [S, D/2]
+    if use_real:
+        freqs_cos = freqs.cos().repeat_interleave(2, dim=1)  # [S, D]
+        freqs_sin = freqs.sin().repeat_interleave(2, dim=1)  # [S, D]
+        return freqs_cos, freqs_sin
+    else:
+        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64  # [S, D/2]
+        return freqs_cis
+
+
+
+def calc_sizes(rope_img, patch_size, th, tw):
+    """ Compute the RoPE grid sizes. """
+    if rope_img == 'extend':
+        # extend mode: use the target token grid directly
+        sub_args = [(th, tw)]
+    elif rope_img.startswith('base'):
+        # base mode: use one base size and interpolate the other sizes from it
+ base_size = int(rope_img[4:]) // 8 // patch_size # 基于512作为base,其他根据512差值得到 + start, stop = get_fill_resize_and_crop((th, tw), base_size) # 需要在32x32里面 crop的左上角和右下角 + sub_args = [start, stop, (th, tw)] + else: + raise ValueError(f"Unknown rope_img: {rope_img}") + return sub_args + + +def init_image_posemb(rope_img, + resolutions, + patch_size, + hidden_size, + num_heads, + log_fn, + rope_real=True, + ): + freqs_cis_img = {} + for reso in resolutions: + th, tw = reso.height // 8 // patch_size, reso.width // 8 // patch_size + sub_args = calc_sizes(rope_img, patch_size, th, tw) # [左上角, 右下角, 目标高宽] 需要在32x32里面 crop的左上角和右下角 + freqs_cis_img[str(reso)] = get_2d_rotary_pos_embed(hidden_size // num_heads, *sub_args, use_real=rope_real) + log_fn(f" Using image RoPE ({rope_img}) ({'real' if rope_real else 'complex'}): {sub_args} | ({reso}) " + f"{freqs_cis_img[str(reso)][0].shape if rope_real else freqs_cis_img[str(reso)].shape}") + return freqs_cis_img diff --git a/hydit/modules/text_encoder.py b/hydit/modules/text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..7a16b21f4ffbd9acc896c93e2833caf5aecfa0a2 --- /dev/null +++ b/hydit/modules/text_encoder.py @@ -0,0 +1,95 @@ +import torch +import torch.nn as nn +from transformers import AutoTokenizer, T5EncoderModel, T5ForConditionalGeneration + + +class MT5Embedder(nn.Module): + available_models = ["t5-v1_1-xxl"] + + def __init__( + self, + model_dir="t5-v1_1-xxl", + model_kwargs=None, + torch_dtype=None, + use_tokenizer_only=False, + conditional_generation=False, + max_length=128, + ): + super().__init__() + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.torch_dtype = torch_dtype or torch.bfloat16 + self.max_length = max_length + if model_kwargs is None: + model_kwargs = { + # "low_cpu_mem_usage": True, + "torch_dtype": self.torch_dtype, + } + model_kwargs["device_map"] = {"shared": self.device, "encoder": self.device} + self.tokenizer = AutoTokenizer.from_pretrained(model_dir) + if use_tokenizer_only: + return + if conditional_generation: + self.model = None + self.generation_model = T5ForConditionalGeneration.from_pretrained( + model_dir + ) + return + self.model = T5EncoderModel.from_pretrained(model_dir, **model_kwargs).eval().to(self.torch_dtype) + + def get_tokens_and_mask(self, texts): + text_tokens_and_mask = self.tokenizer( + texts, + max_length=self.max_length, + padding="max_length", + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + tokens = text_tokens_and_mask["input_ids"][0] + mask = text_tokens_and_mask["attention_mask"][0] + # tokens = torch.tensor(tokens).clone().detach() + # mask = torch.tensor(mask, dtype=torch.bool).clone().detach() + return tokens, mask + + def get_text_embeddings(self, texts, attention_mask=True, layer_index=-1): + text_tokens_and_mask = self.tokenizer( + texts, + max_length=self.max_length, + padding="max_length", + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + with torch.no_grad(): + outputs = self.model( + input_ids=text_tokens_and_mask["input_ids"].to(self.device), + attention_mask=text_tokens_and_mask["attention_mask"].to(self.device) + if attention_mask + else None, + output_hidden_states=True, + ) + text_encoder_embs = outputs["hidden_states"][layer_index].detach() + + return text_encoder_embs, text_tokens_and_mask["attention_mask"].to(self.device) + + @torch.no_grad() + def __call__(self, tokens, attention_mask, layer_index=-1): + with 
torch.cuda.amp.autocast(): + outputs = self.model( + input_ids=tokens, + attention_mask=attention_mask, + output_hidden_states=True, + ) + + z = outputs.hidden_states[layer_index].detach() + return z + + def general(self, text: str): + # input_ids = input_ids = torch.tensor([list(text.encode("utf-8"))]) + num_special_tokens + input_ids = self.tokenizer(text, max_length=128).input_ids + print(input_ids) + outputs = self.generation_model(input_ids) + return outputs \ No newline at end of file diff --git a/hydit/utils/tools.py b/hydit/utils/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..66c0b033470d45573fc57b71d24d0cebcd704f26 --- /dev/null +++ b/hydit/utils/tools.py @@ -0,0 +1,17 @@ +import random + +import numpy as np +import torch + + +def set_seeds(seed_list, device=None): + if isinstance(seed_list, (tuple, list)): + seed = sum(seed_list) + else: + seed = seed_list + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + return torch.Generator(device).manual_seed(seed) diff --git a/hydit_app.py b/hydit_app.py new file mode 100644 index 0000000000000000000000000000000000000000..2d115ec7f404b4d80cbf43f7956b977b23594685 --- /dev/null +++ b/hydit_app.py @@ -0,0 +1,170 @@ +import gradio as gr +import pandas as pd +from pathlib import Path +from PIL import Image +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from hydit.constants import SAMPLER_FACTORY +from sample_t2i import inferencer + +ROOT = Path(__file__).parent.parent +SAMPLERS = list(SAMPLER_FACTORY.keys()) +SIZES = { + "square": (1024, 1024), + "landscape": (768, 1280), + "portrait": (1280, 768), +} + +def get_strings(lang): + lang_file = Path(f"app/lang/{lang}.csv") + strings = pd.read_csv(lang_file, header=0) + strings = strings.set_index("key")['value'].to_dict() + return strings + + +args, gen, enhancer = inferencer() +strings = get_strings("en") + + +def infer( + prompt, + negative_prompt, + seed, + cfg_scale, + infer_steps, + oriW, oriH, + sampler, + size, + enhance +): + if enhance and enhancer is not None: + success, enhanced_prompt = enhancer(prompt) + if not success: + fail_image = Image.open(ROOT / 'app/fail.png') + return fail_image + else: + enhanced_prompt = None + + height, width = SIZES[size] + results = gen.predict(prompt, + height=height, + width=width, + seed=seed, + enhanced_prompt=enhanced_prompt, + negative_prompt=negative_prompt, + infer_steps=infer_steps, + guidance_scale=cfg_scale, + batch_size=1, + src_size_cond=(oriW, oriH), + sampler=sampler, + ) + image = results['images'][0] + return image + + +def ui(): + block = gr.Blocks() + + description = f""" + # {strings['title']} + + ## {strings['desc']} + + """ + + with block: + with gr.Row(): + gr.Markdown(description) + with gr.Row(): + with gr.Column(): + with gr.Row(): + size = gr.Radio( + label=strings['size'], choices=[ + (strings['square'], 'square'), + (strings['landscape'], 'landscape'), + (strings['portrait'], 'portrait'), + ], + value="square" + ) + prompt = gr.Textbox(label=strings['prompt'], value=strings['default prompt'], lines=3) + with gr.Row(): + infer_steps = gr.Slider( + label=strings['infer steps'], minimum=1, maximum=200, value=100, step=1, + ) + seed = gr.Number( + label=strings['seed'], minimum=-1, maximum=1_000_000_000, value=1, step=1, precision=0, + ) + enhance = gr.Checkbox( + label=strings['enhance'], value=enhancer is not None, interactive=True, + ) + + with gr.Accordion( + strings['accordion'], open=False + ): + with gr.Row(): + 
negative_prompt = gr.Textbox(label=strings['negative_prompt'], + value=gen.default_negative_prompt, + lines=2, + ) + with gr.Row(): + sampler = gr.Dropdown(SAMPLERS, label=strings['sampler'], value="ddpm") + cfg_scale = gr.Slider( + label=strings['cfg'], minimum=1.0, maximum=16.0, value=6.0, step=1 + ) + oriW = gr.Number( + label=strings['width cond'], minimum=1024, maximum=4096, value=1024, step=64, precision=0, + min_width=80, + ) + oriH = gr.Number( + label=strings['height cond'], minimum=1024, maximum=4096, value=1024, step=64, precision=0, + min_width=80, + ) + with gr.Row(): + advanced_button = gr.Button(strings['run']) + with gr.Column(): + default_img = Image.open(ROOT / 'app/default.png') + output_img = gr.Image( + label=strings['generated image'], + interactive=False, + format='png', + value=default_img, + ) + advanced_button.click( + fn=infer, + inputs=[ + prompt, negative_prompt, seed, cfg_scale, infer_steps, + oriW, oriH, sampler, size, enhance, + ], + outputs=output_img, + ) + + with gr.Row(): + gr.Examples([ + ['一只小猫'], + ['现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景'], + ['一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影'], + ['飞流直下三千尺,疑是银河落九天'], + ['一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。'], + ['麻婆豆腐'], + ['苏州园林'], + ['一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子'], + ['请画出“忽如一夜春风来 千树万树梨花开”'], + ['请将“杞人忧天”的样子画出来'], + ['枯藤老树昏鸦,小桥流水人家'], + ['湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。'], + ['一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头'], + ['臭豆腐'], + ['九寨沟'], + ['俗语“鲤鱼跃龙门”'], + ['风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景'], + ], + [prompt], + label=strings['examples'] + ) + return block + + +if __name__ == "__main__": + interface = ui() + interface.launch() diff --git a/lang/en.csv b/lang/en.csv new file mode 100644 index 0000000000000000000000000000000000000000..f70f662bbb33b661b9260c337cc494711fb617fc --- /dev/null +++ b/lang/en.csv @@ -0,0 +1,22 @@ +key,value +size,Size +sampler,Sampler +prompt,Prompt +default prompt,"A cute cat" +negative_prompt,Negative Prompt +seed,Seed +cfg,CFG Scale +infer steps,Sampling Steps +batch size,Batch Size +width cond,Width Cond +height cond,Height Cond +enhance,Prompt Enhancement +run,Submit +square,Square(1024x1024) +landscape,Landscape(1280x768) +portrait,Portrait(768x1280) +accordion,Advanced Options +generated image,HunYuanDiT Generated Image +examples,More Examples +title,Hunyuan-DiT +desc,A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding \ No newline at end of file diff --git a/lang/zh.csv b/lang/zh.csv new file mode 100644 index 0000000000000000000000000000000000000000..62e7b4f56e33a42cc36d93b2adf3f153e58e3551 --- /dev/null +++ b/lang/zh.csv @@ -0,0 +1,22 @@ +key,value +size,尺寸 +sampler,采样器 +prompt,文本描述 +default prompt,"一只可爱的猫" +negative_prompt,负向词 +seed,种子 +cfg,CFG系数 +infer steps,采样步数 +batch size,批大小 +width cond,宽度条件 +height cond,高度条件 +enhance,文本增强 +run,提交生成 +square,方形(1024x1024) +portrait,竖屏(1280x768) +landscape,横屏(768x1280) +accordion,高级设置 +generated image,HunYuanDiT 生成 +examples,更多示例 +title,混元-DiT +desc,具有细粒度中文理解的高性能多分辨率 Diffusion Transformer 模型 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0804087683ac1ce94c10f403b3fbe7ad73d01187 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +--extra-index-url https://pypi.ngc.nvidia.com +timm==0.9.5 +diffusers==0.21.2 +peft==0.10.0 +protobuf==3.19.0 +torchvision==0.14.1 +transformers==4.37.2 +accelerate==0.29.3 
+loguru==0.7.2 +einops==0.7.0 +sentencepiece==0.1.99 +cuda-python==11.7.1 +onnxruntime==1.12.1 +onnx==1.12.0 +nvidia-pyindex==1.0.9 +onnx-graphsurgeon==0.3.27 +polygraphy==0.47.1 +pandas==2.0.3 +gradio==4.31.0 diff --git a/sample_t2i.py b/sample_t2i.py new file mode 100644 index 0000000000000000000000000000000000000000..d969e114eddfd3923a29def52d8defb08b0d03ba --- /dev/null +++ b/sample_t2i.py @@ -0,0 +1,72 @@ +from pathlib import Path + +from loguru import logger + +from dialoggen.dialoggen_demo import DialogGen +from hydit.config import get_args +from hydit.inference import End2End + + +def inferencer(): + args = get_args() + models_root_path = Path(args.model_root) + if not models_root_path.exists(): + raise ValueError(f"`models_root` not exists: {models_root_path}") + + # Load models + gen = End2End(args, models_root_path) + + # Try to enhance prompt + if args.enhance: + logger.info("Loading DialogGen model (for prompt enhancement)...") + enhancer = DialogGen(str(models_root_path / "dialoggen")) + logger.info("DialogGen model loaded.") + else: + enhancer = None + + return args, gen, enhancer + + +if __name__ == "__main__": + args, gen, enhancer = inferencer() + + if enhancer: + logger.info("Prompt Enhancement...") + success, enhanced_prompt = enhancer(args.prompt) + if not success: + logger.info("Sorry, the prompt is not compliant, refuse to draw.") + exit() + logger.info(f"Enhanced prompt: {enhanced_prompt}") + else: + enhanced_prompt = None + + # Run inference + logger.info("Generating images...") + height, width = args.image_size + results = gen.predict(args.prompt, + height=height, + width=width, + seed=args.seed, + enhanced_prompt=enhanced_prompt, + negative_prompt=args.negative, + infer_steps=args.infer_steps, + guidance_scale=args.cfg_scale, + batch_size=args.batch_size, + src_size_cond=args.size_cond, + ) + images = results['images'] + + # Save images + save_dir = Path('results') + save_dir.mkdir(exist_ok=True) + # Find the first available index + all_files = list(save_dir.glob('*.png')) + if all_files: + start = max([int(f.stem) for f in all_files]) + 1 + else: + start = 0 + + for idx, pil_img in enumerate(images): + save_path = save_dir / f"{idx + start}.png" + pil_img.save(save_path) + logger.info(f"Save to {save_path}") diff --git a/zh.csv b/zh.csv new file mode 100644 index 0000000000000000000000000000000000000000..62e7b4f56e33a42cc36d93b2adf3f153e58e3551 --- /dev/null +++ b/zh.csv @@ -0,0 +1,22 @@ +key,value +size,尺寸 +sampler,采样器 +prompt,文本描述 +default prompt,"一只可爱的猫" +negative_prompt,负向词 +seed,种子 +cfg,CFG系数 +infer steps,采样步数 +batch size,批大小 +width cond,宽度条件 +height cond,高度条件 +enhance,文本增强 +run,提交生成 +square,方形(1024x1024) +portrait,竖屏(1280x768) +landscape,横屏(768x1280) +accordion,高级设置 +generated image,HunYuanDiT 生成 +examples,更多示例 +title,混元-DiT +desc,具有细粒度中文理解的高性能多分辨率 Diffusion Transformer 模型 \ No newline at end of file
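
Editor's note: the short sketch below is illustrative and not part of the patch. It reproduces the long-skip schedule implied by HunYuanDiT.forward in hydit/modules/models.py (activations are pushed while layer < depth // 2 - 1 and popped while layer > depth // 2, U-ViT style), using the DiT-g/2 depth of 40 from HUNYUAN_DIT_CONFIG. Only those two index tests come from the patch; the helper name, the use of layer indices as stand-ins for the pushed activations, and the standalone-script form are assumptions made for illustration.

def skip_schedule(depth: int):
    # Pair each late block with the early block whose output it consumes.
    skips, pairs = [], []
    for layer in range(depth):
        if layer > depth // 2:        # late blocks pop one stored activation each
            pairs.append((layer, skips.pop()))
        if layer < depth // 2 - 1:    # early blocks push their output (index used here)
            skips.append(layer)
    return pairs

# For DiT-g/2 (depth=40) this gives 19 push/pop pairs, matched LIFO:
# block 21 reuses block 18's output, block 22 reuses block 17's, ..., block 39 reuses block 0's.
pairs = skip_schedule(40)
print(len(pairs), pairs[0], pairs[-1])   # 19 (21, 18) (39, 0)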