Shitao committed
Commit 44bc074
1 Parent(s): 200a130
OmniGen/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (325 Bytes)

OmniGen/__pycache__/model.cpython-310.pyc ADDED
Binary file (12.5 kB)

OmniGen/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (8.5 kB)

OmniGen/__pycache__/processor.cpython-310.pyc ADDED
Binary file (11.2 kB)

OmniGen/__pycache__/scheduler.cpython-310.pyc ADDED
Binary file (2.75 kB)

OmniGen/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (3.95 kB)

OmniGen/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.52 kB)

OmniGen/model.py CHANGED
@@ -5,7 +5,10 @@ import torch.nn as nn
 import numpy as np
 import math
 from typing import Dict
+
+from diffusers.loaders import PeftAdapterMixin
 from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
+from huggingface_hub import snapshot_download
 
 from OmniGen.transformer import Phi3Config, Phi3Transformer
 
@@ -145,7 +148,7 @@ class PatchEmbedMR(nn.Module):
         return x
 
 
-class OmniGen(nn.Module):
+class OmniGen(nn.Module, PeftAdapterMixin):
     """
     Diffusion model with a Transformer backbone.
     """
@@ -191,7 +194,7 @@ class OmniGen(nn.Module):
                                        ignore_patterns=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'])
         config = Phi3Config.from_pretrained(model_name)
         model = cls(config)
-        ckpt = torch.load(os.path.join(model_name, 'model.pt'))
+        ckpt = torch.load(os.path.join(model_name, 'model.pt'), map_location='cpu')
         model.load_state_dict(ckpt)
         return model
 
@@ -304,7 +307,7 @@ class OmniGen(nn.Module):
         return latents, num_tokens, shapes
 
 
-    def forward(self, x, timestep, text_ids, pixel_values, image_sizes, attention_mask, position_ids, padding_latent=None, past_key_values=None):
+    def forward(self, x, timestep, input_ids, input_img_latents, input_image_sizes, attention_mask, position_ids, padding_latent=None, past_key_values=None, return_past_key_values=True):
         """
 
         """
@@ -312,16 +315,16 @@ class OmniGen(nn.Module):
         x, num_tokens, shapes = self.patch_multiple_resolutions(x, padding_latent)
         time_token = self.time_token(timestep, dtype=x[0].dtype).unsqueeze(1)
 
-        if pixel_values is not None:
-            input_latents, _, _ = self.patch_multiple_resolutions(pixel_values, is_input_images=True)
-        if text_ids is not None:
-            condition_embeds = self.llm.embed_tokens(text_ids)
+        if input_img_latents is not None:
+            input_latents, _, _ = self.patch_multiple_resolutions(input_img_latents, is_input_images=True)
+        if input_ids is not None:
+            condition_embeds = self.llm.embed_tokens(input_ids).clone()
             input_img_inx = 0
-            for b_inx in image_sizes.keys():
-                for start_inx, end_inx in image_sizes[b_inx]:
+            for b_inx in input_image_sizes.keys():
+                for start_inx, end_inx in input_image_sizes[b_inx]:
                     condition_embeds[b_inx, start_inx: end_inx] = input_latents[input_img_inx]
                     input_img_inx += 1
-            if pixel_values is not None:
+            if input_img_latents is not None:
                 assert input_img_inx == len(input_latents)
 
             input_emb = torch.cat([condition_embeds, time_token, x], dim=1)
@@ -344,7 +347,9 @@ class OmniGen(nn.Module):
         x = self.final_layer(image_embedding, time_emb)
         latents = self.unpatchify(x, shapes[0], shapes[1])
 
-        return latents, past_key_values
+        if past_key_values:
+            return latents, past_key_values
+        return latents
 
     @torch.no_grad()
     def forward_with_cfg(self, x, timestep, input_ids, input_img_latents, input_image_sizes, attention_mask, position_ids, cfg_scale, use_img_cfg, img_cfg_scale, past_key_values, use_kv_cache):
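
Note on the `torch.load(..., map_location='cpu')` change above: loading the state dict onto CPU first keeps `OmniGen.from_pretrained` usable on machines without CUDA and avoids allocating the weights on whatever GPU the checkpoint was saved from. A minimal, self-contained sketch of the pattern (the tiny `nn.Linear` stand-in and the local `model.pt` path are illustrative, not OmniGen's real checkpoint):

import torch
import torch.nn as nn

# Stand-in model; only the loading pattern from the diff above matters here.
model = nn.Linear(4, 4)
torch.save(model.state_dict(), "model.pt")

# map_location='cpu' makes the load device-independent: a state dict saved from
# GPU tensors can be loaded on a CPU-only machine and moved to a device later.
ckpt = torch.load("model.pt", map_location="cpu")
model.load_state_dict(ckpt)
model.to("cuda" if torch.cuda.is_available() else "cpu")
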
OmniGen/pipeline.py CHANGED
@@ -6,6 +6,7 @@ from PIL import Image
 import numpy as np
 import torch
 from huggingface_hub import snapshot_download
+from peft import LoraConfig, PeftModel
 from diffusers.models import AutoencoderKL
 from diffusers.utils import (
     USE_PEFT_BACKEND,
@@ -31,7 +32,7 @@ EXAMPLE_DOC_STRING = """
         >>> prompt = "A woman holds a bouquet of flowers and faces the camera"
         >>> image = pipe(
         ...     prompt,
-        ...     guidance_scale=1.0,
+        ...     guidance_scale=3.0,
         ...     num_inference_steps=50,
         ... ).images[0]
         >>> image.save("t2i.png")
@@ -53,23 +54,42 @@ class OmniGenPipeline:
 
         self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
         self.model.to(self.device)
+        self.model.eval()
         self.vae.to(self.device)
 
     @classmethod
-    def from_pretrained(cls, model_name):
+    def from_pretrained(cls, model_name, vae_path: str=None):
         if not os.path.exists(model_name):
+            logger.info("Model not found, downloading...")
            cache_folder = os.getenv('HF_HUB_CACHE')
-            print(cache_folder)
            model_name = snapshot_download(repo_id=model_name,
                                           cache_dir=cache_folder,
                                           ignore_patterns=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'])
            logger.info(f"Downloaded model to {model_name}")
        model = OmniGen.from_pretrained(model_name)
        processor = OmniGenProcessor.from_pretrained(model_name)
-        vae = AutoencoderKL.from_pretrained(os.path.join(model_name, "vae"))
+
+        if os.path.exists(os.path.join(model_name, "vae")):
+            vae = AutoencoderKL.from_pretrained(os.path.join(model_name, "vae"))
+        elif vae_path is not None:
+            vae = AutoencoderKL.from_pretrained(vae_path).to(device)
+        else:
+            logger.info(f"No VAE found in {model_name}, downloading stabilityai/sdxl-vae from HF")
+            vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae").to(device)
 
        return cls(vae, model, processor)
 
+    def merge_lora(self, lora_path: str):
+        model = PeftModel.from_pretrained(self.model, lora_path)
+        model.merge_and_unload()
+        self.model = model
+
+    def to(self, device: Union[str, torch.device]):
+        if isinstance(device, str):
+            device = torch.device(device)
+        self.model.to(device)
+        self.vae.to(device)
+
     def vae_encode(self, x, dtype):
        if self.vae.config.shift_factor is not None:
            x = self.vae.encode(x).latent_dist.sample()
@@ -100,6 +120,7 @@ class OmniGenPipeline:
         separate_cfg_infer: bool = False,
         use_kv_cache: bool = True,
         dtype: torch.dtype = torch.bfloat16,
+        seed: int = None,
         ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -128,15 +149,18 @@ class OmniGenPipeline:
             separate_cfg_infer (`bool`, *optional*, defaults to False):
                 Perform inference on images with different guidance separately; this can save memory when generating images of large size at the expense of slower inference.
             use_kv_cache (`bool`, *optional*, defaults to True): enable kv cache to speed up the inference
-
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
         Examples:
 
         Returns:
             A list with the generated images.
         """
         assert height%16 == 0 and width%16 == 0
-        if use_kv_cache and separate_cfg_infer:
-            raise "Currently, don't support both use_kv_cache and separate_cfg_infer"
+        if separate_cfg_infer:
+            use_kv_cache = False
+            # raise "Currently, don't support both use_kv_cache and separate_cfg_infer"
         if input_images is None:
             use_img_guidance = False
         if isinstance(prompt, str):
@@ -149,7 +173,11 @@ class OmniGenPipeline:
         num_cfg = 2 if use_img_guidance else 1
         latent_size_h, latent_size_w = height//8, width//8
 
-        latents = torch.randn(num_prompt, 4, latent_size_h, latent_size_w, device=self.device)
+        if seed is not None:
+            generator = torch.Generator(device=self.device).manual_seed(seed)
+        else:
+            generator = None
+        latents = torch.randn(num_prompt, 4, latent_size_h, latent_size_w, device=self.device, generator=generator)
         latents = torch.cat([latents]*(1+num_cfg), 0).to(dtype)
 
         input_img_latents = []
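
Taken together, the pipeline changes add an optional `vae_path`, put the model in eval mode, and introduce a `merge_lora` helper, a `to(device)` method, and a `seed` argument that feeds a `torch.Generator` for the initial latents. A hedged usage sketch based only on the signatures visible in this diff; the import path and the repo id `Shitao/OmniGen-v1` are assumptions, and the call otherwise mirrors app.py:

import torch
from OmniGen import OmniGenPipeline   # import path assumed from this repo's layout

pipe = OmniGenPipeline.from_pretrained("Shitao/OmniGen-v1")   # repo id assumed
pipe.to("cuda" if torch.cuda.is_available() else "cpu")       # new convenience method in this commit
# pipe.merge_lora("path/to/lora")                             # optional: merge a PEFT LoRA into the model

images = pipe(
    "A woman holds a bouquet of flowers and faces the camera",
    height=1024,
    width=1024,
    guidance_scale=3.0,
    num_inference_steps=50,
    separate_cfg_infer=True,   # saves memory; this commit now disables the kv cache in that case
    seed=42,                   # new: seeds the torch.Generator used for the initial latents
)
images[0].save("t2i.png")      # the call returns a list of images, indexed as in app.py
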
OmniGen/processor.py CHANGED
@@ -11,28 +11,15 @@ from torchvision import transforms
 from transformers import AutoTokenizer
 from huggingface_hub import snapshot_download
 
+from OmniGen.utils import (
+    create_logger,
+    update_ema,
+    requires_grad,
+    center_crop_arr,
+    crop_arr,
+)
 
-def crop_arr(pil_image, max_image_size):
-    while min(*pil_image.size) >= 2 * max_image_size:
-        pil_image = pil_image.resize(
-            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
-        )
 
-    if max(*pil_image.size) > max_image_size:
-        scale = max_image_size / max(*pil_image.size)
-        pil_image = pil_image.resize(
-            tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
-        )
-
-    arr = np.array(pil_image)
-    crop_y1 = (arr.shape[0] % 16) // 2
-    crop_y2 = arr.shape[0] % 16 - crop_y1
-
-    crop_x1 = (arr.shape[1] % 16) // 2
-    crop_x2 = arr.shape[1] % 16 - crop_x1
-
-    arr = arr[crop_y1:arr.shape[0]-crop_y2, crop_x1:arr.shape[1]-crop_x2]
-    return Image.fromarray(arr)
 
 
 class OmniGenProcessor:
@@ -68,6 +55,7 @@ class OmniGenProcessor:
         return self.image_transform(image)
 
     def process_multi_modal_prompt(self, text, input_images):
+        text = self.add_prefix_instruction(text)
         if input_images is None or len(input_images) == 0:
             model_inputs = self.text_tokenizer(text)
             return {"input_ids": model_inputs.input_ids, "pixel_values": None, "image_sizes": None}
@@ -132,7 +120,6 @@ class OmniGenProcessor:
         for i in range(len(instructions)):
             cur_instruction = instructions[i]
             cur_input_images = None if input_images is None else input_images[i]
-            cur_instruction = self.add_prefix_instruction(cur_instruction)
             if cur_input_images is not None and len(cur_input_images) > 0:
                 cur_input_images = [self.process_image(x) for x in cur_input_images]
             else:
@@ -143,14 +130,13 @@ class OmniGenProcessor:
 
 
             neg_mllm_input, img_cfg_mllm_input = None, None
-            neg_instruction = self.add_prefix_instruction(negative_prompt)
-            neg_mllm_input = self.process_multi_modal_prompt(neg_instruction, None)
+            neg_mllm_input = self.process_multi_modal_prompt(negative_prompt, None)
            if use_img_cfg:
                if cur_input_images is not None and len(cur_input_images) >= 1:
                    img_cfg_prompt = [f"<img><|image_{i+1}|></img>" for i in range(len(cur_input_images))]
-                    img_cfg_mllm_input = self.process_multi_modal_prompt(self.add_prefix_instruction(" ".join(img_cfg_prompt)), cur_input_images)
+                    img_cfg_mllm_input = self.process_multi_modal_prompt(" ".join(img_cfg_prompt), cur_input_images)
                else:
-                    img_cfg_mllm_input = neg_instruction
+                    img_cfg_mllm_input = neg_mllm_input
 
            input_data.append((mllm_input, neg_mllm_input, img_cfg_mllm_input, [height, width]))
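
The processor change is mainly a refactor: `add_prefix_instruction` is now applied once inside `process_multi_modal_prompt`, so the main instruction, the negative prompt, and the image-CFG prompt all take the same path, and the image-CFG fallback now reuses the processed `neg_mllm_input` dict instead of the raw instruction string. A minimal, self-contained sketch of that prefix-once pattern; every name here is an illustrative stand-in, not OmniGen's real API:

# Stand-ins only, to show the shape of the refactor above.
PREFIX = "<system prompt>\n"          # plays the role of add_prefix_instruction()

def build_prompt(text: str) -> dict:
    text = PREFIX + text              # prefix applied in exactly one place
    return {"input_ids": list(text)}  # stand-in for tokenizer output

main_input = build_prompt("A cat sitting on a sofa")
neg_input = build_prompt("")          # negative prompt goes through the same path
img_cfg_input = neg_input             # fallback reuses the processed dict, not the raw string
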
 
OmniGen/scheduler.py CHANGED
@@ -1,6 +1,6 @@
 import torch
 from tqdm import tqdm
-from transformers.cache_utils import Cache, DynamicCache, OffloadedCache
+from transformers.cache_utils import Cache, DynamicCache
 
 class OmniGenScheduler:
     def __init__(self, num_steps: int=50, time_shifting_factor: int=1):
OmniGen/transformer.py CHANGED
@@ -16,7 +16,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers import Phi3Config, Phi3Model
-from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache, OffloadedCache
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
 from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
OmniGen/utils.py ADDED
@@ -0,0 +1,110 @@
+import logging
+
+from PIL import Image
+import torch
+import numpy as np
+
+def create_logger(logging_dir):
+    """
+    Create a logger that writes to a log file and stdout.
+    """
+    logging.basicConfig(
+        level=logging.INFO,
+        format='[\033[34m%(asctime)s\033[0m] %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S',
+        handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")]
+    )
+    logger = logging.getLogger(__name__)
+    return logger
+
+
+@torch.no_grad()
+def update_ema(ema_model, model, decay=0.9999):
+    """
+    Step the EMA model towards the current model.
+    """
+    ema_params = dict(ema_model.named_parameters())
+    for name, param in model.named_parameters():
+        # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
+        ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay)
+
+
+
+
+def requires_grad(model, flag=True):
+    """
+    Set requires_grad flag for all parameters in a model.
+    """
+    for p in model.parameters():
+        p.requires_grad = flag
+
+
+def center_crop_arr(pil_image, image_size):
+    """
+    Center cropping implementation from ADM.
+    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
+    """
+    while min(*pil_image.size) >= 2 * image_size:
+        pil_image = pil_image.resize(
+            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
+        )
+
+    scale = image_size / min(*pil_image.size)
+    pil_image = pil_image.resize(
+        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
+    )
+
+    arr = np.array(pil_image)
+    crop_y = (arr.shape[0] - image_size) // 2
+    crop_x = (arr.shape[1] - image_size) // 2
+    return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size])
+
+
+
+def crop_arr(pil_image, max_image_size):
+    while min(*pil_image.size) >= 2 * max_image_size:
+        pil_image = pil_image.resize(
+            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
+        )
+
+    if max(*pil_image.size) > max_image_size:
+        scale = max_image_size / max(*pil_image.size)
+        pil_image = pil_image.resize(
+            tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
+        )
+
+    if min(*pil_image.size) < 16:
+        scale = 16 / min(*pil_image.size)
+        pil_image = pil_image.resize(
+            tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
+        )
+
+    arr = np.array(pil_image)
+    crop_y1 = (arr.shape[0] % 16) // 2
+    crop_y2 = arr.shape[0] % 16 - crop_y1
+
+    crop_x1 = (arr.shape[1] % 16) // 2
+    crop_x2 = arr.shape[1] % 16 - crop_x1
+
+    arr = arr[crop_y1:arr.shape[0]-crop_y2, crop_x1:arr.shape[1]-crop_x2]
+    return Image.fromarray(arr)
+
+
+
+def vae_encode(vae, x, weight_dtype):
+    if x is not None:
+        if vae.config.shift_factor is not None:
+            x = vae.encode(x).latent_dist.sample()
+            x = (x - vae.config.shift_factor) * vae.config.scaling_factor
+        else:
+            x = vae.encode(x).latent_dist.sample().mul_(vae.config.scaling_factor)
+        x = x.to(weight_dtype)
+    return x
+
+def vae_encode_list(vae, x, weight_dtype):
+    latents = []
+    for img in x:
+        img = vae_encode(vae, img, weight_dtype)
+        latents.append(img)
+    return latents
+
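
The two crop helpers added above differ in intent: `center_crop_arr` (the ADM implementation) returns a square `image_size` crop, while `crop_arr` only shrinks an image to fit `max_image_size` and trims each side to a multiple of 16, which matches the `height%16 == 0 and width%16 == 0` assertion in the pipeline. A small sketch of the expected behaviour, assuming the `OmniGen.utils` import path added in this commit:

from PIL import Image
from OmniGen.utils import center_crop_arr, crop_arr   # module added in this commit

img = Image.new("RGB", (1000, 600))   # arbitrary size, not a multiple of 16

out = crop_arr(img, max_image_size=1024)
print(out.size)                       # (992, 592): no resize needed, sides trimmed to multiples of 16

sq = center_crop_arr(img, image_size=512)
print(sq.size)                        # (512, 512): resized then center-cropped to a square
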
app.py CHANGED
@@ -11,7 +11,7 @@ pipe = OmniGenPipeline.from_pretrained(
 
 @spaces.GPU
 # Example handler: generate an image
-def generate_image(text, img1, img2, img3, height, width, guidance_scale, inference_steps):
+def generate_image(text, img1, img2, img3, height, width, guidance_scale, inference_steps, seed):
     input_images = [img1, img2, img3]
     # Remove None entries
     input_images = [img for img in input_images if img is not None]
@@ -28,6 +28,7 @@ def generate_image(text, img1, img2, img3, height, width, guidance_scale, inference_steps):
         num_inference_steps=inference_steps,
         separate_cfg_infer=True,
         use_kv_cache=False,
+        seed=seed,
     )
     img = output[0]
     return img
@@ -54,6 +55,7 @@ def get_example():
            1024,
            3.0,
            20,
+            42,
        ],
        [
            "Three zebras are standing side by side on a vibrant savannah, each showcasing unique patterns and characteristics that highlight their individuality. The zebra on the left has a strikingly bold black and white stripe pattern, with wider stripes that create a dramatic contrast against its sleek body. In the middle, the zebra features a more subtle stripe arrangement, with thinner stripes that blend seamlessly into a slightly sandy-colored coat, giving it a softer appearance. On the right, the zebra's stripes are more irregular, with a distinct patch of brown fur near its shoulder, adding a layer of uniqueness to its overall look. Together, these zebras create a captivating scene, each representing the diverse beauty of their species in the wild. The right zebras is the zebras from <img><|image_1|></img>. The center zebras is from <img><|image_2|></img>. The left zebras is the zebras from <img><|image_3|></img>.",
@@ -64,22 +66,23 @@ def get_example():
            1024,
            3.0,
            20,
+            42,
        ],
    ]
    return case
 
-def run_for_examples(text, img1, img2, img3, height, width, guidance_scale, inference_steps):
-    return generate_image(text, img1, img2, img3, height, width, guidance_scale, inference_steps)
+def run_for_examples(text, img1, img2, img3, height, width, guidance_scale, inference_steps, seed):
+    return generate_image(text, img1, img2, img3, height, width, guidance_scale, inference_steps, seed)
 
 
 # Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Text + Multiple Images to Image Generator")
+    gr.Markdown("# OmniGen: Unified Image Generation")
     with gr.Row():
         with gr.Column():
            # Text input box
            prompt_input = gr.Textbox(
-                label="Enter your prompt", placeholder="Type your prompt here..."
+                label="Enter your prompt, use <img><|image_i|></img> tokens for images", placeholder="Type your prompt here..."
            )
 
            with gr.Row(equal_height=True):
@@ -105,6 +108,10 @@ with gr.Blocks() as demo:
                label="Inference Steps", minimum=1, maximum=50, value=50, step=1
            )
 
+            seed_input = gr.Slider(
+                label="Seed", minimum=0, maximum=2147483647, value=42, step=1
+            )
+
            # Generate button
            generate_button = gr.Button("Generate Image")
 
@@ -124,6 +131,7 @@ with gr.Blocks() as demo:
            width_input,
            guidance_scale_input,
            num_inference_steps,
+            seed_input,
        ],
        outputs=output_image,
    )
@@ -140,6 +148,7 @@ with gr.Blocks() as demo:
            width_input,
            guidance_scale_input,
            num_inference_steps,
+            seed_input,
        ],
        outputs=output_image,
    )