Spaces:

Shitao
/

OmniGen

Running on Zero

App Files Files Community

yrr committed 21 days ago

Commit

7f48662

•

1 Parent(s): 2c2ec6c

test

Browse files

Files changed (16) hide show

OmniGen/__init__.py +4 -0
OmniGen/model.py +402 -0
OmniGen/pipeline.py +201 -0
OmniGen/processor.py +349 -0
OmniGen/scheduler.py +55 -0
OmniGen/train.py +0 -0
OmniGen/transformer.py +159 -0
app.py +59 -145
edit.png +0 -0
imgs/.DS_Store +3 -0
imgs/test_cases/liuyifei.png +0 -0
imgs/test_cases/taylor.png +0 -0
imgs/test_cases/trump.png +0 -0
imgs/test_cases/turing.png +0 -0
inference.ipynb +0 -0
setup.py +23 -0

OmniGen/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .model import OmniGen
+from .processor import OmniGenProcessor
+from .scheduler import OmniGenScheduler
+from .pipeline import OmniGenPipeline

OmniGen/model.py ADDED Viewed

	@@ -0,0 +1,402 @@

+# The code is revised from DiT
+import os
+import torch
+import torch.nn as nn
+import numpy as np
+import math
+from typing import Dict
+from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
+from OmniGen.transformer import Phi3Config, Phi3Transformer
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                          These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t, dtype=torch.float32):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class FinalLayer(nn.Module):
+    """
+    The final layer of DiT.
+    """
+    def __init__(self, hidden_size, patch_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=1):
+    """
+    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
+    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_size = (grid_size, grid_size)
+    grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
+    grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token and extra_tokens > 0:
+        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+    emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out) # (M, D/2)
+    emb_cos = np.cos(out) # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+class PatchEmbedMR(nn.Module):
+    """ 2D Image to Patch Embedding
+    """
+    def __init__(
+            self,
+            patch_size: int = 2,
+            in_chans: int = 4,
+            embed_dim: int = 768,
+            bias: bool = True,
+    ):
+        super().__init__()
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
+    def forward(self, x):
+        x = self.proj(x)
+        x = x.flatten(2).transpose(1, 2)  # NCHW -> NLC
+        return x
+class OmniGen(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+    def __init__(
+        self,
+        transformer_config: Phi3Config,
+        patch_size=2,
+        in_channels=4,
+        pe_interpolation: float = 1.0,
+        pos_embed_max_size: int = 192,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.patch_size = patch_size
+        self.pos_embed_max_size = pos_embed_max_size
+        hidden_size = transformer_config.hidden_size
+        self.x_embedder = PatchEmbedMR(patch_size, in_channels, hidden_size, bias=True)
+        self.input_x_embedder = PatchEmbedMR(patch_size, in_channels, hidden_size, bias=True)
+        self.time_token = TimestepEmbedder(hidden_size)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        self.pe_interpolation = pe_interpolation
+        pos_embed = get_2d_sincos_pos_embed(hidden_size, pos_embed_max_size, interpolation_scale=self.pe_interpolation, base_size=64)
+        self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=True)
+        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
+        self.initialize_weights()
+        self.llm = Phi3Transformer(config=transformer_config)
+        self.llm.config.use_cache = False
+    @classmethod
+    def from_pretrained(cls, model_name):
+        if not os.path.exists(os.path.join(model_name, 'model.pt')):
+            cache_folder = os.getenv('HF_HUB_CACHE')
+            model_name = snapshot_download(repo_id=model_name,
+                                           cache_dir=cache_folder,
+                                           ignore_patterns=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'])
+        config = Phi3Config.from_pretrained(model_name)
+        model = cls(config)
+        ckpt = torch.load(os.path.join(model_name, 'model.pt'))
+        model.load_state_dict(ckpt)
+        return model
+    def initialize_weights(self):
+        assert not hasattr(self, "llama")
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+        w = self.x_embedder.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.x_embedder.proj.bias, 0)
+        w = self.input_x_embedder.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.x_embedder.proj.bias, 0)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.time_token.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.time_token.mlp[2].weight, std=0.02)
+        # Zero-out output layers:
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+    def unpatchify(self, x, h, w):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.out_channels
+        x = x.reshape(shape=(x.shape[0], h//self.patch_size, w//self.patch_size, self.patch_size, self.patch_size, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        imgs = x.reshape(shape=(x.shape[0], c, h, w))
+        return imgs
+    def cropped_pos_embed(self, height, width):
+        """Crops positional embeddings for SD3 compatibility."""
+        if self.pos_embed_max_size is None:
+            raise ValueError("`pos_embed_max_size` must be set for cropping.")
+        height = height // self.patch_size
+        width = width // self.patch_size
+        if height > self.pos_embed_max_size:
+            raise ValueError(
+                f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+        if width > self.pos_embed_max_size:
+            raise ValueError(
+                f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+        top = (self.pos_embed_max_size - height) // 2
+        left = (self.pos_embed_max_size - width) // 2
+        spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
+        spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
+        # print(top, top + height, left, left + width, spatial_pos_embed.size())
+        spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
+        return spatial_pos_embed
+    def patch_multiple_resolutions(self, latents, padding_latent=None, is_input_images:bool=False):
+        if isinstance(latents, list):
+            return_list = False
+            if padding_latent is None:
+                padding_latent = [None] * len(latents)
+                return_list = True
+            patched_latents, num_tokens, shapes = [], [], []
+            for latent, padding in zip(latents, padding_latent):
+                height, width = latent.shape[-2:]
+                if is_input_images:
+                    latent = self.input_x_embedder(latent)
+                else:
+                    latent = self.x_embedder(latent)
+                pos_embed = self.cropped_pos_embed(height, width)
+                latent = latent + pos_embed
+                if padding is not None:
+                    latent = torch.cat([latent, padding], dim=-2)
+                patched_latents.append(latent)
+                num_tokens.append(pos_embed.size(1))
+                shapes.append([height, width])
+            if not return_list:
+                latents = torch.cat(patched_latents, dim=0)
+            else:
+                latents = patched_latents
+        else:
+            height, width = latents.shape[-2:]
+            if is_input_images:
+                latents = self.input_x_embedder(latents)
+            else:
+                latents = self.x_embedder(latents)
+            pos_embed = self.cropped_pos_embed(height, width)
+            latents = latents + pos_embed
+            num_tokens = latents.size(1)
+            shapes = [height, width]
+        return latents, num_tokens, shapes
+    def forward(self, x, timestep, text_ids, pixel_values, image_sizes, attention_mask, position_ids, padding_latent=None, past_key_values=None):
+        """
+        """
+        input_is_list = isinstance(x, list)
+        x, num_tokens, shapes = self.patch_multiple_resolutions(x, padding_latent)
+        time_token = self.time_token(timestep, dtype=x[0].dtype).unsqueeze(1)
+        if pixel_values is not None:
+            input_latents, _, _ = self.patch_multiple_resolutions(pixel_values, is_input_images=True)
+        if text_ids is not None:
+            condition_embeds = self.llm.embed_tokens(text_ids)
+            input_img_inx = 0
+            for b_inx in image_sizes.keys():
+                for start_inx, end_inx in image_sizes[b_inx]:
+                    condition_embeds[b_inx, start_inx: end_inx] = input_latents[input_img_inx]
+                    input_img_inx += 1
+            if pixel_values is not None:
+                assert input_img_inx == len(input_latents)
+            input_emb = torch.cat([condition_embeds, time_token, x], dim=1)
+        else:
+            input_emb = torch.cat([time_token, x], dim=1)
+        output = self.llm(inputs_embeds=input_emb, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values)
+        output, past_key_values = output.last_hidden_state, output.past_key_values
+        if input_is_list:
+            image_embedding = output[:, -max(num_tokens):]
+            time_emb = self.t_embedder(timestep, dtype=x.dtype)
+            x = self.final_layer(image_embedding, time_emb)
+            latents = []
+            for i in range(x.size(0)):
+                latent = x[i:i+1, :num_tokens[i]]
+                latent = self.unpatchify(latent, shapes[i][0], shapes[i][1])
+                latents.append(latent)
+        else:
+            image_embedding = output[:, -num_tokens:]
+            time_emb = self.t_embedder(timestep, dtype=x.dtype)
+            x = self.final_layer(image_embedding, time_emb)
+            latents = self.unpatchify(x, shapes[0], shapes[1])
+        return latents, past_key_values
+    @torch.no_grad()
+    def forward_with_cfg(self, x, timestep, input_ids, input_img_latents, input_image_sizes, attention_mask, position_ids, cfg_scale, use_img_cfg, img_cfg_scale, past_key_values, use_kv_cache):
+        """
+        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
+        """
+        self.llm.config.use_cache = use_kv_cache
+        model_out, past_key_values = self.forward(x, timestep, input_ids, input_img_latents, input_image_sizes, attention_mask, position_ids, past_key_values=past_key_values)
+        if use_img_cfg:
+            cond, uncond, img_cond = torch.split(model_out, len(model_out) // 3, dim=0)
+            cond = uncond + img_cfg_scale * (img_cond - uncond) + cfg_scale * (cond - img_cond)
+            model_out = [cond, cond, cond]
+        else:
+            cond, uncond = torch.split(model_out, len(model_out) // 2, dim=0)
+            cond = uncond + cfg_scale * (cond - uncond)
+            model_out = [cond, cond]
+        return torch.cat(model_out, dim=0), past_key_values
+    @torch.no_grad()
+    def forward_with_separate_cfg(self, x, timestep, input_ids, input_img_latents, input_image_sizes, attention_mask, position_ids, cfg_scale, use_img_cfg, img_cfg_scale, past_key_values, use_kv_cache):
+        """
+        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
+        """
+        self.llm.config.use_cache = use_kv_cache
+        if past_key_values is None:
+            past_key_values = [None] * len(attention_mask)
+        x = torch.split(x, len(x) // len(attention_mask), dim=0)
+        timestep = timestep.to(x[0].dtype)
+        timestep = torch.split(timestep, len(timestep) // len(input_ids), dim=0)
+        model_out, pask_key_values = [], []
+        for i in range(len(input_ids)):
+            temp_out, temp_pask_key_values = self.forward(x[i], timestep[i], input_ids[i], input_img_latents[i], input_image_sizes[i], attention_mask[i], position_ids[i], past_key_values[i])
+            model_out.append(temp_out)
+            pask_key_values.append(temp_pask_key_values)
+        if len(model_out) == 3:
+            cond, uncond, img_cond = model_out
+            cond = uncond + img_cfg_scale * (img_cond - uncond) + cfg_scale * (cond - img_cond)
+            model_out = [cond, cond, cond]
+        elif len(model_out) == 2:
+            cond, uncond = model_out
+            cond = uncond + cfg_scale * (cond - uncond)
+            model_out = [cond, cond]
+        else:
+            return model_out[0]
+        return torch.cat(model_out, dim=0), pask_key_values

OmniGen/pipeline.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import os
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+from PIL import Image
+import numpy as np
+import torch
+from huggingface_hub import snapshot_download
+from diffusers.models import AutoencoderKL
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from OmniGen import OmniGen, OmniGenProcessor, OmniGenScheduler
+logger = logging.get_logger(__name__)
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from OmniGen import OmniGenPipeline
+        >>> pipe = FluxControlNetPipeline.from_pretrained(
+        ...     base_model
+        ... )
+        >>> prompt = "A woman holds a bouquet of flowers and faces the camera"
+        >>> image = pipe(
+        ...     prompt,
+        ...     guidance_scale=1.0,
+        ...     num_inference_steps=50,
+        ... ).images[0]
+        >>> image.save("t2i.png")
+        ```
+"""
+class OmniGenPipeline:
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        model: OmniGen,
+        processor: OmniGenProcessor,
+    ):
+        self.vae = vae
+        self.model = model
+        self.processor = processor
+        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        self.model.to(self.device)
+        self.vae.to(self.device)
+    @classmethod
+    def from_pretrained(cls, model_name):
+        if not os.path.exists(model_name):
+            cache_folder = os.getenv('HF_HUB_CACHE')
+            print(cache_folder)
+            model_name = snapshot_download(repo_id=model_name,
+                                           cache_dir=cache_folder,
+                                           ignore_patterns=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'])
+            logger.info(f"Downloaded model to {model_name}")
+        model = OmniGen.from_pretrained(model_name)
+        processor = OmniGenProcessor.from_pretrained(model_name)
+        vae = AutoencoderKL.from_pretrained(os.path.join(model_name, "vae"))
+        return cls(vae, model, processor)
+    def vae_encode(self, x, dtype):
+        if self.vae.config.shift_factor is not None:
+            x = self.vae.encode(x).latent_dist.sample()
+            x = (x - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+        else:
+            x = self.vae.encode(x).latent_dist.sample().mul_(self.vae.config.scaling_factor)
+        x = x.to(dtype)
+        return x
+    def move_to_device(self, data):
+        if isinstance(data, list):
+            return [x.to(self.device) for x in data]
+        return data.to(self.device)
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        input_images: Union[List[str], List[List[str]]] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 3,
+        use_img_guidance: bool = True,
+        img_guidance_scale: float = 1.6,
+        separate_cfg_infer: bool = False,
+        use_kv_cache: bool = True,
+        dtype: torch.dtype = torch.bfloat16,
+        ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            input_images (`List[str]` or `List[List[str]]`, *optional*):
+                The list of input images. We will replace the "<|image_i|>" in prompt with the 1-th image in list.
+            height (`int`, *optional*, defaults to 1024):
+                The height in pixels of the generated image. The number must be a multiple of 16.
+            width (`int`, *optional*, defaults to 1024):
+                The width in pixels of the generated image. The number must be a multiple of 16.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            use_img_guidance (`bool`, *optional*, defaults to True):
+                Defined as equation 3 in [Instrucpix2pix](https://arxiv.org/pdf/2211.09800).
+            img_guidance_scale (`float`, *optional*, defaults to 1.6):
+                Defined as equation 3 in [Instrucpix2pix](https://arxiv.org/pdf/2211.09800).
+            separate_cfg_infer (`bool`, *optional*, defaults to False):
+                Perform inference on images with different guidance separately; this can save memory when generating images of large size at the expense of slower inference.
+            use_kv_cache (`bool`, *optional*, defaults to True): enable kv cache to speed up the inference
+        Examples:
+        Returns:
+            A list with the generated images.
+        """
+        assert height%16 == 0 and width%16 == 0
+        if use_kv_cache and separate_cfg_infer:
+            raise "Currently, don't support both use_kv_cache and separate_cfg_infer"
+        if input_images is None:
+            use_img_guidance = False
+        if isinstance(prompt, str):
+            prompt = [prompt]
+            input_images = [input_images] if input_images is not None else None
+        input_data = self.processor(prompt, input_images, height=height, width=width, use_img_cfg=use_img_guidance, separate_cfg_input=separate_cfg_infer)
+        num_prompt = len(prompt)
+        num_cfg = 2 if use_img_guidance else 1
+        latent_size_h, latent_size_w = height//8, width//8
+        latents = torch.randn(num_prompt, 4, latent_size_h, latent_size_w, device=self.device)
+        latents = torch.cat([latents]*(1+num_cfg), 0).to(dtype)
+        input_img_latents = []
+        if separate_cfg_infer:
+            for temp_pixel_values in input_data['input_pixel_values']:
+                temp_input_latents = []
+                for img in temp_pixel_values:
+                    img = self.vae_encode(img.to(self.device), dtype)
+                    temp_input_latents.append(img)
+                input_img_latents.append(temp_input_latents)
+        else:
+            for img in input_data['input_pixel_values']:
+                img = self.vae_encode(img.to(self.device), dtype)
+                input_img_latents.append(img)
+        model_kwargs = dict(input_ids=self.move_to_device(input_data['input_ids']),
+            input_img_latents=input_img_latents,
+            input_image_sizes=input_data['input_image_sizes'],
+            attention_mask=self.move_to_device(input_data["attention_mask"]),
+            position_ids=self.move_to_device(input_data["position_ids"]),
+            cfg_scale=guidance_scale,
+            img_cfg_scale=img_guidance_scale,
+            use_img_cfg=use_img_guidance,
+            use_kv_cache=use_kv_cache)
+        if separate_cfg_infer:
+            func = self.model.forward_with_separate_cfg
+        else:
+            func = self.model.forward_with_cfg
+        self.model.to(dtype)
+        scheduler = OmniGenScheduler(num_steps=num_inference_steps)
+        samples = scheduler(latents, func, model_kwargs, use_kv_cache=use_kv_cache)
+        samples = samples.chunk((1+num_cfg), dim=0)[0]
+        samples = samples.to(torch.float32)
+        if self.vae.config.shift_factor is not None:
+            samples = samples / self.vae.config.scaling_factor + self.vae.config.shift_factor
+        else:
+            samples = samples / self.vae.config.scaling_factor
+        samples = self.vae.decode(samples).sample
+        output_samples = (samples * 0.5 + 0.5).clamp(0, 1)*255
+        output_samples = output_samples.permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
+        output_images = []
+        for i, sample in enumerate(output_samples):
+            output_images.append(Image.fromarray(sample))
+        return output_images

OmniGen/processor.py ADDED Viewed

	@@ -0,0 +1,349 @@

+import os
+import re
+from typing import Dict, List
+import json
+import torch
+import numpy as np
+import random
+from PIL import Image
+from torchvision import transforms
+from transformers import AutoTokenizer
+from huggingface_hub import snapshot_download
+def crop_arr(pil_image, max_image_size):
+    while min(*pil_image.size) >= 2 * max_image_size:
+        pil_image = pil_image.resize(
+            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
+        )
+    if max(*pil_image.size) > max_image_size:
+        scale = max_image_size / max(*pil_image.size)
+        pil_image = pil_image.resize(
+            tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
+        )
+    arr = np.array(pil_image)
+    crop_y1 = (arr.shape[0] % 16) // 2
+    crop_y2 = arr.shape[0] % 16 - crop_y1
+    crop_x1 = (arr.shape[1] % 16) // 2
+    crop_x2 = arr.shape[1] % 16 - crop_x1
+    arr = arr[crop_y1:arr.shape[0]-crop_y2, crop_x1:arr.shape[1]-crop_x2]
+    return Image.fromarray(arr)
+class OmniGenProcessor:
+    def __init__(self,
+                text_tokenizer,
+                max_image_size: int=1024):
+        self.text_tokenizer = text_tokenizer
+        self.max_image_size = max_image_size
+        self.image_transform = transforms.Compose([
+            transforms.Lambda(lambda pil_image: crop_arr(pil_image, max_image_size)),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+        ])
+        self.collator = OmniGenCollator()
+        self.separate_collator = OmniGenSeparateCollator()
+    @classmethod
+    def from_pretrained(cls, model_name):
+        if not os.path.exists(model_name):
+            cache_folder = os.getenv('HF_HUB_CACHE')
+            model_name = snapshot_download(repo_id=model_name,
+                                           cache_dir=cache_folder,
+                                           allow_patterns="*.json")
+        text_tokenizer = AutoTokenizer.from_pretrained(model_name)
+        return cls(text_tokenizer)
+    def process_image(self, image):
+        image = Image.open(image).convert('RGB')
+        return self.image_transform(image)
+    def process_multi_modal_prompt(self, text, input_images):
+        if input_images is None or len(input_images) == 0:
+            model_inputs = self.text_tokenizer(text)
+            return {"input_ids": model_inputs.input_ids, "pixel_values": None, "image_sizes": None}
+        pattern = r"<\|image_\d+\|>"
+        prompt_chunks = [self.text_tokenizer(chunk).input_ids for chunk in re.split(pattern, text)]
+        for i in range(1, len(prompt_chunks)):
+            if prompt_chunks[i][0] == 1:
+                prompt_chunks[i] = prompt_chunks[i][1:]
+        image_tags = re.findall(pattern, text)
+        image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
+        unique_image_ids = sorted(list(set(image_ids)))
+        assert unique_image_ids == list(range(1, len(unique_image_ids)+1)), f"image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be {unique_image_ids}"
+        # total images must be the same as the number of image tags
+        assert len(unique_image_ids) == len(input_images), f"total images must be the same as the number of image tags, got {len(unique_image_ids)} image tags and {len(input_images)} images"
+        input_images = [input_images[x-1] for x in image_ids]
+        all_input_ids = []
+        img_inx = []
+        idx = 0
+        for i in range(len(prompt_chunks)):
+            all_input_ids.extend(prompt_chunks[i])
+            if i != len(prompt_chunks) -1:
+                start_inx = len(all_input_ids)
+                size = input_images[i].size(-2) *  input_images[i].size(-1) // 16 // 16
+                img_inx.append([start_inx, start_inx+size])
+                all_input_ids.extend([0]*size)
+        return {"input_ids": all_input_ids, "pixel_values": input_images, "image_sizes": img_inx}
+    def add_prefix_instruction(self, prompt):
+        user_prompt = '<|user|>\n'
+        generation_prompt = 'Generate an image according to the following instructions\n'
+        assistant_prompt = '<|assistant|>\n<|diffusion|>'
+        prompt_suffix = "<|end|>\n"
+        prompt = f"{user_prompt}{generation_prompt}{prompt}{prompt_suffix}{assistant_prompt}"
+        return prompt
+    def __call__(self,
+                instructions: List[str],
+                input_images: List[List[str]] = None,
+                height: int = 1024,
+                width: int = 1024,
+                negative_prompt: str = "low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers.",
+                use_img_cfg: bool = True,
+                separate_cfg_input: bool = False,
+                ) -> Dict:
+        if input_images is None:
+            use_img_cfg = False
+        if isinstance(instructions, str):
+            instructions = [instructions]
+            input_images = [input_images]
+        input_data = []
+        for i in range(len(instructions)):
+            cur_instruction = instructions[i]
+            cur_input_images = None if input_images is None else input_images[i]
+            cur_instruction = self.add_prefix_instruction(cur_instruction)
+            if cur_input_images is not None and len(cur_input_images) > 0:
+                cur_input_images = [self.process_image(x) for x in cur_input_images]
+            else:
+                cur_input_images = None
+                assert "<img><|image_1|></img>" not in cur_instruction
+            mllm_input = self.process_multi_modal_prompt(cur_instruction, cur_input_images)
+            neg_mllm_input, img_cfg_mllm_input = None, None
+            neg_instruction = self.add_prefix_instruction(negative_prompt)
+            neg_mllm_input = self.process_multi_modal_prompt(neg_instruction, None)
+            if use_img_cfg:
+                if cur_input_images is not None and len(cur_input_images) >= 1:
+                    img_cfg_prompt = [f"<img><|image_{i+1}|></img>" for i in range(len(cur_input_images))]
+                    img_cfg_mllm_input = self.process_multi_modal_prompt(self.add_prefix_instruction(" ".join(img_cfg_prompt)), cur_input_images)
+                else:
+                    img_cfg_mllm_input = neg_instruction
+            input_data.append((mllm_input, neg_mllm_input, img_cfg_mllm_input, [height, width]))
+        if separate_cfg_input:
+            return self.separate_collator(input_data)
+        return self.collator(input_data)
+class OmniGenCollator:
+    def __init__(self, pad_token_id=2, hidden_size=3072):
+        self.pad_token_id = pad_token_id
+        self.hidden_size = hidden_size
+    def create_position(self, attention_mask, num_tokens_for_output_images):
+        position_ids = []
+        text_length = attention_mask.size(-1)
+        img_length = max(num_tokens_for_output_images)
+        for mask in attention_mask:
+            temp_l = torch.sum(mask)
+            temp_position = [0]*(text_length-temp_l) + [i for i in range(temp_l+img_length+1)] # we add a time embedding into the sequence, so add one more token
+            position_ids.append(temp_position)
+        return torch.LongTensor(position_ids)
+    def create_mask(self, attention_mask, num_tokens_for_output_images):
+        extended_mask = []
+        padding_images = []
+        text_length = attention_mask.size(-1)
+        img_length = max(num_tokens_for_output_images)
+        seq_len = text_length + img_length + 1 # we add a time embedding into the sequence, so add one more token
+        inx = 0
+        for mask in attention_mask:
+            temp_l = torch.sum(mask)
+            pad_l = text_length - temp_l
+            temp_mask = torch.tril(torch.ones(size=(temp_l+1, temp_l+1)))
+            image_mask = torch.zeros(size=(temp_l+1, img_length))
+            temp_mask = torch.cat([temp_mask, image_mask], dim=-1)
+            image_mask = torch.ones(size=(img_length, temp_l+img_length+1))
+            temp_mask = torch.cat([temp_mask, image_mask], dim=0)
+            if pad_l > 0:
+                pad_mask = torch.zeros(size=(temp_l+1+img_length, pad_l))
+                temp_mask = torch.cat([pad_mask, temp_mask], dim=-1)
+                pad_mask = torch.ones(size=(pad_l, seq_len))
+                temp_mask = torch.cat([pad_mask, temp_mask], dim=0)
+            true_img_length = num_tokens_for_output_images[inx]
+            pad_img_length = img_length - true_img_length
+            if pad_img_length > 0:
+                temp_mask[:, -pad_img_length:] = 0
+                temp_padding_imgs = torch.zeros(size=(1, pad_img_length, self.hidden_size))
+            else:
+                temp_padding_imgs = None
+            extended_mask.append(temp_mask.unsqueeze(0))
+            padding_images.append(temp_padding_imgs)
+            inx += 1
+        return torch.cat(extended_mask, dim=0), padding_images
+    def adjust_attention_for_input_images(self, attention_mask, image_sizes):
+        for b_inx in image_sizes.keys():
+            for start_inx, end_inx in image_sizes[b_inx]:
+                attention_mask[b_inx][start_inx:end_inx, start_inx:end_inx] = 1
+        return attention_mask
+    def pad_input_ids(self, input_ids, image_sizes):
+        max_l = max([len(x) for x in input_ids])
+        padded_ids = []
+        attention_mask = []
+        new_image_sizes = []
+        for i in range(len(input_ids)):
+            temp_ids = input_ids[i]
+            temp_l = len(temp_ids)
+            pad_l = max_l - temp_l
+            if pad_l == 0:
+                attention_mask.append([1]*max_l)
+                padded_ids.append(temp_ids)
+            else:
+                attention_mask.append([0]*pad_l+[1]*temp_l)
+                padded_ids.append([self.pad_token_id]*pad_l+temp_ids)
+            if i in image_sizes:
+                new_inx = []
+                for old_inx in image_sizes[i]:
+                    new_inx.append([x+pad_l for x in old_inx])
+                image_sizes[i] = new_inx
+        return torch.LongTensor(padded_ids), torch.LongTensor(attention_mask), image_sizes
+    def process_mllm_input(self, mllm_inputs, target_img_size):
+        num_tokens_for_output_images = []
+        for img_size in target_img_size:
+            num_tokens_for_output_images.append(img_size[0]*img_size[1]//16//16)
+        pixel_values, image_sizes = [], {}
+        b_inx = 0
+        for x in mllm_inputs:
+            if x['pixel_values'] is not None:
+                pixel_values.extend(x['pixel_values'])
+                for size in x['image_sizes']:
+                    if b_inx not in image_sizes:
+                        image_sizes[b_inx] = [size]
+                    else:
+                        image_sizes[b_inx].append(size)
+            b_inx += 1
+        pixel_values = [x.unsqueeze(0) for x in pixel_values]
+        input_ids = [x['input_ids'] for x in mllm_inputs]
+        padded_input_ids, attention_mask, image_sizes = self.pad_input_ids(input_ids, image_sizes)
+        position_ids = self.create_position(attention_mask, num_tokens_for_output_images)
+        attention_mask, padding_images = self.create_mask(attention_mask, num_tokens_for_output_images)
+        attention_mask = self.adjust_attention_for_input_images(attention_mask, image_sizes)
+        return padded_input_ids, position_ids, attention_mask, padding_images, pixel_values, image_sizes
+    def __call__(self, features):
+        mllm_inputs = [f[0] for f in features]
+        cfg_mllm_inputs = [f[1] for f in features]
+        img_cfg_mllm_input = [f[2] for f in features]
+        target_img_size = [f[3] for f in features]
+        if img_cfg_mllm_input[0] is not None:
+            mllm_inputs = mllm_inputs + cfg_mllm_inputs + img_cfg_mllm_input
+            target_img_size = target_img_size + target_img_size + target_img_size
+        else:
+            mllm_inputs = mllm_inputs + cfg_mllm_inputs
+            target_img_size = target_img_size + target_img_size
+        all_padded_input_ids, all_position_ids, all_attention_mask, all_padding_images, all_pixel_values, all_image_sizes = self.process_mllm_input(mllm_inputs, target_img_size)
+        data = {"input_ids": all_padded_input_ids,
+        "attention_mask": all_attention_mask,
+        "position_ids": all_position_ids,
+        "input_pixel_values": all_pixel_values,
+        "input_image_sizes": all_image_sizes,
+        "padding_images": all_padding_images,
+        }
+        return data
+class OmniGenSeparateCollator(OmniGenCollator):
+    def __call__(self, features):
+        mllm_inputs = [f[0] for f in features]
+        cfg_mllm_inputs = [f[1] for f in features]
+        img_cfg_mllm_input = [f[2] for f in features]
+        target_img_size = [f[3] for f in features]
+        all_padded_input_ids, all_attention_mask, all_position_ids, all_pixel_values, all_image_sizes, all_padding_images = [], [], [], [], [], []
+        padded_input_ids, position_ids, attention_mask, padding_images, pixel_values, image_sizes = self.process_mllm_input(mllm_inputs, target_img_size)
+        all_padded_input_ids.append(padded_input_ids)
+        all_attention_mask.append(attention_mask)
+        all_position_ids.append(position_ids)
+        all_pixel_values.append(pixel_values)
+        all_image_sizes.append(image_sizes)
+        all_padding_images.append(padding_images)
+        if cfg_mllm_inputs[0] is not None:
+            padded_input_ids, position_ids, attention_mask, padding_images, pixel_values, image_sizes = self.process_mllm_input(cfg_mllm_inputs, target_img_size)
+            all_padded_input_ids.append(padded_input_ids)
+            all_attention_mask.append(attention_mask)
+            all_position_ids.append(position_ids)
+            all_pixel_values.append(pixel_values)
+            all_image_sizes.append(image_sizes)
+            all_padding_images.append(padding_images)
+        if img_cfg_mllm_input[0] is not None:
+            padded_input_ids, position_ids, attention_mask, padding_images, pixel_values, image_sizes = self.process_mllm_input(img_cfg_mllm_input, target_img_size)
+            all_padded_input_ids.append(padded_input_ids)
+            all_attention_mask.append(attention_mask)
+            all_position_ids.append(position_ids)
+            all_pixel_values.append(pixel_values)
+            all_image_sizes.append(image_sizes)
+            all_padding_images.append(padding_images)
+        data = {"input_ids": all_padded_input_ids,
+        "attention_mask": all_attention_mask,
+        "position_ids": all_position_ids,
+        "input_pixel_values": all_pixel_values,
+        "input_image_sizes": all_image_sizes,
+        "padding_images": all_padding_images,
+        }
+        return data

OmniGen/scheduler.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import torch
+from tqdm import tqdm
+from transformers.cache_utils import Cache, DynamicCache, OffloadedCache
+class OmniGenScheduler:
+    def __init__(self, num_steps: int=50, time_shifting_factor: int=1):
+        self.num_steps = num_steps
+        self.time_shift = time_shifting_factor
+        t = torch.linspace(0, 1, num_steps+1)
+        t = t / (t + time_shifting_factor - time_shifting_factor * t)
+        self.sigma = t
+    def crop_kv_cache(self, past_key_values, num_tokens_for_img):
+        crop_past_key_values = ()
+        for layer_idx in range(len(past_key_values)):
+            key_states, value_states = past_key_values[layer_idx][:2]
+            crop_past_key_values += ((key_states[..., :-(num_tokens_for_img+1), :], value_states[..., :-(num_tokens_for_img+1), :], ),)
+        return crop_past_key_values
+        # return DynamicCache.from_legacy_cache(crop_past_key_values)
+    def crop_position_ids_for_cache(self, position_ids, num_tokens_for_img):
+        if isinstance(position_ids, list):
+            for i in range(len(position_ids)):
+                position_ids[i] = position_ids[i][:, -(num_tokens_for_img+1):]
+        else:
+            position_ids = position_ids[:, -(num_tokens_for_img+1):]
+        return position_ids
+    def crop_attention_mask_for_cache(self, attention_mask, num_tokens_for_img):
+        if isinstance(attention_mask, list):
+            return [x[..., -(num_tokens_for_img+1):, :] for x in attention_mask]
+        return attention_mask[..., -(num_tokens_for_img+1):, :]
+    def __call__(self, z, func, model_kwargs, use_kv_cache: bool=True):
+        past_key_values = None
+        for i in tqdm(range(self.num_steps)):
+            timesteps = torch.zeros(size=(len(z), )).to(z.device) + self.sigma[i]
+            pred, temp_past_key_values = func(z, timesteps, past_key_values=past_key_values, **model_kwargs)
+            sigma_next = self.sigma[i+1]
+            sigma = self.sigma[i]
+            z = z + (sigma_next - sigma) * pred
+            if i == 0 and use_kv_cache:
+                num_tokens_for_img = z.size(-1)*z.size(-2) // 4
+                if isinstance(temp_past_key_values, list):
+                    past_key_values = [self.crop_kv_cache(x, num_tokens_for_img) for x in temp_past_key_values]
+                    model_kwargs['input_ids'] = [None] * len(temp_past_key_values)
+                else:
+                    past_key_values = self.crop_kv_cache(temp_past_key_values, num_tokens_for_img)
+                    model_kwargs['input_ids'] = None
+                model_kwargs['position_ids'] = self.crop_position_ids_for_cache(model_kwargs['position_ids'], num_tokens_for_img)
+                model_kwargs['attention_mask'] = self.crop_attention_mask_for_cache(model_kwargs['attention_mask'], num_tokens_for_img)
+        return z

OmniGen/train.py ADDED Viewed

File without changes

OmniGen/transformer.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from huggingface_hub import snapshot_download
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers import Phi3Config, Phi3Model
+from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache, OffloadedCache
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Phi3Transformer(Phi3Model):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
+    We only modified the attention mask
+    Args:
+        config: Phi3Config
+    """
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        if attention_mask is not None and attention_mask.dim() == 3:
+            dtype = inputs_embeds.dtype
+            min_dtype = torch.finfo(dtype).min
+            attention_mask = (1 - attention_mask) * min_dtype
+            attention_mask = attention_mask.unsqueeze(1).to(inputs_embeds.dtype)
+        else:
+            raise
+            # causal_mask = self._update_causal_mask(
+            #     attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+            # )
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )

app.py CHANGED Viewed

@@ -1,154 +1,68 @@
 import gradio as gr
-import numpy as np
-import random
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
-import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
         height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )
-            run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
-            )
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
     )
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from PIL import Image
+import os
+# NOTE(review): hard-codes GPU index 7 for this process; presumably set before the
+# OmniGen import so it is in place when CUDA is initialised - confirm the index
+# exists on the deployment host.
+os.environ['CUDA_VISIBLE_DEVICES'] = '7'
+from OmniGen import OmniGenPipeline
+# Load the pipeline once at import time; generate_image reuses this instance.
+pipe = OmniGenPipeline.from_pretrained("shitao/tmp-preview")
+# 示例处理函数：生成图像
+def generate_image(text, img1, img2, img3, height, width, guidance_scale):
+    input_images = [img1, img2, img3]
+    # 去除 None
+    input_images = [img for img in input_images if img is not None]
+    if len(input_images) == 0:
+        input_images = None
+    output = pipe(
+        prompt=text,
+        input_images=input_images,
         height=height,
+        width=width,
+        guidance_scale=guidance_scale,
+        img_guidance_scale=1.6,
+        separate_cfg_infer=True,
+        use_kv_cache=False
+    )
+    img = output[0]
+    return img
+# Gradio interface: inputs in the left column, generated image in the right column
+with gr.Blocks() as demo:
+    gr.Markdown("## Text + Multiple Images to Image Generator")
+    with gr.Row():
+        with gr.Column():
+            # Text prompt input
+            prompt_input = gr.Textbox(label="Enter your prompt", placeholder="Type your prompt here...")
+            # Image upload inputs (type="filepath"); labels show the placeholder to use in the prompt
+            image_input_1 = gr.Image(label="<img><|image_1|></img>", type="filepath")
+            image_input_2 = gr.Image(label="<img><|image_2|></img>", type="filepath")
+            image_input_3 = gr.Image(label="<img><|image_3|></img>", type="filepath")
+            # Height and width sliders
+            height_input = gr.Slider(label="Height", minimum=256, maximum=2048, value=1024, step=16)
+            width_input = gr.Slider(label="Width", minimum=256, maximum=2048, value=1024, step=16)
+            # Guidance scale input
+            guidance_scale_input = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
+            # Generate button
+            generate_button = gr.Button("Generate Image")
+        with gr.Column():
+            # Output image display
+            output_image = gr.Image(label="Output Image")
+    # Button click event: input order must match generate_image's parameter order
+    generate_button.click(
+        generate_image,
+        inputs=[prompt_input, image_input_1, image_input_2, image_input_3, height_input, width_input, guidance_scale_input],
+        outputs=output_image
     )
+# Launch the app
+demo.launch()

edit.png ADDED Viewed

imgs/.DS_Store ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d65165279105ca6773180500688df4bdc69a2c7b771752f0a46ef120b7fd8ec3
+size 6148

imgs/test_cases/liuyifei.png ADDED Viewed

imgs/test_cases/taylor.png ADDED Viewed

imgs/test_cases/trump.png ADDED Viewed

imgs/test_cases/turing.png ADDED Viewed

inference.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

setup.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from setuptools import setup, find_packages
+with open("README.md", mode="r", encoding="utf-8") as readme_file:
+    readme = readme_file.read()
+setup(
+    name='OmniGen',
+    version='1.0.0',
+    description='OmniGen',
+    long_description=readme,
+    long_description_content_type="text/markdown",
+    author_email='[email protected]',
+    url='https://github.com/VectorSpaceLab/OmniGen',
+    packages=find_packages(),
+    include_package_data=True,
+    install_requires=[
+        'torch>=1.6.0',
+        'transformers>=4.41.0',
+        'datasets',
+        'accelerate>=0.20.1',
+        'diffusers>=0.30.3'
+    ],
+)