import gradio as gr
from PIL import Image
import os
import spaces
from OmniGen import OmniGenPipeline
pipe = OmniGenPipeline.from_pretrained(
"Shitao/OmniGen-v1"
)
@spaces.GPU(duration=300)
def generate_image(text, img1, img2, img3, height, width, guidance_scale, img_guidance_scale, inference_steps, seed, separate_cfg_infer, offload_model,
use_input_image_size_as_output):
input_images = [img1, img2, img3]
# Drop None entries (unused image slots)
input_images = [img for img in input_images if img is not None]
if len(input_images) == 0:
input_images = None
output = pipe(
prompt=text,
input_images=input_images,
height=height,
width=width,
guidance_scale=guidance_scale,
img_guidance_scale=img_guidance_scale,
num_inference_steps=inference_steps,
separate_cfg_infer=separate_cfg_infer,
use_kv_cache=True,
offload_kv_cache=True,
offload_model=offload_model,
use_input_image_size_as_output=use_input_image_size_as_output,
seed=seed,
)
img = output[0]
return img
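
# Example direct call (a hypothetical invocation mirroring the first demo case below;
# arguments follow the positional order of generate_image's signature):
#   generate_image(
#       "A curly-haired man in a red shirt is drinking tea.",
#       None, None, None,    # img1, img2, img3
#       1024, 1024,          # height, width
#       2.5, 1.6,            # guidance_scale, img_guidance_scale
#       50, 0,               # inference_steps, seed
#       True, False, False,  # separate_cfg_infer, offload_model, use_input_image_size_as_output
#   )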
def get_example():
case = [
[
"A curly-haired man in a red shirt is drinking tea.",
None,
None,
None,
1024,
1024,
2.5,
1.6,
50,
0,
True,
False,
False,
],
[
"The woman in <img><|image_1|></img> waves her hand happily in the crowd",
"./imgs/test_cases/zhang.png",
None,
None,
1024,
1024,
2.5,
1.9,
50,
128,
True,
False,
False,
],
[
"A man in a black shirt is reading a book. The man is the right man in <img><|image_1|></img>.",
"./imgs/test_cases/two_man.jpg",
None,
None,
1024,
1024,
2.5,
1.6,
50,
0,
True,
False,
False,
],
[
"Two woman are raising fried chicken legs in a bar. A woman is <img><|image_1|></img>. The other woman is <img><|image_2|></img>.",
"./imgs/test_cases/mckenna.jpg",
"./imgs/test_cases/Amanda.jpg",
None,
1024,
1024,
2.5,
1.8,
50,
168,
True,
False,
False,
],
[
"A man and a short-haired woman with a wrinkled face are standing in front of a bookshelf in a library. The man is the man in the middle of <img><|image_1|></img>, and the woman is oldest woman in <img><|image_2|></img>",
"./imgs/test_cases/1.jpg",
"./imgs/test_cases/2.jpg",
None,
1024,
1024,
2.5,
1.6,
50,
60,
True,
False,
False,
],
[
"A man and a woman are sitting at a classroom desk. The man is the man with yellow hair in <img><|image_1|></img>. The woman is the woman on the left of <img><|image_2|></img>",
"./imgs/test_cases/3.jpg",
"./imgs/test_cases/4.jpg",
None,
1024,
1024,
2.5,
1.8,
50,
66,
True,
False,
False,
],
[
"The flower <img><|image_1|><\/img> is placed in the vase which is in the middle of <img><|image_2|><\/img> on a wooden table of a living room",
"./imgs/test_cases/rose.jpg",
"./imgs/test_cases/vase.jpg",
None,
1024,
1024,
2.5,
1.6,
50,
0,
True,
False,
False,
],
[
"<img><|image_1|><img>\n Remove the woman's earrings. Replace the mug with a clear glass filled with sparkling iced cola.",
"./imgs/demo_cases/t2i_woman_with_book.png",
None,
None,
None,
None,
2.5,
1.6,
50,
222,
True,
False,
True,
],
[
"Detect the skeleton of human in this image: <img><|image_1|></img>.",
"./imgs/test_cases/control.jpg",
None,
None,
None,
None,
2.0,
1.6,
50,
0,
True,
False,
True,
],
[
"Generate a new photo using the following picture and text as conditions: <img><|image_1|><img>\n A young boy is sitting on a sofa in the library, holding a book. His hair is neatly combed, and a faint smile plays on his lips, with a few freckles scattered across his cheeks. The library is quiet, with rows of shelves filled with books stretching out behind him.",
"./imgs/demo_cases/skeletal.png",
None,
None,
None,
None,
2,
1.6,
50,
42,
True,
False,
True,
],
[
"Following the pose of this image <img><|image_1|><img>, generate a new photo: A young boy is sitting on a sofa in the library, holding a book. His hair is neatly combed, and a faint smile plays on his lips, with a few freckles scattered across his cheeks. The library is quiet, with rows of shelves filled with books stretching out behind him.",
"./imgs/demo_cases/edit.png",
None,
None,
None,
None,
2.0,
1.6,
50,
123,
True,
False,
True,
],
[
"Following the depth mapping of this image <img><|image_1|><img>, generate a new photo: A young girl is sitting on a sofa in the library, holding a book. His hair is neatly combed, and a faint smile plays on his lips, with a few freckles scattered across his cheeks. The library is quiet, with rows of shelves filled with books stretching out behind him.",
"./imgs/demo_cases/edit.png",
None,
None,
None,
None,
2.0,
1.6,
50,
1,
True,
False,
True,
],
[
"<img><|image_1|><\/img> What item can be used to see the current time? Please remove it.",
"./imgs/test_cases/watch.jpg",
None,
None,
None,
None,
2.5,
1.6,
50,
0,
True,
False,
True,
],
[
"According to the following examples, generate an output for the input.\nInput: <img><|image_1|></img>\nOutput: <img><|image_2|></img>\n\nInput: <img><|image_3|></img>\nOutput: ",
"./imgs/test_cases/icl1.jpg",
"./imgs/test_cases/icl2.jpg",
"./imgs/test_cases/icl3.jpg",
224,
224,
2.5,
1.6,
50,
1,
True,
False,
False,
],
]
return case
def run_for_examples(text, img1, img2, img3, height, width, guidance_scale, img_guidance_scale, inference_steps, seed, separate_cfg_infer, offload_model,
use_input_image_size_as_output):
return generate_image(text, img1, img2, img3, height, width, guidance_scale, img_guidance_scale, inference_steps, seed, separate_cfg_infer, offload_model,
use_input_image_size_as_output)
description = """
OmniGen is a unified image generation model that you can use to perform various tasks, including but not limited to text-to-image generation, subject-driven generation, Identity-Preserving Generation, and image-conditioned generation.
For multi-modal to image generation, you should pass a string as `prompt`, and a list of image paths as `input_images`. The placeholder in the prompt should be in the format of `<img><|image_*|></img>` (for the first image, the placeholder is <img><|image_1|></img>. for the second image, the the placeholder is <img><|image_2|></img>).
For example, use an image of a woman to generate a new image:
prompt = "A woman holds a bouquet of flowers and faces the camera. Thw woman is \<img\>\<|image_1|\>\</img\>."
Tips:
- To reduce memory usage or inference time, refer to [./docs/inference.md#requiremented-resources](https://github.com/VectorSpaceLab/OmniGen/blob/main/docs/inference.md#requiremented-resources) to select an appropriate setting.
- Oversaturated: if the image appears oversaturated, reduce the `guidance_scale`.
- Does not match the prompt: if the image does not match the prompt, try increasing the `guidance_scale`.
- Low quality: a more detailed prompt leads to better results.
- Anime style: if the generated image looks like an anime drawing, try adding `photo` to the prompt.
- Editing a generated image: if you generate an image with OmniGen and then want to edit it, do not reuse the generation seed for the edit. For example, if the image was generated with seed=0, edit it with seed=1.
- For image editing tasks, we recommend placing the image before the editing instruction, e.g. `<img><|image_1|></img> remove suit` rather than `remove suit <img><|image_1|></img>`.
- For image editing and ControlNet-style tasks, we recommend setting the output height and width to match the input image. For example, to edit a 512x512 image, set the output height and width to 512x512. You can also enable `use_input_image_size_as_output` to do this automatically.
"""
article = """
---
**Citation**
<br>
If you find this repository useful, please consider giving it a star ⭐ and a citation:
```
@article{xiao2024omnigen,
title={Omnigen: Unified image generation},
author={Xiao, Shitao and Wang, Yueze and Zhou, Junjie and Yuan, Huaying and Xing, Xingrun and Yan, Ruiran and Wang, Shuting and Huang, Tiejun and Liu, Zheng},
journal={arXiv preprint arXiv:2409.11340},
year={2024}
}
```
**Contact**
<br>
If you have any questions, please feel free to open an issue or reach out to us directly via email.
"""
# Gradio
with gr.Blocks() as demo:
gr.Markdown("# OmniGen: Unified Image Generation [paper](https://arxiv.org/abs/2409.11340) [code](https://github.com/VectorSpaceLab/OmniGen)")
gr.Markdown(description)
with gr.Row():
with gr.Column():
# text prompt
prompt_input = gr.Textbox(
label="Enter your prompt, use <img><|image_i|></img> to represent i-th input image", placeholder="Type your prompt here..."
)
with gr.Row(equal_height=True):
# input images
image_input_1 = gr.Image(label="<img><|image_1|></img>", type="filepath")
image_input_2 = gr.Image(label="<img><|image_2|></img>", type="filepath")
image_input_3 = gr.Image(label="<img><|image_3|></img>", type="filepath")
# slider
height_input = gr.Slider(
label="Height", minimum=128, maximum=2048, value=1024, step=16
)
width_input = gr.Slider(
label="Width", minimum=128, maximum=2048, value=1024, step=16
)
guidance_scale_input = gr.Slider(
label="Guidance Scale", minimum=1.0, maximum=5.0, value=2.5, step=0.1
)
img_guidance_scale_input = gr.Slider(
label="img_guidance_scale", minimum=1.0, maximum=2.0, value=1.6, step=0.1
)
num_inference_steps = gr.Slider(
label="Inference Steps", minimum=1, maximum=100, value=50, step=1
)
seed_input = gr.Slider(
label="Seed", minimum=0, maximum=2147483647, value=42, step=1
)
separate_cfg_infer = gr.Checkbox(
label="separate_cfg_infer", info="Whether to use separate inference process for different guidance. This will reduce the memory cost.", value=True,
)
offload_model = gr.Checkbox(
label="offload_model", info="Offload model to CPU, which will significantly reduce the memory cost but slow down the generation speed. You can cancle separate_cfg_infer and set offload_model=True. If both separate_cfg_infer and offload_model be True, further reduce the memory, but slowest generation", value=False,
)
use_input_image_size_as_output = gr.Checkbox(
label="use_input_image_size_as_output", info="Automatically adjust the output image size to be same as input image size. For editing and controlnet task, it can make sure the output image has the same size with input image leading to better performance", value=False,
)
# generate
generate_button = gr.Button("Generate Image")
with gr.Column():
# output image
output_image = gr.Image(label="Output Image")
# click
generate_button.click(
generate_image,
inputs=[
prompt_input,
image_input_1,
image_input_2,
image_input_3,
height_input,
width_input,
guidance_scale_input,
img_guidance_scale_input,
num_inference_steps,
seed_input,
separate_cfg_infer,
offload_model,
use_input_image_size_as_output,
],
outputs=output_image,
)
gr.Examples(
examples=get_example(),
fn=run_for_examples,
inputs=[
prompt_input,
image_input_1,
image_input_2,
image_input_3,
height_input,
width_input,
guidance_scale_input,
img_guidance_scale_input,
num_inference_steps,
seed_input,
separate_cfg_infer,
offload_model,
use_input_image_size_as_output,
],
outputs=output_image,
)
gr.Markdown(article)
# launch
demo.launch() |