BAAI Space (running on L40S)

ryanzhangfan committed
Commit: 058e220
Parent: 542fa16

Update app.py

Files changed (1): app.py (+48, -60)
app.py CHANGED
@@ -17,24 +17,22 @@ from transformers.generation import (
 )
 import torch
 from emu3.mllm.processing_emu3 import Emu3Processor
-import spaces
 
 import io
 import base64
 
+subprocess.run(
+    "pip3 install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+
 def image2str(image):
     buf = io.BytesIO()
     image.save(buf, format="PNG")
     i_str = base64.b64encode(buf.getvalue()).decode()
     return f'<div style="float:left"><img src="data:image/png;base64, {i_str}"></div>'
 
-# Install flash attention, skipping CUDA build if necessary
-subprocess.run(
-    "pip install flash-attn --no-build-isolation",
-    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    shell=True,
-)
-
 print(gr.__version__)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
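Note on this hunk: the runtime flash-attn install now runs before any model code and calls pip3 instead of pip, and the ZeroGPU import (`import spaces`) is dropped. A possible hardening of that install, shown only as a sketch and not part of the commit: skip pip when flash_attn is already importable, merge the existing environment so PATH and CUDA variables survive, and fail loudly if the install errors out.

# Sketch only (not from app.py): guard the runtime flash-attn install.
import importlib.util
import os
import subprocess

if importlib.util.find_spec("flash_attn") is None:
    subprocess.run(
        "pip3 install flash-attn --no-build-isolation",
        # merge os.environ so PATH and CUDA settings are not wiped by the override
        env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
        shell=True,
        check=True,  # raise if the install fails instead of continuing silently
    )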
@@ -46,7 +44,6 @@ VQ_HUB = "BAAI/Emu3-VisionTokenizer"
 
 
 # uncomment to use gen model
-"""
 # Prepare models and processors
 # Emu3-Gen model and processor
 gen_model = AutoModelForCausalLM.from_pretrained(
@@ -55,7 +52,15 @@ gen_model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
     trust_remote_code=True,
-)
+).eval()
+
+chat_model = AutoModelForCausalLM.from_pretrained(
+    EMU_CHAT_HUB,
+    device_map="cpu",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+).eval()
 
 tokenizer = AutoTokenizer.from_pretrained(EMU_CHAT_HUB, trust_remote_code=True)
 image_processor = AutoImageProcessor.from_pretrained(
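Note on this hunk: both Emu3-Gen and Emu3-Chat are now loaded in eval mode on the CPU (device_map="cpu") and, as the later hunks show, each is moved onto the GPU only while it is generating. A rough, purely illustrative estimate of why keeping both resident on one 48 GB L40S would be tight, assuming the roughly 8B-parameter size of the Emu3 models:

# Back-of-the-envelope memory estimate (illustrative, not from app.py).
def weight_gib(num_params: float, bytes_per_param: int = 2) -> float:
    """Approximate weight footprint in GiB for a bfloat16 model."""
    return num_params * bytes_per_param / 1024**3

one_model = weight_gib(8e9)                    # ~14.9 GiB per ~8B-parameter model
print(f"one model:  {one_model:.1f} GiB")
print(f"two models: {2 * one_model:.1f} GiB")  # ~29.8 GiB before KV cache, activations, VQ tokenizer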
@@ -66,14 +71,12 @@ image_tokenizer = AutoModel.from_pretrained(
 ).eval()
 
 print(device)
-gen_model.to(device)
 image_tokenizer.to(device)
 
 processor = Emu3Processor(
     image_processor, image_tokenizer, tokenizer
 )
 
-@spaces.GPU(duration=300)
 def generate_image(prompt):
     POSITIVE_PROMPT = " masterpiece, film grained, best quality."
     NEGATIVE_PROMPT = (
@@ -104,6 +107,9 @@ def generate_image(prompt):
         top_k=2048,
     )
 
+    torch.cuda.empty_cache()
+    gen_model.to(device)
+
     h, w = pos_inputs.image_size[0]
     constrained_fn = processor.build_prefix_constrained_fn(h, w)
     logits_processor = LogitsProcessorList(
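This hunk, and the matching ones in vision_language_understanding further down, repeat the same move-to-GPU, generate, move-back-to-CPU pattern. One way that pattern could be factored out is sketched below; `on_gpu` is a hypothetical helper written for illustration, not code from the commit, and it assumes the models are kept on CPU between requests.

# Hypothetical refactor sketch (not part of the commit).
from contextlib import contextmanager

import torch

@contextmanager
def on_gpu(model, device):
    torch.cuda.empty_cache()   # reclaim memory left over from the previous request
    model.to(device)
    try:
        yield model
    finally:
        model.cpu()            # park the weights back in host memory
        torch.cuda.empty_cache()

# Usage sketch:
# with on_gpu(gen_model, device):
#     outputs = gen_model.generate(...)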
@@ -128,54 +134,17 @@ def generate_image(prompt):
     )
 
     mm_list = processor.decode(outputs[0])
+    result = None
     for idx, im in enumerate(mm_list):
         if isinstance(im, Image.Image):
-            return im
-    return None
+            result = im
+            break
 
-def chat(history, user_input, user_image):
-    if user_image is not None:
-        history = history + [(image2str(user_image) + "<br>" + user_input, "Sorry, gen model do not accept image input")]
-    else:
-        # Use Emu3-Gen for image generation
-        generated_image = generate_image(user_input)
-        if generated_image is not None:
-            # Append the user input and generated image to the history
-            history = history + [(user_input, image2str(generated_image))]
-        else:
-            # If image generation failed, respond with an error message
-            history = history + [
-                (user_input, "Sorry, I could not generate an image.")
-            ]
-    return history, history, gr.update(value=None)
-"""
+    gen_model.cpu()
+    torch.cuda.empty_cache()
+
+    return result
 
-# Emu3-Chat model and processor
-chat_model = AutoModelForCausalLM.from_pretrained(
-    EMU_CHAT_HUB,
-    device_map="cpu",
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-)
-
-tokenizer = AutoTokenizer.from_pretrained(EMU_CHAT_HUB, trust_remote_code=True)
-image_processor = AutoImageProcessor.from_pretrained(
-    VQ_HUB, trust_remote_code=True
-)
-image_tokenizer = AutoModel.from_pretrained(
-    VQ_HUB, device_map="cpu", trust_remote_code=True
-).eval()
-
-print(device)
-chat_model.to(device)
-image_tokenizer.to(device)
-
-processor = Emu3Processor(
-    image_processor, image_tokenizer, tokenizer
-)
-
-@spaces.GPU
 def vision_language_understanding(image, text):
     inputs = processor(
         text=text,
@@ -194,6 +163,9 @@ def vision_language_understanding(image, text):
         max_new_tokens=320,
     )
 
+    torch.cuda.empty_cache()
+    chat_model.to(device)
+
     # Generate
     outputs = chat_model.generate(
         inputs.input_ids.to(device),
@@ -203,8 +175,13 @@ def vision_language_understanding(image, text):
 
     outputs = outputs[:, inputs.input_ids.shape[-1] :]
     response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+    chat_model.cpu()
+    torch.cuda.empty_cache()
+
     return response
 
+
 def chat(history, user_input, user_image):
     if user_image is not None:
         # Use Emu3-Chat for vision-language understanding
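For reference, the slice kept as context above drops the prompt tokens before decoding, since generate() returns prompt plus completion ids. A standalone toy illustration, with made-up tensors that are not from app.py:

# Toy example of the prompt-stripping slice (illustrative only).
import torch

prompt_ids = torch.tensor([[101, 102, 103]])        # stand-in prompt token ids
outputs = torch.tensor([[101, 102, 103, 7, 8, 9]])  # stand-in generate() output
new_tokens = outputs[:, prompt_ids.shape[-1]:]
print(new_tokens)  # tensor([[7, 8, 9]]) -- only the completion gets decoded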
@@ -212,21 +189,32 @@ def chat(history, user_input, user_image):
         # Append the user input and response to the history
         history = history + [(image2str(user_image) + "<br>" + user_input, response)]
     else:
-        history = history + [(user_input, "Sorry, please specify a valid image for vl understanding.")]
+        # Use Emu3-Gen for image generation
+        generated_image = generate_image(user_input)
+        if generated_image is not None:
+            # Append the user input and generated image to the history
+            history = history + [(user_input, image2str(generated_image))]
+        else:
+            # If image generation failed, respond with an error message
+            history = history + [
+                (user_input, "Sorry, I could not generate an image.")
+            ]
 
     return history, history, gr.update(value=None)
 
-# uncomment to here to disable chat
-# """
-
+
 def clear_input():
     return gr.update(value="")
 
+
 with gr.Blocks() as demo:
     gr.Markdown("# Emu3 Chatbot Demo")
     gr.Markdown(
         "This is a chatbot demo for image generation and vision-language understanding using Emu3 models."
     )
+    gr.Markdown(
+        "Please pass only text input for image generation and both image and text for vision-language understanding"
+    )
 
     chatbot = gr.Chatbot()
     state = gr.State([])
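With this last hunk, chat() routes text-only messages to generate_image and image-plus-text messages to vision_language_understanding. The diff ends at `state = gr.State([])`, so the component wiring below is only a sketch of how such a function is typically hooked up in gr.Blocks; the component names (user_input, user_image, send_btn, demo_sketch) are assumptions, not code from app.py.

# Hypothetical wiring sketch (component names are assumptions, not from app.py).
import gradio as gr

with gr.Blocks() as demo_sketch:
    chatbot = gr.Chatbot()
    state = gr.State([])
    user_image = gr.Image(type="pil", label="Optional image")
    user_input = gr.Textbox(label="Prompt")
    send_btn = gr.Button("Send")

    # chat(history, user_input, user_image) returns (history, history, cleared textbox),
    # which maps onto (chatbot, state, user_input) below.
    send_btn.click(
        chat,
        inputs=[state, user_input, user_image],
        outputs=[chatbot, state, user_input],
    )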
 