
Commit 66ecdd5 (parent: 0f8e8b9), committed by ryanzhangfan

Update app.py

Files changed (1): app.py (+12, -21)
app.py CHANGED
@@ -39,15 +39,6 @@ gen_model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
 
-gen_tokenizer = AutoTokenizer.from_pretrained(EMU_GEN_HUB, trust_remote_code=True)
-gen_image_processor = AutoImageProcessor.from_pretrained(
-    VQ_HUB, trust_remote_code=True
-)
-gen_image_tokenizer = AutoModel.from_pretrained(
-    VQ_HUB, device_map="cuda:0", trust_remote_code=True
-).eval()
-gen_processor = Emu3Processor(gen_image_processor, gen_image_tokenizer, gen_tokenizer)
-
 # Emu3-Chat model and processor
 chat_model = AutoModelForCausalLM.from_pretrained(
     EMU_CHAT_HUB,
@@ -57,18 +48,18 @@ chat_model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
 
-chat_tokenizer = AutoTokenizer.from_pretrained(EMU_CHAT_HUB, trust_remote_code=True)
-chat_image_processor = AutoImageProcessor.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(EMU_CHAT_HUB, trust_remote_code=True)
+image_processor = AutoImageProcessor.from_pretrained(
     VQ_HUB, trust_remote_code=True
 )
-chat_image_tokenizer = AutoModel.from_pretrained(
+image_tokenizer = AutoModel.from_pretrained(
     VQ_HUB, device_map="cuda:0", trust_remote_code=True
 ).eval()
-chat_processor = Emu3Processor(
-    chat_image_processor, chat_image_tokenizer, chat_tokenizer
+processor = Emu3Processor(
+    image_processor, image_tokenizer, tokenizer
 )
 
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=300)
 def generate_image(prompt):
     POSITIVE_PROMPT = " masterpiece, film grained, best quality."
     NEGATIVE_PROMPT = (
@@ -86,8 +77,8 @@ def generate_image(prompt):
         image_area=gen_model.config.image_area,
         return_tensors="pt",
     )
-    pos_inputs = gen_processor(text=full_prompt, **kwargs)
-    neg_inputs = gen_processor(text=NEGATIVE_PROMPT, **kwargs)
+    pos_inputs = processor(text=full_prompt, **kwargs)
+    neg_inputs = processor(text=NEGATIVE_PROMPT, **kwargs)
 
     # Prepare hyperparameters
     GENERATION_CONFIG = GenerationConfig(
@@ -100,7 +91,7 @@ def generate_image(prompt):
     )
 
     h, w = pos_inputs.image_size[0]
-    constrained_fn = gen_processor.build_prefix_constrained_fn(h, w)
+    constrained_fn = processor.build_prefix_constrained_fn(h, w)
     logits_processor = LogitsProcessorList(
         [
             UnbatchedClassifierFreeGuidanceLogitsProcessor(
@@ -122,14 +113,14 @@ def generate_image(prompt):
         logits_processor=logits_processor,
     )
 
-    mm_list = gen_processor.decode(outputs[0])
+    mm_list = processor.decode(outputs[0])
     for idx, im in enumerate(mm_list):
         if isinstance(im, Image.Image):
             return im
     return None
 
 def vision_language_understanding(image, text):
-    inputs = chat_processor(
+    inputs = processor(
         text=text,
         image=image,
         mode="U",
@@ -154,7 +145,7 @@ def vision_language_understanding(image, text):
     )
 
     outputs = outputs[:, inputs.input_ids.shape[-1] :]
-    response = chat_processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     return response
 
 def chat(history, user_input, user_image):
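The substantive change is deduplication: the Space previously built two identical processor stacks (gen_* and chat_*), each loading its own tokenizer, image processor, and VQ image tokenizer; after this commit a single Emu3Processor serves both generate_image and vision_language_understanding. A minimal sketch of the resulting setup, with the hub IDs and the Emu3Processor import path written out as assumptions (in app.py they come from constants and the Space's bundled model code):

from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
from emu3.mllm.processing_emu3 import Emu3Processor  # assumed import path

EMU_CHAT_HUB = "BAAI/Emu3-Chat"       # assumed value of the constant
VQ_HUB = "BAAI/Emu3-VisionTokenizer"  # assumed value of the constant

# One text tokenizer, one vision preprocessor, one VQ image tokenizer:
# the chat tokenizer now also serves the generation path.
tokenizer = AutoTokenizer.from_pretrained(EMU_CHAT_HUB, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(VQ_HUB, trust_remote_code=True)
image_tokenizer = AutoModel.from_pretrained(
    VQ_HUB, device_map="cuda:0", trust_remote_code=True
).eval()

# Shared by both the generation ("G") and understanding ("U") code paths.
processor = Emu3Processor(image_processor, image_tokenizer, tokenizer)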
 
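The other behavioral change is the ZeroGPU time budget on generate_image: duration goes from 120 to 300. In the spaces library, duration sets how many seconds the decorated function may hold its allocated GPU per call; autoregressive image generation decodes thousands of vision tokens, so the larger window presumably keeps long generations from being cut off. A minimal usage sketch:

import spaces  # Hugging Face ZeroGPU helper, available in the Spaces runtime

@spaces.GPU(duration=300)  # per-call GPU allocation window, in seconds
def generate_image(prompt: str):
    # The body runs with the GPU attached; ZeroGPU reclaims it afterwards.
    ...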
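For readers tracing the renamed lines: constrained_fn and the LogitsProcessorList implement classifier-free guidance over the paired positive/negative prompts, plus a prefix constraint that keeps sampling on tokens valid for an h-by-w image grid. The hunk cuts the list off mid-construction; below is a sketch of how these two stock transformers processors are typically combined, continuing the names from the diff (gen_model, neg_inputs, constrained_fn), with the guidance scale as an assumed value:

from transformers import (
    LogitsProcessorList,
    PrefixConstrainedLogitsProcessor,
    UnbatchedClassifierFreeGuidanceLogitsProcessor,
)

# constrained_fn comes from processor.build_prefix_constrained_fn(h, w) and
# limits each decoding step to tokens legal at that position of the grid.
logits_processor = LogitsProcessorList(
    [
        UnbatchedClassifierFreeGuidanceLogitsProcessor(
            3.0,        # guidance scale: assumed, not shown in the hunk
            gen_model,  # the unconditional branch reuses the same model
            unconditional_ids=neg_inputs.input_ids.to("cuda:0"),
        ),
        PrefixConstrainedLogitsProcessor(constrained_fn, num_beams=1),
    ]
)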