Update lvd_pipeline.py
lvd_pipeline.py  CHANGED  (+14 -10)
@@ -742,15 +742,18 @@ class GroundedTextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
         lvd_gligen_phrases_frame = lvd_gligen_phrases_frame[:max_objs]
         lvd_gligen_boxes_frame = lvd_gligen_boxes_frame[:max_objs]
 
-
-
-
-
-
-
-
-
-
+        n_objs = len(lvd_gligen_boxes_frame)
+
+        if n_objs:
+            # prepare batched input to the PositionNet (boxes, phrases, mask)
+            # Get tokens for phrases from pre-trained CLIPTokenizer
+            tokenizer_inputs = self.tokenizer(
+                lvd_gligen_phrases_frame, padding=True, return_tensors="pt").to(device)
+            # For the token, we use the same pre-trained text encoder
+            # to obtain its text feature
+            _text_embeddings = self.text_encoder(
+                **tokenizer_inputs).pooler_output
+
         # For each entity, described in phrases, is denoted with a bounding box,
         # we represent the location information as (xmin,ymin,xmax,ymax)
         boxes = torch.zeros(max_objs, 4, device=device,
@@ -759,7 +762,8 @@ class GroundedTextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
         text_embeddings = torch.zeros(
             max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
         )
-
+        if n_objs:
+            text_embeddings[:n_objs] = _text_embeddings
         # Generate a mask for each object that is entity described by phrases
         masks = torch.zeros(max_objs, device=device,
                             dtype=self.text_encoder.dtype)
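In short, the change guards the per-frame grounding preparation against frames that contain no boxes: the CLIP tokenizer and text encoder are only called when n_objs is non-zero, and only the first n_objs of the max_objs zero-padded slots are filled. Below is a minimal standalone sketch of that padding pattern. It is an illustration, not the pipeline's actual method: prepare_frame_grounding_inputs, tokenizer, text_encoder, and embed_dim are stand-ins for the pipeline's self.tokenizer, self.text_encoder, and self.unet.cross_attention_dim, and the masks[:n_objs] = 1 step is an assumption about what happens after the shown hunks, not part of this diff.

import torch

def prepare_frame_grounding_inputs(
    phrases, boxes_frame, max_objs, embed_dim,
    tokenizer, text_encoder, device="cpu", dtype=torch.float32,
):
    # Keep at most max_objs entities per frame, as in the diff above.
    phrases = phrases[:max_objs]
    boxes_frame = boxes_frame[:max_objs]
    n_objs = len(boxes_frame)

    # Fixed-size, zero-padded buffers: one slot per possible object.
    # Boxes are (xmin, ymin, xmax, ymax) in normalized coordinates.
    boxes = torch.zeros(max_objs, 4, device=device, dtype=dtype)
    text_embeddings = torch.zeros(max_objs, embed_dim, device=device, dtype=dtype)
    masks = torch.zeros(max_objs, device=device, dtype=dtype)

    if n_objs:
        # Only call the tokenizer/encoder when the frame actually has objects;
        # an empty phrase list would otherwise make this call fail.
        tokenizer_inputs = tokenizer(phrases, padding=True, return_tensors="pt").to(device)
        phrase_embeddings = text_encoder(**tokenizer_inputs).pooler_output

        # Fill only the first n_objs slots; the rest stay zero as padding.
        boxes[:n_objs] = torch.tensor(boxes_frame, device=device, dtype=dtype)
        text_embeddings[:n_objs] = phrase_embeddings.to(dtype)
        masks[:n_objs] = 1  # assumed convention: mark which slots hold real objects

    return boxes, text_embeddings, masks

The point of the guard is that a frame with zero grounded objects now skips the tokenizer and text-encoder calls entirely and simply passes all-zero boxes, embeddings, and masks downstream, instead of erroring on an empty phrase list.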