Spaces: Running on Zero
myniu committed
Commit 43ba5db • 1 Parent(s): e9f1b91

init

Browse files:
- app.py  +144 −132
- oldapp.py → modifiedapp.py  +135 −147
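
Since the Space runs on ZeroGPU ("Running on Zero"), both versions of the app in the diff below wrap GPU-bound work in functions decorated with `spaces.GPU`. As orientation before reading the diff, here is a minimal sketch of that decorator pattern only; the function name and body are placeholders, not the app's actual code:

```python
import spaces  # Hugging Face ZeroGPU helper available inside the Space
import torch

@spaces.GPU(duration=100)  # a GPU is attached only while this function runs
def heavy_inference():
    # Placeholder body: the real app loads and runs the MOFA-Video pipeline here.
    return torch.zeros(1, device="cuda")
```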
app.py
CHANGED
@@ -89,6 +89,79 @@ def get_sparseflow_and_mask_forward(
|
|
89 |
return s_flow, mask
|
90 |
|
91 |
92 |
def interpolate_trajectory(points, n_points):
|
93 |
x = [point[0] for point in points]
|
94 |
y = [point[1] for point in points]
|
@@ -142,110 +215,22 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
|
|
142 |
return trajectory_maps, transparent_layer
|
143 |
|
144 |
|
145 |
-
with gr.Blocks() as demo:
|
146 |
-
gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
|
147 |
-
|
148 |
-
gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
|
149 |
-
|
150 |
-
gr.Markdown(
|
151 |
-
"""
|
152 |
-
During the inference, kindly follow these instructions:
|
153 |
-
<br>
|
154 |
-
1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
|
155 |
-
2. Proceed to draw trajectories: <br>
|
156 |
-
2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
|
157 |
-
2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
|
158 |
-
2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
|
159 |
-
2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
|
160 |
-
2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
|
161 |
-
3. Click the "Run" button to animate the image according to the path. <br>
|
162 |
-
"""
|
163 |
-
)
|
164 |
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
-
pipeline, cmp = None, None
|
168 |
|
169 |
-
|
170 |
-
|
171 |
-
motion_brush_points = gr.State([])
|
172 |
-
motion_brush_mask = gr.State()
|
173 |
-
motion_brush_viz = gr.State()
|
174 |
-
inference_batch_size = gr.State(1)
|
175 |
|
176 |
-
|
177 |
-
|
178 |
|
179 |
-
|
180 |
-
from pipeline.pipeline import FlowControlNetPipeline
|
181 |
-
from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
|
182 |
-
|
183 |
-
print('start loading models...')
|
184 |
-
# Load scheduler, tokenizer and models.
|
185 |
-
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
186 |
-
pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
|
187 |
-
)
|
188 |
-
vae = AutoencoderKLTemporalDecoder.from_pretrained(
|
189 |
-
pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
|
190 |
-
unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
|
191 |
-
pretrained_model_name_or_path,
|
192 |
-
subfolder="unet",
|
193 |
-
low_cpu_mem_usage=True,
|
194 |
-
variant="fp16",
|
195 |
-
)
|
196 |
-
|
197 |
-
controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
|
198 |
-
|
199 |
-
cmp = CMP_demo(
|
200 |
-
'./models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
|
201 |
-
42000
|
202 |
-
).to(device)
|
203 |
-
cmp.requires_grad_(False)
|
204 |
-
|
205 |
-
# Freeze vae and image_encoder
|
206 |
-
vae.requires_grad_(False)
|
207 |
-
image_encoder.requires_grad_(False)
|
208 |
-
unet.requires_grad_(False)
|
209 |
-
controlnet.requires_grad_(False)
|
210 |
-
|
211 |
-
# Move image_encoder and vae to gpu and cast to weight_dtype
|
212 |
-
image_encoder.to(device, dtype=weight_dtype)
|
213 |
-
vae.to(device, dtype=weight_dtype)
|
214 |
-
unet.to(device, dtype=weight_dtype)
|
215 |
-
controlnet.to(device, dtype=weight_dtype)
|
216 |
-
|
217 |
-
if enable_xformers_memory_efficient_attention:
|
218 |
-
if is_xformers_available():
|
219 |
-
import xformers
|
220 |
-
|
221 |
-
xformers_version = version.parse(xformers.__version__)
|
222 |
-
if xformers_version == version.parse("0.0.16"):
|
223 |
-
print(
|
224 |
-
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
225 |
-
)
|
226 |
-
unet.enable_xformers_memory_efficient_attention()
|
227 |
-
else:
|
228 |
-
raise ValueError(
|
229 |
-
"xformers is not available. Make sure it is installed correctly")
|
230 |
-
|
231 |
-
if allow_tf32:
|
232 |
-
torch.backends.cuda.matmul.allow_tf32 = True
|
233 |
-
|
234 |
-
pipeline = FlowControlNetPipeline.from_pretrained(
|
235 |
-
pretrained_model_name_or_path,
|
236 |
-
unet=unet,
|
237 |
-
controlnet=controlnet,
|
238 |
-
image_encoder=image_encoder,
|
239 |
-
vae=vae,
|
240 |
-
torch_dtype=weight_dtype,
|
241 |
-
)
|
242 |
-
pipeline = pipeline.to(device)
|
243 |
-
|
244 |
-
print('models loaded.')
|
245 |
-
|
246 |
-
return pipeline, cmp
|
247 |
-
|
248 |
-
def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):
|
249 |
|
250 |
'''
|
251 |
frames: [b, 13, 3, 384, 384] (0, 1) tensor
|
@@ -270,19 +255,19 @@ with gr.Blocks() as demo:
|
|
270 |
return cmp_flow
|
271 |
|
272 |
|
273 |
-
def get_flow(pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
|
274 |
|
275 |
fb, fl, fc, _, _ = pixel_values_384.shape
|
276 |
|
277 |
-
controlnet_flow = get_cmp_flow(
|
278 |
pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
|
279 |
sparse_optical_flow_384,
|
280 |
mask_384, motion_brush_mask
|
281 |
)
|
282 |
|
283 |
-
if height != 384 or width != 384:
|
284 |
-
scales = [height / 384, width / 384]
|
285 |
-
controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (height, width), mode='nearest').reshape(fb, fl, 2, height, width)
|
286 |
controlnet_flow[:, :, 0] *= scales[1]
|
287 |
controlnet_flow[:, :, 1] *= scales[0]
|
288 |
|
@@ -290,7 +275,7 @@ with gr.Blocks() as demo:
|
|
290 |
|
291 |
|
292 |
@torch.no_grad()
|
293 |
-
def forward_sample(input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
|
294 |
'''
|
295 |
input_drag: [1, 13, 320, 576, 2]
|
296 |
input_drag_384: [1, 13, 384, 384, 2]
|
@@ -322,22 +307,22 @@ with gr.Blocks() as demo:
|
|
322 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
323 |
|
324 |
if in_mask_flag:
|
325 |
-
flow_inmask = get_flow(
|
326 |
input_first_frame_384,
|
327 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask
|
328 |
)
|
329 |
else:
|
330 |
fb, fl = mask_384_inmask.shape[:2]
|
331 |
-
flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
332 |
|
333 |
if out_mask_flag:
|
334 |
-
flow_outmask = get_flow(
|
335 |
input_first_frame_384,
|
336 |
input_drag_384_outmask, mask_384_outmask
|
337 |
)
|
338 |
else:
|
339 |
fb, fl = mask_384_outmask.shape[:2]
|
340 |
-
flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
341 |
|
342 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
343 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
@@ -383,16 +368,16 @@ with gr.Blocks() as demo:
|
|
383 |
|
384 |
@spaces.GPU
|
385 |
@torch.no_grad()
|
386 |
-
def get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path):
|
387 |
|
388 |
-
original_width, original_height = width, height
|
389 |
|
390 |
input_all_points = tracking_points.constructor_args['value']
|
391 |
|
392 |
if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
|
393 |
return np.uint8(np.ones((original_width, original_height, 3))*255)
|
394 |
|
395 |
-
resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
|
396 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
397 |
|
398 |
new_resized_all_points = []
|
@@ -470,22 +455,22 @@ with gr.Blocks() as demo:
|
|
470 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
471 |
|
472 |
if in_mask_flag:
|
473 |
-
flow_inmask = get_flow(
|
474 |
input_first_frame_384,
|
475 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
|
476 |
)
|
477 |
else:
|
478 |
fb, fl = mask_384_inmask.shape[:2]
|
479 |
-
flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
480 |
|
481 |
if out_mask_flag:
|
482 |
-
flow_outmask = get_flow(
|
483 |
input_first_frame_384,
|
484 |
input_drag_384_outmask, mask_384_outmask
|
485 |
)
|
486 |
else:
|
487 |
fb, fl = mask_384_outmask.shape[:2]
|
488 |
-
flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
489 |
|
490 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
491 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
@@ -498,12 +483,12 @@ with gr.Blocks() as demo:
|
|
498 |
return viz_esti_flows
|
499 |
|
500 |
@spaces.GPU(duration=200)
|
501 |
-
def run(first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
|
502 |
|
503 |
-
original_width, original_height = width, height
|
504 |
|
505 |
input_all_points = tracking_points.constructor_args['value']
|
506 |
-
resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
|
507 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
508 |
|
509 |
new_resized_all_points = []
|
@@ -556,9 +541,9 @@ with gr.Blocks() as demo:
|
|
556 |
id = base.split('_')[0]
|
557 |
|
558 |
image_pil = image2pil(first_frame_path)
|
559 |
-
image_pil = image_pil.resize((width, height), Image.BILINEAR).convert('RGB')
|
560 |
|
561 |
-
visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, width, height)
|
562 |
|
563 |
motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
|
564 |
visualized_drag = visualized_drag[0].convert('RGBA')
|
@@ -581,7 +566,7 @@ with gr.Blocks() as demo:
|
|
581 |
first_frames = outputs['logits_imgs'][:, -1]
|
582 |
|
583 |
|
584 |
-
outputs = forward_sample(
|
585 |
input_drag_384_inmask.to('cuda'),
|
586 |
input_drag_384_outmask.to('cuda'),
|
587 |
first_frames.to('cuda'),
|
@@ -644,16 +629,43 @@ with gr.Blocks() as demo:
|
|
644 |
|
645 |
return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
|
646 |
|
647 |
-
@spaces.GPU(duration=100)
|
648 |
-
def preprocess_image(image):
|
649 |
|
650 |
-
651 |
|
652 |
image_pil = image2pil(image.name)
|
653 |
raw_w, raw_h = image_pil.size
|
654 |
|
655 |
max_edge = min(raw_w, raw_h)
|
656 |
-
resize_ratio =
|
657 |
|
658 |
image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
|
659 |
|
@@ -663,8 +675,8 @@ with gr.Blocks() as demo:
|
|
663 |
|
664 |
image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
|
665 |
|
666 |
-
width = crop_w
|
667 |
-
height = crop_h
|
668 |
|
669 |
id = str(time.time()).split('.')[0]
|
670 |
os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
|
@@ -709,7 +721,7 @@ with gr.Blocks() as demo:
|
|
709 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
710 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
711 |
|
712 |
-
viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
713 |
|
714 |
return tracking_points, trajectory_map, viz_flow
|
715 |
|
@@ -729,7 +741,7 @@ with gr.Blocks() as demo:
|
|
729 |
transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
|
730 |
motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
|
731 |
|
732 |
-
viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
733 |
|
734 |
return motion_brush_mask, transparent_layer, motion_map, viz_flow
|
735 |
|
@@ -765,7 +777,7 @@ with gr.Blocks() as demo:
|
|
765 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
766 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
767 |
|
768 |
-
viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
769 |
|
770 |
return tracking_points, trajectory_map, viz_flow
|
771 |
|
@@ -820,6 +832,6 @@ with gr.Blocks() as demo:
|
|
820 |
|
821 |
input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
|
822 |
|
823 |
-
run_button.click(run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
|
824 |
|
825 |
demo.launch()
|
|
|
89 |
return s_flow, mask
|
90 |
|
91 |
|
92 |
+
@spaces.GPU(duration=100)
|
93 |
+
def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
|
94 |
+
|
95 |
+
from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
|
96 |
+
from pipeline.pipeline import FlowControlNetPipeline
|
97 |
+
from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
|
98 |
+
|
99 |
+
print('start loading models...')
|
100 |
+
# Load scheduler, tokenizer and models.
|
101 |
+
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
102 |
+
pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
|
103 |
+
)
|
104 |
+
vae = AutoencoderKLTemporalDecoder.from_pretrained(
|
105 |
+
pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
|
106 |
+
unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
|
107 |
+
pretrained_model_name_or_path,
|
108 |
+
subfolder="unet",
|
109 |
+
low_cpu_mem_usage=True,
|
110 |
+
variant="fp16",
|
111 |
+
)
|
112 |
+
|
113 |
+
controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
|
114 |
+
|
115 |
+
cmp = CMP_demo(
|
116 |
+
'./models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
|
117 |
+
42000
|
118 |
+
).to(device)
|
119 |
+
cmp.requires_grad_(False)
|
120 |
+
|
121 |
+
# Freeze vae and image_encoder
|
122 |
+
vae.requires_grad_(False)
|
123 |
+
image_encoder.requires_grad_(False)
|
124 |
+
unet.requires_grad_(False)
|
125 |
+
controlnet.requires_grad_(False)
|
126 |
+
|
127 |
+
# Move image_encoder and vae to gpu and cast to weight_dtype
|
128 |
+
image_encoder.to(device, dtype=weight_dtype)
|
129 |
+
vae.to(device, dtype=weight_dtype)
|
130 |
+
unet.to(device, dtype=weight_dtype)
|
131 |
+
controlnet.to(device, dtype=weight_dtype)
|
132 |
+
|
133 |
+
if enable_xformers_memory_efficient_attention:
|
134 |
+
if is_xformers_available():
|
135 |
+
import xformers
|
136 |
+
|
137 |
+
xformers_version = version.parse(xformers.__version__)
|
138 |
+
if xformers_version == version.parse("0.0.16"):
|
139 |
+
print(
|
140 |
+
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
141 |
+
)
|
142 |
+
unet.enable_xformers_memory_efficient_attention()
|
143 |
+
else:
|
144 |
+
raise ValueError(
|
145 |
+
"xformers is not available. Make sure it is installed correctly")
|
146 |
+
|
147 |
+
if allow_tf32:
|
148 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
149 |
+
|
150 |
+
pipeline = FlowControlNetPipeline.from_pretrained(
|
151 |
+
pretrained_model_name_or_path,
|
152 |
+
unet=unet,
|
153 |
+
controlnet=controlnet,
|
154 |
+
image_encoder=image_encoder,
|
155 |
+
vae=vae,
|
156 |
+
torch_dtype=weight_dtype,
|
157 |
+
)
|
158 |
+
pipeline = pipeline.to(device)
|
159 |
+
|
160 |
+
print('models loaded.')
|
161 |
+
|
162 |
+
return pipeline, cmp
|
163 |
+
|
164 |
+
|
165 |
def interpolate_trajectory(points, n_points):
|
166 |
x = [point[0] for point in points]
|
167 |
y = [point[1] for point in points]
|
|
|
215 |
return trajectory_maps, transparent_layer
|
216 |
|
217 |
|
218 |
|
219 |
+
pipeline, cmp = init_models(
|
220 |
+
"ckpts/stable-video-diffusion-img2vid-xt-1-1",
|
221 |
+
"ckpts/controlnet",
|
222 |
+
weight_dtype=torch.float16,
|
223 |
+
device='cuda'
|
224 |
+
)
|
225 |
|
|
|
226 |
|
227 |
+
class Drag:
|
228 |
+
def __init__(self, height, width):
|
|
229 |
|
230 |
+
self.height = height
|
231 |
+
self.width = width
|
232 |
|
233 |
+
def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
|
|
234 |
|
235 |
'''
|
236 |
frames: [b, 13, 3, 384, 384] (0, 1) tensor
|
|
|
255 |
return cmp_flow
|
256 |
|
257 |
|
258 |
+
def get_flow(self, pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
|
259 |
|
260 |
fb, fl, fc, _, _ = pixel_values_384.shape
|
261 |
|
262 |
+
controlnet_flow = self.get_cmp_flow(
|
263 |
pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
|
264 |
sparse_optical_flow_384,
|
265 |
mask_384, motion_brush_mask
|
266 |
)
|
267 |
|
268 |
+
if self.height != 384 or self.width != 384:
|
269 |
+
scales = [self.height / 384, self.width / 384]
|
270 |
+
controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (self.height, self.width), mode='nearest').reshape(fb, fl, 2, self.height, self.width)
|
271 |
controlnet_flow[:, :, 0] *= scales[1]
|
272 |
controlnet_flow[:, :, 1] *= scales[0]
|
273 |
|
|
|
275 |
|
276 |
|
277 |
@torch.no_grad()
|
278 |
+
def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
|
279 |
'''
|
280 |
input_drag: [1, 13, 320, 576, 2]
|
281 |
input_drag_384: [1, 13, 384, 384, 2]
|
|
|
307 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
308 |
|
309 |
if in_mask_flag:
|
310 |
+
flow_inmask = self.get_flow(
|
311 |
input_first_frame_384,
|
312 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask
|
313 |
)
|
314 |
else:
|
315 |
fb, fl = mask_384_inmask.shape[:2]
|
316 |
+
flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
|
317 |
|
318 |
if out_mask_flag:
|
319 |
+
flow_outmask = self.get_flow(
|
320 |
input_first_frame_384,
|
321 |
input_drag_384_outmask, mask_384_outmask
|
322 |
)
|
323 |
else:
|
324 |
fb, fl = mask_384_outmask.shape[:2]
|
325 |
+
flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
|
326 |
|
327 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
328 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
|
|
368 |
|
369 |
@spaces.GPU
|
370 |
@torch.no_grad()
|
371 |
+
def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
|
372 |
|
373 |
+
original_width, original_height = self.width, self.height
|
374 |
|
375 |
input_all_points = tracking_points.constructor_args['value']
|
376 |
|
377 |
if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
|
378 |
return np.uint8(np.ones((original_width, original_height, 3))*255)
|
379 |
|
380 |
+
resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
|
381 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
382 |
|
383 |
new_resized_all_points = []
|
|
|
455 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
456 |
|
457 |
if in_mask_flag:
|
458 |
+
flow_inmask = self.get_flow(
|
459 |
input_first_frame_384,
|
460 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
|
461 |
)
|
462 |
else:
|
463 |
fb, fl = mask_384_inmask.shape[:2]
|
464 |
+
flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
|
465 |
|
466 |
if out_mask_flag:
|
467 |
+
flow_outmask = self.get_flow(
|
468 |
input_first_frame_384,
|
469 |
input_drag_384_outmask, mask_384_outmask
|
470 |
)
|
471 |
else:
|
472 |
fb, fl = mask_384_outmask.shape[:2]
|
473 |
+
flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
|
474 |
|
475 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
476 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
|
|
483 |
return viz_esti_flows
|
484 |
|
485 |
@spaces.GPU(duration=200)
|
486 |
+
def run(self, first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
|
487 |
|
488 |
+
original_width, original_height = self.width, self.height
|
489 |
|
490 |
input_all_points = tracking_points.constructor_args['value']
|
491 |
+
resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
|
492 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
493 |
|
494 |
new_resized_all_points = []
|
|
|
541 |
id = base.split('_')[0]
|
542 |
|
543 |
image_pil = image2pil(first_frame_path)
|
544 |
+
image_pil = image_pil.resize((self.width, self.height), Image.BILINEAR).convert('RGB')
|
545 |
|
546 |
+
visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, self.width, self.height)
|
547 |
|
548 |
motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
|
549 |
visualized_drag = visualized_drag[0].convert('RGBA')
|
|
|
566 |
first_frames = outputs['logits_imgs'][:, -1]
|
567 |
|
568 |
|
569 |
+
outputs = self.forward_sample(
|
570 |
input_drag_384_inmask.to('cuda'),
|
571 |
input_drag_384_outmask.to('cuda'),
|
572 |
first_frames.to('cuda'),
|
|
|
629 |
|
630 |
return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
|
631 |
|
|
|
|
|
632 |
|
633 |
+
with gr.Blocks() as demo:
|
634 |
+
gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
|
635 |
+
|
636 |
+
gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
|
637 |
+
|
638 |
+
gr.Markdown(
|
639 |
+
"""
|
640 |
+
During the inference, kindly follow these instructions:
|
641 |
+
<br>
|
642 |
+
1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
|
643 |
+
2. Proceed to draw trajectories: <br>
|
644 |
+
2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
|
645 |
+
2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
|
646 |
+
2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
|
647 |
+
2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
|
648 |
+
2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
|
649 |
+
3. Click the "Run" button to animate the image according to the path. <br>
|
650 |
+
"""
|
651 |
+
)
|
652 |
+
|
653 |
+
target_size = 512
|
654 |
+
DragNUWA_net = Drag(target_size, target_size)
|
655 |
+
first_frame_path = gr.State()
|
656 |
+
tracking_points = gr.State([])
|
657 |
+
motion_brush_points = gr.State([])
|
658 |
+
motion_brush_mask = gr.State()
|
659 |
+
motion_brush_viz = gr.State()
|
660 |
+
inference_batch_size = gr.State(1)
|
661 |
+
|
662 |
+
def preprocess_image(image):
|
663 |
|
664 |
image_pil = image2pil(image.name)
|
665 |
raw_w, raw_h = image_pil.size
|
666 |
|
667 |
max_edge = min(raw_w, raw_h)
|
668 |
+
resize_ratio = target_size / max_edge
|
669 |
|
670 |
image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
|
671 |
|
|
|
675 |
|
676 |
image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
|
677 |
|
678 |
+
DragNUWA_net.width = crop_w
|
679 |
+
DragNUWA_net.height = crop_h
|
680 |
|
681 |
id = str(time.time()).split('.')[0]
|
682 |
os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
|
|
|
721 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
722 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
723 |
|
724 |
+
viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
725 |
|
726 |
return tracking_points, trajectory_map, viz_flow
|
727 |
|
|
|
741 |
transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
|
742 |
motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
|
743 |
|
744 |
+
viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
745 |
|
746 |
return motion_brush_mask, transparent_layer, motion_map, viz_flow
|
747 |
|
|
|
777 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
778 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
779 |
|
780 |
+
viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
781 |
|
782 |
return tracking_points, trajectory_map, viz_flow
|
783 |
|
|
|
832 |
|
833 |
input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
|
834 |
|
835 |
+
run_button.click(DragNUWA_net.run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
|
836 |
|
837 |
demo.launch()
|
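
The rewritten app.py above moves model construction into a `spaces.GPU`-decorated `init_models` and keeps the working resolution in a small `Drag` class whose `run` method is bound to the Gradio button. A condensed sketch of that structure follows; bodies are elided and anything not named in the diff (the placeholder return values, the trimmed argument lists) is my own illustration, not the app's code:

```python
import spaces
import torch
import gradio as gr

@spaces.GPU(duration=100)
def init_models(model_path, controlnet_path, weight_dtype=torch.float16, device="cuda"):
    # In app.py this builds the SVD image encoder, VAE, UNet, FlowControlNet and CMP,
    # freezes them, moves them to the device, and returns (pipeline, cmp).
    return None, None  # placeholder for the sketch

pipeline, cmp = init_models("ckpts/stable-video-diffusion-img2vid-xt-1-1", "ckpts/controlnet")

class Drag:
    def __init__(self, height, width):
        self.height = height  # generation resolution, updated after preprocessing
        self.width = width

    @spaces.GPU(duration=200)
    def run(self, first_frame_path, tracking_points, *args):
        # flow synthesis + FlowControlNetPipeline call, as in the diff
        ...

with gr.Blocks() as demo:
    DragNUWA_net = Drag(512, 512)
    # run_button.click(DragNUWA_net.run, inputs=[...], outputs=[...])
```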
oldapp.py → modifiedapp.py
RENAMED
@@ -89,78 +89,6 @@ def get_sparseflow_and_mask_forward(
|
|
89 |
return s_flow, mask
|
90 |
|
91 |
|
92 |
-
def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
|
93 |
-
|
94 |
-
from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
|
95 |
-
from pipeline.pipeline import FlowControlNetPipeline
|
96 |
-
from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
|
97 |
-
|
98 |
-
print('start loading models...')
|
99 |
-
# Load scheduler, tokenizer and models.
|
100 |
-
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
101 |
-
pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
|
102 |
-
)
|
103 |
-
vae = AutoencoderKLTemporalDecoder.from_pretrained(
|
104 |
-
pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
|
105 |
-
unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
|
106 |
-
pretrained_model_name_or_path,
|
107 |
-
subfolder="unet",
|
108 |
-
low_cpu_mem_usage=True,
|
109 |
-
variant="fp16",
|
110 |
-
)
|
111 |
-
|
112 |
-
controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
|
113 |
-
|
114 |
-
cmp = CMP_demo(
|
115 |
-
'./models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
|
116 |
-
42000
|
117 |
-
).to(device)
|
118 |
-
cmp.requires_grad_(False)
|
119 |
-
|
120 |
-
# Freeze vae and image_encoder
|
121 |
-
vae.requires_grad_(False)
|
122 |
-
image_encoder.requires_grad_(False)
|
123 |
-
unet.requires_grad_(False)
|
124 |
-
controlnet.requires_grad_(False)
|
125 |
-
|
126 |
-
# Move image_encoder and vae to gpu and cast to weight_dtype
|
127 |
-
image_encoder.to(device, dtype=weight_dtype)
|
128 |
-
vae.to(device, dtype=weight_dtype)
|
129 |
-
unet.to(device, dtype=weight_dtype)
|
130 |
-
controlnet.to(device, dtype=weight_dtype)
|
131 |
-
|
132 |
-
if enable_xformers_memory_efficient_attention:
|
133 |
-
if is_xformers_available():
|
134 |
-
import xformers
|
135 |
-
|
136 |
-
xformers_version = version.parse(xformers.__version__)
|
137 |
-
if xformers_version == version.parse("0.0.16"):
|
138 |
-
print(
|
139 |
-
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
140 |
-
)
|
141 |
-
unet.enable_xformers_memory_efficient_attention()
|
142 |
-
else:
|
143 |
-
raise ValueError(
|
144 |
-
"xformers is not available. Make sure it is installed correctly")
|
145 |
-
|
146 |
-
if allow_tf32:
|
147 |
-
torch.backends.cuda.matmul.allow_tf32 = True
|
148 |
-
|
149 |
-
pipeline = FlowControlNetPipeline.from_pretrained(
|
150 |
-
pretrained_model_name_or_path,
|
151 |
-
unet=unet,
|
152 |
-
controlnet=controlnet,
|
153 |
-
image_encoder=image_encoder,
|
154 |
-
vae=vae,
|
155 |
-
torch_dtype=weight_dtype,
|
156 |
-
)
|
157 |
-
pipeline = pipeline.to(device)
|
158 |
-
|
159 |
-
print('models loaded.')
|
160 |
-
|
161 |
-
return pipeline, cmp
|
162 |
-
|
163 |
-
|
164 |
def interpolate_trajectory(points, n_points):
|
165 |
x = [point[0] for point in points]
|
166 |
y = [point[1] for point in points]
|
@@ -214,24 +142,110 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
|
|
214 |
return trajectory_maps, transparent_layer
|
215 |
|
216 |
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
|
221 |
-
|
222 |
-
|
223 |
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
)
|
230 |
|
231 |
-
|
232 |
-
|
233 |
|
234 |
-
|
235 |
|
236 |
'''
|
237 |
frames: [b, 13, 3, 384, 384] (0, 1) tensor
|
@@ -244,7 +258,7 @@ class Drag:
|
|
244 |
frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
|
245 |
sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
|
246 |
mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
|
247 |
-
cmp_flow =
|
248 |
|
249 |
if brush_mask is not None:
|
250 |
brush_mask = torch.from_numpy(brush_mask) / 255.
|
@@ -256,19 +270,19 @@ class Drag:
|
|
256 |
return cmp_flow
|
257 |
|
258 |
|
259 |
-
def get_flow(
|
260 |
|
261 |
fb, fl, fc, _, _ = pixel_values_384.shape
|
262 |
|
263 |
-
controlnet_flow =
|
264 |
pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
|
265 |
sparse_optical_flow_384,
|
266 |
mask_384, motion_brush_mask
|
267 |
)
|
268 |
|
269 |
-
if
|
270 |
-
scales = [
|
271 |
-
controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (
|
272 |
controlnet_flow[:, :, 0] *= scales[1]
|
273 |
controlnet_flow[:, :, 1] *= scales[0]
|
274 |
|
@@ -276,7 +290,7 @@ class Drag:
|
|
276 |
|
277 |
|
278 |
@torch.no_grad()
|
279 |
-
def forward_sample(
|
280 |
'''
|
281 |
input_drag: [1, 13, 320, 576, 2]
|
282 |
input_drag_384: [1, 13, 384, 384, 2]
|
@@ -308,29 +322,29 @@ class Drag:
|
|
308 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
309 |
|
310 |
if in_mask_flag:
|
311 |
-
flow_inmask =
|
312 |
input_first_frame_384,
|
313 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask
|
314 |
)
|
315 |
else:
|
316 |
fb, fl = mask_384_inmask.shape[:2]
|
317 |
-
flow_inmask = torch.zeros(fb, fl, 2,
|
318 |
|
319 |
if out_mask_flag:
|
320 |
-
flow_outmask =
|
321 |
input_first_frame_384,
|
322 |
input_drag_384_outmask, mask_384_outmask
|
323 |
)
|
324 |
else:
|
325 |
fb, fl = mask_384_outmask.shape[:2]
|
326 |
-
flow_outmask = torch.zeros(fb, fl, 2,
|
327 |
|
328 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
329 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
330 |
|
331 |
controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
|
332 |
|
333 |
-
val_output =
|
334 |
input_first_frame_pil,
|
335 |
input_first_frame_pil,
|
336 |
controlnet_flow,
|
@@ -369,16 +383,16 @@ class Drag:
|
|
369 |
|
370 |
@spaces.GPU
|
371 |
@torch.no_grad()
|
372 |
-
def get_cmp_flow_from_tracking_points(
|
373 |
|
374 |
-
original_width, original_height =
|
375 |
|
376 |
input_all_points = tracking_points.constructor_args['value']
|
377 |
|
378 |
if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
|
379 |
return np.uint8(np.ones((original_width, original_height, 3))*255)
|
380 |
|
381 |
-
resized_all_points = [tuple([tuple([int(e1[0]*
|
382 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
383 |
|
384 |
new_resized_all_points = []
|
@@ -456,22 +470,22 @@ class Drag:
|
|
456 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
457 |
|
458 |
if in_mask_flag:
|
459 |
-
flow_inmask =
|
460 |
input_first_frame_384,
|
461 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
|
462 |
)
|
463 |
else:
|
464 |
fb, fl = mask_384_inmask.shape[:2]
|
465 |
-
flow_inmask = torch.zeros(fb, fl, 2,
|
466 |
|
467 |
if out_mask_flag:
|
468 |
-
flow_outmask =
|
469 |
input_first_frame_384,
|
470 |
input_drag_384_outmask, mask_384_outmask
|
471 |
)
|
472 |
else:
|
473 |
fb, fl = mask_384_outmask.shape[:2]
|
474 |
-
flow_outmask = torch.zeros(fb, fl, 2,
|
475 |
|
476 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
477 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
@@ -484,12 +498,12 @@ class Drag:
|
|
484 |
return viz_esti_flows
|
485 |
|
486 |
@spaces.GPU(duration=200)
|
487 |
-
def run(
|
488 |
|
489 |
-
original_width, original_height =
|
490 |
|
491 |
input_all_points = tracking_points.constructor_args['value']
|
492 |
-
resized_all_points = [tuple([tuple([int(e1[0]*
|
493 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
494 |
|
495 |
new_resized_all_points = []
|
@@ -542,9 +556,9 @@ class Drag:
|
|
542 |
id = base.split('_')[0]
|
543 |
|
544 |
image_pil = image2pil(first_frame_path)
|
545 |
-
image_pil = image_pil.resize((
|
546 |
|
547 |
-
visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points,
|
548 |
|
549 |
motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
|
550 |
visualized_drag = visualized_drag[0].convert('RGBA')
|
@@ -567,7 +581,7 @@ class Drag:
|
|
567 |
first_frames = outputs['logits_imgs'][:, -1]
|
568 |
|
569 |
|
570 |
-
outputs =
|
571 |
input_drag_384_inmask.to('cuda'),
|
572 |
input_drag_384_outmask.to('cuda'),
|
573 |
first_frames.to('cuda'),
|
@@ -630,43 +644,17 @@ class Drag:
|
|
630 |
|
631 |
return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
|
632 |
|
633 |
-
|
634 |
-
with gr.Blocks() as demo:
|
635 |
-
gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
|
636 |
-
|
637 |
-
gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
|
638 |
-
|
639 |
-
gr.Markdown(
|
640 |
-
"""
|
641 |
-
During the inference, kindly follow these instructions:
|
642 |
-
<br>
|
643 |
-
1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
|
644 |
-
2. Proceed to draw trajectories: <br>
|
645 |
-
2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
|
646 |
-
2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
|
647 |
-
2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
|
648 |
-
2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
|
649 |
-
2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
|
650 |
-
3. Click the "Run" button to animate the image according to the path. <br>
|
651 |
-
"""
|
652 |
-
)
|
653 |
-
|
654 |
-
target_size = 512
|
655 |
-
DragNUWA_net = Drag(target_size, target_size)
|
656 |
-
first_frame_path = gr.State()
|
657 |
-
tracking_points = gr.State([])
|
658 |
-
motion_brush_points = gr.State([])
|
659 |
-
motion_brush_mask = gr.State()
|
660 |
-
motion_brush_viz = gr.State()
|
661 |
-
inference_batch_size = gr.State(1)
|
662 |
-
|
663 |
def preprocess_image(image):
|
664 |
|
665 |
image_pil = image2pil(image.name)
|
666 |
raw_w, raw_h = image_pil.size
|
667 |
|
668 |
max_edge = min(raw_w, raw_h)
|
669 |
-
resize_ratio =
|
670 |
|
671 |
image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
|
672 |
|
@@ -676,8 +664,8 @@ with gr.Blocks() as demo:
|
|
676 |
|
677 |
image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
|
678 |
|
679 |
-
|
680 |
-
|
681 |
|
682 |
id = str(time.time()).split('.')[0]
|
683 |
os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
|
@@ -722,7 +710,7 @@ with gr.Blocks() as demo:
|
|
722 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
723 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
724 |
|
725 |
-
viz_flow =
|
726 |
|
727 |
return tracking_points, trajectory_map, viz_flow
|
728 |
|
@@ -742,7 +730,7 @@ with gr.Blocks() as demo:
|
|
742 |
transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
|
743 |
motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
|
744 |
|
745 |
-
viz_flow =
|
746 |
|
747 |
return motion_brush_mask, transparent_layer, motion_map, viz_flow
|
748 |
|
@@ -778,7 +766,7 @@ with gr.Blocks() as demo:
|
|
778 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
779 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
780 |
|
781 |
-
viz_flow =
|
782 |
|
783 |
return tracking_points, trajectory_map, viz_flow
|
784 |
|
@@ -833,6 +821,6 @@ with gr.Blocks() as demo:
|
|
833 |
|
834 |
input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
|
835 |
|
836 |
-
run_button.click(
|
837 |
|
838 |
demo.launch()
|
|
|
89 |
return s_flow, mask
|
90 |
|
91 |
|
92 |
def interpolate_trajectory(points, n_points):
|
93 |
x = [point[0] for point in points]
|
94 |
y = [point[1] for point in points]
|
|
|
142 |
return trajectory_maps, transparent_layer
|
143 |
|
144 |
|
145 |
+
with gr.Blocks() as demo:
|
146 |
+
gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
|
147 |
+
|
148 |
+
gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
|
149 |
+
|
150 |
+
gr.Markdown(
|
151 |
+
"""
|
152 |
+
During the inference, kindly follow these instructions:
|
153 |
+
<br>
|
154 |
+
1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
|
155 |
+
2. Proceed to draw trajectories: <br>
|
156 |
+
2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
|
157 |
+
2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
|
158 |
+
2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
|
159 |
+
2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
|
160 |
+
2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
|
161 |
+
3. Click the "Run" button to animate the image according to the path. <br>
|
162 |
+
"""
|
163 |
+
)
|
164 |
+
|
165 |
+
height, width = 512, 512
|
166 |
+
|
167 |
+
pipeline, cmp = None, None
|
168 |
+
|
169 |
+
first_frame_path = gr.State()
|
170 |
+
tracking_points = gr.State([])
|
171 |
+
motion_brush_points = gr.State([])
|
172 |
+
motion_brush_mask = gr.State()
|
173 |
+
motion_brush_viz = gr.State()
|
174 |
+
inference_batch_size = gr.State(1)
|
175 |
|
176 |
+
@spaces.GPU(duration=100)
|
177 |
+
def init_models(pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1", resume_from_checkpoint="ckpts/controlnet", weight_dtype=torch.float16, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
|
178 |
|
179 |
+
from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
|
180 |
+
from pipeline.pipeline import FlowControlNetPipeline
|
181 |
+
from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
|
182 |
+
|
183 |
+
print('start loading models...')
|
184 |
+
# Load scheduler, tokenizer and models.
|
185 |
+
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
186 |
+
pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
|
187 |
+
)
|
188 |
+
vae = AutoencoderKLTemporalDecoder.from_pretrained(
|
189 |
+
pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
|
190 |
+
unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
|
191 |
+
pretrained_model_name_or_path,
|
192 |
+
subfolder="unet",
|
193 |
+
low_cpu_mem_usage=True,
|
194 |
+
variant="fp16",
|
195 |
)
|
196 |
|
197 |
+
controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
|
198 |
+
|
199 |
+
cmp = CMP_demo(
|
200 |
+
'./models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
|
201 |
+
42000
|
202 |
+
).to(device)
|
203 |
+
cmp.requires_grad_(False)
|
204 |
+
|
205 |
+
# Freeze vae and image_encoder
|
206 |
+
vae.requires_grad_(False)
|
207 |
+
image_encoder.requires_grad_(False)
|
208 |
+
unet.requires_grad_(False)
|
209 |
+
controlnet.requires_grad_(False)
|
210 |
+
|
211 |
+
# Move image_encoder and vae to gpu and cast to weight_dtype
|
212 |
+
image_encoder.to(device, dtype=weight_dtype)
|
213 |
+
vae.to(device, dtype=weight_dtype)
|
214 |
+
unet.to(device, dtype=weight_dtype)
|
215 |
+
controlnet.to(device, dtype=weight_dtype)
|
216 |
+
|
217 |
+
if enable_xformers_memory_efficient_attention:
|
218 |
+
if is_xformers_available():
|
219 |
+
import xformers
|
220 |
+
|
221 |
+
xformers_version = version.parse(xformers.__version__)
|
222 |
+
if xformers_version == version.parse("0.0.16"):
|
223 |
+
print(
|
224 |
+
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
225 |
+
)
|
226 |
+
unet.enable_xformers_memory_efficient_attention()
|
227 |
+
else:
|
228 |
+
raise ValueError(
|
229 |
+
"xformers is not available. Make sure it is installed correctly")
|
230 |
+
|
231 |
+
if allow_tf32:
|
232 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
233 |
+
|
234 |
+
pipeline = FlowControlNetPipeline.from_pretrained(
|
235 |
+
pretrained_model_name_or_path,
|
236 |
+
unet=unet,
|
237 |
+
controlnet=controlnet,
|
238 |
+
image_encoder=image_encoder,
|
239 |
+
vae=vae,
|
240 |
+
torch_dtype=weight_dtype,
|
241 |
+
)
|
242 |
+
pipeline = pipeline.to(device)
|
243 |
|
244 |
+
print('models loaded.')
|
245 |
+
|
246 |
+
return pipeline, cmp
|
247 |
+
|
248 |
+
def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):
|
249 |
|
250 |
'''
|
251 |
frames: [b, 13, 3, 384, 384] (0, 1) tensor
|
|
|
258 |
frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
|
259 |
sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
|
260 |
mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
|
261 |
+
cmp_flow = cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
|
262 |
|
263 |
if brush_mask is not None:
|
264 |
brush_mask = torch.from_numpy(brush_mask) / 255.
|
|
|
270 |
return cmp_flow
|
271 |
|
272 |
|
273 |
+
def get_flow(pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
|
274 |
|
275 |
fb, fl, fc, _, _ = pixel_values_384.shape
|
276 |
|
277 |
+
controlnet_flow = get_cmp_flow(
|
278 |
pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
|
279 |
sparse_optical_flow_384,
|
280 |
mask_384, motion_brush_mask
|
281 |
)
|
282 |
|
283 |
+
if height != 384 or width != 384:
|
284 |
+
scales = [height / 384, width / 384]
|
285 |
+
controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (height, width), mode='nearest').reshape(fb, fl, 2, height, width)
|
286 |
controlnet_flow[:, :, 0] *= scales[1]
|
287 |
controlnet_flow[:, :, 1] *= scales[0]
|
288 |
|
|
|
290 |
|
291 |
|
292 |
@torch.no_grad()
|
293 |
+
def forward_sample(input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
|
294 |
'''
|
295 |
input_drag: [1, 13, 320, 576, 2]
|
296 |
input_drag_384: [1, 13, 384, 384, 2]
|
|
|
322 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
323 |
|
324 |
if in_mask_flag:
|
325 |
+
flow_inmask = get_flow(
|
326 |
input_first_frame_384,
|
327 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask
|
328 |
)
|
329 |
else:
|
330 |
fb, fl = mask_384_inmask.shape[:2]
|
331 |
+
flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
332 |
|
333 |
if out_mask_flag:
|
334 |
+
flow_outmask = get_flow(
|
335 |
input_first_frame_384,
|
336 |
input_drag_384_outmask, mask_384_outmask
|
337 |
)
|
338 |
else:
|
339 |
fb, fl = mask_384_outmask.shape[:2]
|
340 |
+
flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
341 |
|
342 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
343 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
344 |
|
345 |
controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
|
346 |
|
347 |
+
val_output = pipeline(
|
348 |
input_first_frame_pil,
|
349 |
input_first_frame_pil,
|
350 |
controlnet_flow,
|
|
|
383 |
|
384 |
@spaces.GPU
|
385 |
@torch.no_grad()
|
386 |
+
def get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path):
|
387 |
|
388 |
+
original_width, original_height = width, height
|
389 |
|
390 |
input_all_points = tracking_points.constructor_args['value']
|
391 |
|
392 |
if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
|
393 |
return np.uint8(np.ones((original_width, original_height, 3))*255)
|
394 |
|
395 |
+
resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
|
396 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
397 |
|
398 |
new_resized_all_points = []
|
|
|
470 |
input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
|
471 |
|
472 |
if in_mask_flag:
|
473 |
+
flow_inmask = get_flow(
|
474 |
input_first_frame_384,
|
475 |
input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
|
476 |
)
|
477 |
else:
|
478 |
fb, fl = mask_384_inmask.shape[:2]
|
479 |
+
flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
480 |
|
481 |
if out_mask_flag:
|
482 |
+
flow_outmask = get_flow(
|
483 |
input_first_frame_384,
|
484 |
input_drag_384_outmask, mask_384_outmask
|
485 |
)
|
486 |
else:
|
487 |
fb, fl = mask_384_outmask.shape[:2]
|
488 |
+
flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
|
489 |
|
490 |
inmask_no_zero = (flow_inmask != 0).all(dim=2)
|
491 |
inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
|
|
|
498 |
return viz_esti_flows
|
499 |
|
500 |
@spaces.GPU(duration=200)
|
501 |
+
def run(first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
|
502 |
|
503 |
+
original_width, original_height = width, height
|
504 |
|
505 |
input_all_points = tracking_points.constructor_args['value']
|
506 |
+
resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
|
507 |
resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
|
508 |
|
509 |
new_resized_all_points = []
|
|
|
556 |
id = base.split('_')[0]
|
557 |
|
558 |
image_pil = image2pil(first_frame_path)
|
559 |
+
image_pil = image_pil.resize((width, height), Image.BILINEAR).convert('RGB')
|
560 |
|
561 |
+
visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, width, height)
|
562 |
|
563 |
motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
|
564 |
visualized_drag = visualized_drag[0].convert('RGBA')
|
|
|
581 |
first_frames = outputs['logits_imgs'][:, -1]
|
582 |
|
583 |
|
584 |
+
outputs = forward_sample(
|
585 |
input_drag_384_inmask.to('cuda'),
|
586 |
input_drag_384_outmask.to('cuda'),
|
587 |
first_frames.to('cuda'),
|
|
|
644 |
|
645 |
return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
|
646 |
|
647 |
+
@spaces.GPU(duration=100)
|
648 |
def preprocess_image(image):
|
649 |
+
|
650 |
+
if pipeline is None or cmp is None:
|
651 |
+
pipeline, cmp = init_models()
|
652 |
|
653 |
image_pil = image2pil(image.name)
|
654 |
raw_w, raw_h = image_pil.size
|
655 |
|
656 |
max_edge = min(raw_w, raw_h)
|
657 |
+
resize_ratio = width / max_edge
|
658 |
|
659 |
image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
|
660 |
|
|
|
664 |
|
665 |
image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
|
666 |
|
667 |
+
width = crop_w
|
668 |
+
height = crop_h
|
669 |
|
670 |
id = str(time.time()).split('.')[0]
|
671 |
os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
|
|
|
710 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
711 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
712 |
|
713 |
+
viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
714 |
|
715 |
return tracking_points, trajectory_map, viz_flow
|
716 |
|
|
|
730 |
transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
|
731 |
motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
|
732 |
|
733 |
+
viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
734 |
|
735 |
return motion_brush_mask, transparent_layer, motion_map, viz_flow
|
736 |
|
|
|
766 |
transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
|
767 |
trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
|
768 |
|
769 |
+
viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
|
770 |
|
771 |
return tracking_points, trajectory_map, viz_flow
|
772 |
|
|
|
821 |
|
822 |
input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
|
823 |
|
824 |
+
run_button.click(run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
|
825 |
|
826 |
demo.launch()
|
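
Both file versions compute dense flow at 384×384 with CMP and then resize it to the working resolution, scaling the x and y flow components by the per-axis ratios. A standalone sketch of just that resizing step (tensor shapes follow the comments in the diff; the function name is mine, not the app's):

```python
import torch
import torch.nn.functional as F

def rescale_flow(flow_384: torch.Tensor, height: int, width: int) -> torch.Tensor:
    """Resize flow from [b, f, 2, 384, 384] to [b, f, 2, height, width].

    Flow vectors are in pixels, so after spatial interpolation the x-component
    is scaled by width/384 and the y-component by height/384, mirroring the
    logic of get_flow() in the diff above.
    """
    b, f, c, h, w = flow_384.shape
    assert (c, h, w) == (2, 384, 384)
    if (height, width) == (384, 384):
        return flow_384
    scales = [height / 384, width / 384]
    flow = F.interpolate(flow_384.flatten(0, 1), (height, width), mode="nearest")
    flow = flow.reshape(b, f, 2, height, width)
    flow[:, :, 0] *= scales[1]  # x displacements
    flow[:, :, 1] *= scales[0]  # y displacements
    return flow

# Example: upscale a dummy 384x384 flow field to 512x320
# out = rescale_flow(torch.zeros(1, 13, 2, 384, 384), height=512, width=320)
```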