QinOwen committed
Commit 2ad9d00
1 Parent(s): 5098655
VADER-VideoCrafter/scripts/main/train_t2v_lora.py CHANGED
@@ -29,7 +29,6 @@ from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
 import hpsv2
 import bitsandbytes as bnb
 from accelerate import Accelerator
-from accelerate.logging import get_logger
 from accelerate.utils import gather_object
 import torch.distributed as dist
 import logging
@@ -43,16 +42,6 @@ import cv2
 # st = ipdb.set_trace
 
 
-logger = get_logger(__name__, log_level="INFO") # get logger for current module
-
-def create_logging(logging, logger, accelerator):
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state, main_process_only=False)
-
 def create_output_folders(output_dir, run_name):
     out_dir = os.path.join(output_dir, run_name)
     os.makedirs(out_dir, exist_ok=True)
@@ -567,12 +556,162 @@ def should_sample(global_step, validation_steps, is_sample_preview):
         and is_sample_preview
 
 
+# def run_training(args, model, **kwargs):
+#     ## ---------------------step 1: setup---------------------------
+#     output_dir = args.project_dir
+
+
+#     # step 2.1: add LoRA using peft
+#     config = peft.LoraConfig(
+#             r=args.lora_rank,
+#             target_modules=["to_k", "to_v", "to_q"],        # only diffusion_model has these modules
+#             lora_dropout=0.01,
+#         )
+#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+#     model = model.to(device)
+#     peft_model = peft.get_peft_model(model, config)
+
+
+
+#     # load the pretrained LoRA model
+#     if args.lora_ckpt_path != "Base Model":
+#         if args.lora_ckpt_path == "huggingface-hps-aesthetic":     # download the pretrained LoRA model from huggingface
+#             snapshot_download(repo_id='zheyangqin/VADER', local_dir ='VADER-VideoCrafter/checkpoints/pretrained_lora')
+#             args.lora_ckpt_path = 'VADER-VideoCrafter/checkpoints/pretrained_lora/vader_videocrafter_hps_aesthetic.pt'
+#         elif args.lora_ckpt_path == "huggingface-pickscore":        # download the pretrained LoRA model from huggingface
+#             snapshot_download(repo_id='zheyangqin/VADER', local_dir ='VADER-VideoCrafter/checkpoints/pretrained_lora')
+#             args.lora_ckpt_path = 'VADER-VideoCrafter/checkpoints/pretrained_lora/vader_videocrafter_pickscore.pt'
+#         # load the pretrained LoRA model
+#         peft.set_peft_model_state_dict(peft_model, torch.load(args.lora_ckpt_path))
+
+
+#     # peft_model.first_stage_model.to(device)
+
+#     peft_model.eval()
+
+#     print("device is: ", device)
+#     print("precision: ", peft_model.dtype)
+#     # precision of first_stage_model
+#     print("precision of first_stage_model: ", peft_model.first_stage_model.dtype)
+#     print("peft_model device: ", peft_model.device)
+
+#     # Inference Step: only do inference and save the videos. Skip this step if it is training
+#     # ==================================================================
+#     # sample shape
+#     assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!"
+#     # latent noise shape
+#     h, w = args.height // 8, args.width // 8
+
+#     frames = peft_model.temporal_length if args.frames < 0 else args.frames
+#     channels = peft_model.channels
+
+#     ## Inference step 2: run Inference over samples
+#     print("***** Running inference *****")
+
+
+#     ## Inference Step 3: generate new validation videos
+#     with torch.no_grad():
+
+#         # set random seed for each process
+#         random.seed(args.seed)
+#         torch.manual_seed(args.seed)
+
+#         prompts_all = [args.prompt_str]
+#         val_prompt = list(prompts_all)
+
+#         assert len(val_prompt) == 1, "Error: only one prompt is allowed for inference in gradio!"
+
+#         # store output of generations in dict
+#         results=dict(filenames=[],dir_name=[], prompt=[])
+
+#         # Inference Step 3.1: forward pass
+#         batch_size = len(val_prompt)
+#         noise_shape = [batch_size, channels, frames, h, w]
+
+#         fps = torch.tensor([args.fps]*batch_size).to(device).long()
+
+#         prompts = val_prompt
+#         if isinstance(prompts, str):
+#             prompts = [prompts]
+
+#         # mix precision
+
+#         if isinstance(peft_model, torch.nn.parallel.DistributedDataParallel):
+#             text_emb = peft_model.module.get_learned_conditioning(prompts).to(device)
+#         else:
+#             text_emb = peft_model.get_learned_conditioning(prompts).to(device)
+
+#         if args.mode == 'base':
+#             cond = {"c_crossattn": [text_emb], "fps": fps}
+#         else:   # TODO: implement i2v mode training in the future
+#             raise NotImplementedError
+
+#         # Inference Step 3.2: inference, batch_samples shape: batch, <samples>, c, t, h, w
+#         # no backprop_mode=args.backprop_mode because it is inference process
+#         batch_samples = batch_ddim_sampling(peft_model, cond, noise_shape, args.n_samples, \
+#                                             args.ddim_steps, args.ddim_eta, args.unconditional_guidance_scale, None, decode_frame=args.decode_frame, **kwargs)
+
+#         print("batch_samples dtype: ", batch_samples.dtype)
+#         print("batch_samples device: ", batch_samples.device)
+#         # batch_samples: b,samples,c,t,h,w
+#         dir_name = os.path.join(output_dir, "samples")
+#         # filenames should be related to the gpu index
+#         # get timestamps for filenames to avoid overwriting
+#         # current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+#         filenames = [f"temporal"]   # only one sample
+#         # if dir_name is not exists, create it
+#         os.makedirs(dir_name, exist_ok=True)
+
+#         save_videos(batch_samples, dir_name, filenames, fps=args.savefps)
+
+#         results["filenames"].extend(filenames)
+#         results["dir_name"].extend([dir_name]*len(filenames))
+#         results["prompt"].extend(prompts)
+#         results=[ results ] # transform to list, otherwise gather_object() will not collect correctly
+
+#         # Inference Step 3.3: collect inference results and save the videos to wandb
+#         # collect inference results from all the GPUs
+#         results_gathered=gather_object(results)
+
+#         filenames = []
+#         dir_name = []
+#         prompts = []
+#         for i in range(len(results_gathered)):
+#             filenames.extend(results_gathered[i]["filenames"])
+#             dir_name.extend(results_gathered[i]["dir_name"])
+#             prompts.extend(results_gathered[i]["prompt"])
+
+#         print("Validation sample saved!")
+
+#         # # batch size is 1, so only one video is generated
+
+#         # video = get_videos(batch_samples)
+
+#         # # read the video from the saved path
+#         video_path = os.path.join(dir_name[0], filenames[0]+".mp4")
+
+
+
+#         # release memory
+#         del batch_samples
+#         torch.cuda.empty_cache()
+#         gc.collect()
+
+#         return video_path
+
+#     # end of inference only, training script continues
+#     # ==================================================================
+
 def run_training(args, model, **kwargs):
     ## ---------------------step 1: accelerator setup---------------------------
     accelerator = Accelerator(                                      # Initialize Accelerator
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
-        project_dir=args.project_dir
+        project_dir=args.project_dir,
+        device_placement=True,
+        cpu=False
     )
     output_dir = args.project_dir
 
@@ -584,6 +723,8 @@ def run_training(args, model, **kwargs):
         lora_dropout=0.01,
     )
 
+    model = model.to(accelerator.device)
+
     peft_model = peft.get_peft_model(model, config)
 
     peft_model.print_trainable_parameters()
@@ -599,13 +740,24 @@ def run_training(args, model, **kwargs):
         # load the pretrained LoRA model
         peft.set_peft_model_state_dict(peft_model, torch.load(args.lora_ckpt_path))
 
-    # Make one log on every process with the configuration for debugging.
-    create_logging(logging, logger, accelerator)
-
+
+    print("precision: ", peft_model.dtype)
+    # precision of first_stage_model
+    print("precision of first_stage_model: ", peft_model.first_stage_model.dtype)
+    print("peft_model device: ", peft_model.device)
+
     # Inference Step: only do inference and save the videos. Skip this step if it is training
     # ==================================================================
     if args.inference_only:
        peft_model = accelerator.prepare(peft_model)
+
+
+        print("precision: ", peft_model.dtype)
+        # precision of first_stage_model
+        print("precision of first_stage_model: ", peft_model.first_stage_model.dtype)
+        print("peft_model device: ", peft_model.device)
+
+
        # sample shape
        assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!"
        # latent noise shape
@@ -618,7 +770,7 @@ def run_training(args, model, **kwargs):
         channels = peft_model.channels
 
         ## Inference step 2: run Inference over samples
-        logger.info("***** Running inference *****")
+        print("***** Running inference *****")
 
         first_epoch = 0
         global_step = 0
@@ -627,10 +779,6 @@ def run_training(args, model, **kwargs):
         ## Inference Step 3: generate new validation videos
         with torch.no_grad():
 
-            # set random seed for each process
-            random.seed(args.seed)
-            torch.manual_seed(args.seed)
-
             prompts_all = [args.prompt_str]
             val_prompt = list(prompts_all)
 
@@ -670,6 +818,8 @@ def run_training(args, model, **kwargs):
             batch_samples = batch_ddim_sampling(peft_model, cond, noise_shape, args.n_samples, \
                                                 args.ddim_steps, args.ddim_eta, args.unconditional_guidance_scale, None, decode_frame=args.decode_frame, **kwargs)
 
+            print("batch_samples dtype: ", batch_samples.dtype)
+            print("batch_samples device: ", batch_samples.device)
             # batch_samples: b,samples,c,t,h,w
             dir_name = os.path.join(output_dir, "samples")
             # filenames should be related to the gpu index
@@ -699,7 +849,7 @@ def run_training(args, model, **kwargs):
                 dir_name.extend(results_gathered[i]["dir_name"])
                 prompts.extend(results_gathered[i]["prompt"])
 
-            logger.info("Validation sample saved!")
+            print("Validation sample saved!")
 
             # # batch size is 1, so only one video is generated
 
@@ -715,12 +865,9 @@ def run_training(args, model, **kwargs):
             torch.cuda.empty_cache()
             gc.collect()
 
-    return video_path
+            return video_path
 
-    # end of inference only, training script continues
-    # ==================================================================
 
-
 def setup_model():
     parser = get_parser()
     args = parser.parse_args()
@@ -747,6 +894,7 @@ def setup_model():
 
 
     print("Model setup complete!")
+    print("model dtype: ", model.dtype)
     return model
 
 
@@ -777,3 +925,8 @@ def main_fn(prompt, lora_model, lora_rank, seed=200, height=320, width=512, unco
 
     return video_path
 
+# if main
+if __name__ == "__main__":
+    model = setup_model()
+
+    main_fn("a person walking on the street", "huggingface-hps-aesthetic", 16, 200, 320, 512, 12, 25, 1.0, 24, 10, model=model)

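As a quick reference for the LoRA hunks above, here is a minimal, self-contained sketch of the attach-adapter-then-load-checkpoint pattern this script uses. The `peft` calls, the `target_modules` list, the repo id `zheyangqin/VADER`, and the checkpoint paths are taken from the diff; the wrapper name `attach_vader_lora` and the `map_location` argument are illustrative assumptions, not part of the repository.

```python
import torch
import peft
from huggingface_hub import snapshot_download


def attach_vader_lora(model: torch.nn.Module,
                      lora_rank: int = 16,
                      lora_ckpt_path: str = "huggingface-hps-aesthetic"):
    # Wrap the attention projections with LoRA, mirroring the config in the diff.
    # The base model is expected to contain submodules named to_k / to_v / to_q.
    config = peft.LoraConfig(
        r=lora_rank,
        target_modules=["to_k", "to_v", "to_q"],  # only the diffusion model has these modules
        lora_dropout=0.01,
    )
    peft_model = peft.get_peft_model(model, config)

    # Resolve the named pretrained adapter to a local file, as the script does.
    if lora_ckpt_path == "huggingface-hps-aesthetic":
        snapshot_download(repo_id="zheyangqin/VADER",
                          local_dir="VADER-VideoCrafter/checkpoints/pretrained_lora")
        lora_ckpt_path = ("VADER-VideoCrafter/checkpoints/pretrained_lora/"
                          "vader_videocrafter_hps_aesthetic.pt")

    # Load only the LoRA weights on top of the base model's weights.
    peft.set_peft_model_state_dict(peft_model, torch.load(lora_ckpt_path, map_location="cpu"))
    return peft_model
```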
app.py CHANGED
@@ -22,7 +22,7 @@ examples = [
 
 model = setup_model()
 
-@spaces.GPU(duration=70)
+@spaces.GPU(duration=120)
 def gradio_main_fn(prompt, lora_model, lora_rank, seed, height, width, unconditional_guidance_scale, ddim_steps, ddim_eta,
                     frames, savefps):
     global model
@@ -203,16 +203,16 @@ with gr.Blocks(css=custom_css) as demo:
            seed = gr.Slider(minimum=0, maximum=65536, label="Seed", step = 1, value=200)
 
            with gr.Row():
-                height = gr.Slider(minimum=0, maximum=1024, label="Height", step = 16, value=384)
-                width = gr.Slider(minimum=0, maximum=1024, label="Width", step = 16, value=512)
+                height = gr.Slider(minimum=0, maximum=512, label="Height", step = 16, value=384)
+                width = gr.Slider(minimum=0, maximum=512, label="Width", step = 16, value=512)
 
            with gr.Row():
                frames = gr.Slider(minimum=0, maximum=50, label="Frames", step = 1, value=24)
-                savefps = gr.Slider(minimum=0, maximum=60, label="Save FPS", step = 1, value=10)
+                savefps = gr.Slider(minimum=0, maximum=30, label="Save FPS", step = 1, value=10)
 
 
            with gr.Row():
-                DDIM_Steps = gr.Slider(minimum=0, maximum=100, label="DDIM Steps", step = 1, value=25)
+                DDIM_Steps = gr.Slider(minimum=0, maximum=50, label="DDIM Steps", step = 1, value=25)
                unconditional_guidance_scale = gr.Slider(minimum=0, maximum=50, label="Guidance Scale", step = 0.1, value=12.0)
                DDIM_Eta = gr.Slider(minimum=0, maximum=1, label="DDIM Eta", step = 0.01, value=1.0)
 
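The `app.py` hunks raise the ZeroGPU time budget and tighten the slider ranges. A minimal sketch of that decorator-plus-slider pattern, assuming a placeholder handler in place of the Space's real `gradio_main_fn`:

```python
import gradio as gr
import spaces


@spaces.GPU(duration=120)  # request a ZeroGPU slot for up to 120 s per call
def generate(prompt, height, width, ddim_steps, savefps):
    # Placeholder: the Space calls main_fn(...) from train_t2v_lora.py here.
    return None


with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    with gr.Row():
        height = gr.Slider(minimum=0, maximum=512, label="Height", step=16, value=384)
        width = gr.Slider(minimum=0, maximum=512, label="Width", step=16, value=512)
    with gr.Row():
        ddim_steps = gr.Slider(minimum=0, maximum=50, label="DDIM Steps", step=1, value=25)
        savefps = gr.Slider(minimum=0, maximum=30, label="Save FPS", step=1, value=10)
    output = gr.Video(label="Generated Video")
    gr.Button("Generate").click(generate, [prompt, height, width, ddim_steps, savefps], output)

if __name__ == "__main__":
    demo.launch()
```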
gradio_cached_examples/32/indices.csv DELETED
@@ -1 +0,0 @@
-0

gradio_cached_examples/32/log.csv DELETED
@@ -1,2 +0,0 @@
-component 0,flag,username,timestamp
-"{""video"": {""path"": ""gradio_cached_examples/32/component 0/fd156c6a458fa048724e/temporal.mp4"", ""url"": ""/file=/tmp/gradio/4bc133becbc469de8da700250f7f7df1103c6f56/temporal.mp4"", ""size"": null, ""orig_name"": ""temporal.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-19 00:00:10.509808

gradio_cached_examples/34/indices.csv DELETED
@@ -1 +0,0 @@
-0

gradio_cached_examples/34/log.csv DELETED
@@ -1,2 +0,0 @@
-component 0,flag,username,timestamp
-"{""video"": {""path"": ""gradio_cached_examples/34/component 0/d2ac1c9664e80f60d50f/temporal.mp4"", ""url"": ""/file=/tmp/gradio/4bc133becbc469de8da700250f7f7df1103c6f56/temporal.mp4"", ""size"": null, ""orig_name"": ""temporal.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-18 23:33:26.912888