modify app
app.py CHANGED
@@ -78,6 +78,8 @@ flatsharpDic = {
     'Bb':'A#'
 }
 
+chordList = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
+
 max_conseq_N = 0
 max_conseq_chord = 2
 tempo = 120
@@ -89,6 +91,17 @@ min_velocity = 49 # Minimum velocity value in the output range
 max_velocity = 112 # Maximum velocity value in the output range
 
 
+# def get_video_duration(file_path):
+#     try:
+#         clip = VideoFileClip(file_path)
+#         duration = clip.duration
+#         clip.close()
+#         return duration
+#     except Exception as e:
+#         print(f"An error occurred: {e}")
+#         return None
+
+
 def split_video_into_frames(video, frame_dir):
     output_path = os.path.join(frame_dir, f"%03d.jpg")
     cmd = f"ffmpeg -i {video} -vf \"select=bitor(gte(t-prev_selected_t\,1)\,isnan(prev_selected_t))\" -vsync 0 -qmin 1 -q:v 1 {output_path}"
@@ -390,77 +403,6 @@ class Video2music:
 
     def generate(self, video, primer, key):
 
-        feature_dir = Path("./feature")
-        output_dir = Path("./output")
-        if feature_dir.exists():
-            shutil.rmtree(str(feature_dir))
-        if output_dir.exists():
-            shutil.rmtree(str(output_dir))
-
-        feature_dir.mkdir(parents=True)
-        output_dir.mkdir(parents=True)
-
-        frame_dir = feature_dir / "vevo_frame"
-
-        #video features
-        semantic_dir = feature_dir / "vevo_semantic"
-        emotion_dir = feature_dir / "vevo_emotion"
-        scene_dir = feature_dir / "vevo_scene"
-        scene_offset_dir = feature_dir / "vevo_scene_offset"
-        motion_dir = feature_dir / "vevo_motion"
-
-        frame_dir.mkdir(parents=True)
-        semantic_dir.mkdir(parents=True)
-        emotion_dir.mkdir(parents=True)
-        scene_dir.mkdir(parents=True)
-        scene_offset_dir.mkdir(parents=True)
-        motion_dir.mkdir(parents=True)
-
-        #music features
-        chord_dir = feature_dir / "vevo_chord"
-        loudness_dir = feature_dir / "vevo_loudness"
-        note_density_dir = feature_dir / "vevo_note_density"
-
-        chord_dir.mkdir(parents=True)
-        loudness_dir.mkdir(parents=True)
-        note_density_dir.mkdir(parents=True)
-
-        split_video_into_frames(video, frame_dir)
-        gen_semantic_feature(frame_dir, semantic_dir)
-        gen_emotion_feature(frame_dir, emotion_dir)
-        gen_scene_feature(video, scene_dir, frame_dir)
-        gen_scene_offset_feature(scene_dir, scene_offset_dir)
-        gen_motion_feature(video, motion_dir)
-
-        feature_scene_offset = get_scene_offset_feature(scene_offset_dir)
-        feature_motion = get_motion_feature(motion_dir)
-        feature_emotion = get_emotion_feature(emotion_dir)
-        feature_semantic = get_semantic_feature(semantic_dir)
-
-        # cuda
-        feature_scene_offset = feature_scene_offset.to(self.device)
-        feature_motion = feature_motion.to(self.device)
-        feature_emotion = feature_emotion.to(self.device)
-
-        feature_scene_offset = feature_scene_offset.unsqueeze(0)
-        feature_motion = feature_motion.unsqueeze(0)
-        feature_emotion = feature_emotion.unsqueeze(0)
-
-        feature_semantic = feature_semantic.to(self.device)
-        feature_semantic_list = []
-        feature_semantic = torch.unsqueeze(feature_semantic, 0)
-        feature_semantic_list.append( feature_semantic.to(self.device) )
-        #feature_semantic_list.append( feature_semantic )
-
-        if "major" in key:
-            feature_key = torch.tensor([0])
-            feature_key = feature_key.float()
-        elif "minor" in key:
-            feature_key = torch.tensor([1])
-            feature_key = feature_key.float()
-
-        feature_key = feature_key.to(self.device)
-
         with open('dataset/vevo_meta/chord.json') as json_file:
             chordDic = json.load(json_file)
         with open('dataset/vevo_meta/chord_inv.json') as json_file:
@@ -504,14 +446,30 @@ class Video2music:
                     pChord = pChord[0:type_idx] + ":maj6"
                 if pChord[type_idx+1:] == "M7":
                     pChord = pChord[0:type_idx] + ":maj7"
-                if pChord[type_idx+1:] == "":
+                if pChord[type_idx+1:] == "" or pChord[type_idx+1:] == "maj" or pChord[type_idx+1:] == "M":
                    pChord = pChord[0:type_idx]
 
            print("pchord is ", pChord)
-
-
+            if pChord not in chordDic:
+                raise gr.Error("Not Supported Chord Type!")
 
            chord_arr = pChord.split(":")
+
+            trans = traspose_key_dic[key]
+            trasindex = (chordList.index( chord_arr[0] ) - trans) % 12
+
+            if len(chord_arr) == 1:
+                pChordTrans = chordList[trasindex]
+            elif len(chord_arr) == 2:
+                pChordTrans = chordList[trasindex] + ":" + chord_arr[1]
+
+            print(pChordTrans)
+
+
+            chordID = chordDic[pChordTrans]
+            primerCID.append(chordID)
+            chord_arr = pChordTrans.split(":")
+
            if len(chord_arr) == 1:
                chordRootID = chordRootDic[chord_arr[0]]
                primerCID_root.append(chordRootID)
@@ -537,6 +495,84 @@ class Video2music:
         primerCID_attr = primerCID_attr.to(torch.long)
         primerCID_attr = primerCID_attr.to(self.device)
 
+        # duration = get_video_duration(video)
+
+        # if duration >= 300:
+        #     raise gr.Error("We only support duration of video less than 300 seconds")
+
+        feature_dir = Path("./feature")
+        output_dir = Path("./output")
+        if feature_dir.exists():
+            shutil.rmtree(str(feature_dir))
+        if output_dir.exists():
+            shutil.rmtree(str(output_dir))
+
+        feature_dir.mkdir(parents=True)
+        output_dir.mkdir(parents=True)
+
+        frame_dir = feature_dir / "vevo_frame"
+
+        #video features
+        semantic_dir = feature_dir / "vevo_semantic"
+        emotion_dir = feature_dir / "vevo_emotion"
+        scene_dir = feature_dir / "vevo_scene"
+        scene_offset_dir = feature_dir / "vevo_scene_offset"
+        motion_dir = feature_dir / "vevo_motion"
+
+        frame_dir.mkdir(parents=True)
+        semantic_dir.mkdir(parents=True)
+        emotion_dir.mkdir(parents=True)
+        scene_dir.mkdir(parents=True)
+        scene_offset_dir.mkdir(parents=True)
+        motion_dir.mkdir(parents=True)
+
+        #music features
+        chord_dir = feature_dir / "vevo_chord"
+        loudness_dir = feature_dir / "vevo_loudness"
+        note_density_dir = feature_dir / "vevo_note_density"
+
+        chord_dir.mkdir(parents=True)
+        loudness_dir.mkdir(parents=True)
+        note_density_dir.mkdir(parents=True)
+
+        split_video_into_frames(video, frame_dir)
+        gen_semantic_feature(frame_dir, semantic_dir)
+        gen_emotion_feature(frame_dir, emotion_dir)
+        gen_scene_feature(video, scene_dir, frame_dir)
+        gen_scene_offset_feature(scene_dir, scene_offset_dir)
+        gen_motion_feature(video, motion_dir)
+
+        feature_scene_offset = get_scene_offset_feature(scene_offset_dir)
+        feature_motion = get_motion_feature(motion_dir)
+        feature_emotion = get_emotion_feature(emotion_dir)
+        feature_semantic = get_semantic_feature(semantic_dir)
+
+        # cuda
+        feature_scene_offset = feature_scene_offset.to(self.device)
+        feature_motion = feature_motion.to(self.device)
+        feature_emotion = feature_emotion.to(self.device)
+
+        feature_scene_offset = feature_scene_offset.unsqueeze(0)
+        feature_motion = feature_motion.unsqueeze(0)
+        feature_emotion = feature_emotion.unsqueeze(0)
+
+        feature_semantic = feature_semantic.to(self.device)
+        feature_semantic_list = []
+        feature_semantic = torch.unsqueeze(feature_semantic, 0)
+        feature_semantic_list.append( feature_semantic.to(self.device) )
+        #feature_semantic_list.append( feature_semantic )
+
+        if "major" in key:
+            feature_key = torch.tensor([0])
+            feature_key = feature_key.float()
+        elif "minor" in key:
+            feature_key = torch.tensor([1])
+            feature_key = feature_key.float()
+
+        feature_key = feature_key.to(self.device)
+
+
+
         # self.model.eval()
         # self.modelReg.eval()
 
@@ -616,6 +652,7 @@ class Video2music:
         midi_chords = voice(midi_chords_orginal)
         trans = traspose_key_dic[key]
 
+
         for i, chord in enumerate(midi_chords):
             if densitylist[i] == 0:
                 if len(chord) >= 4:
@@ -727,6 +764,9 @@ def gradio_generate2(input_youtube, input_primer, input_key):
     youtube_dir.mkdir(parents=True)
 
     yObject = YouTube(input_youtube)
+    if yObject.length >= 300:
+        raise gr.Error("We only support duration of video less than 300 seconds")
+
     yObject_stream = yObject.streams.get_by_resolution("240p")
     fname = yObject.video_id +".mp4"
     if yObject_stream == None:
@@ -813,11 +853,11 @@ with gr.Blocks(css=css) as demo:
     # with gr.Column(visible=True) as colA:
     with gr.Column(visible=True) as rowA:
         with gr.Row():
-            input_video = gr.Video(label="Input Video")
+            input_video = gr.Video(label="Input Video", max_length=299)
         with gr.Row():
            with gr.Row():
-                input_primer = gr.Textbox(label="Input Primer", value="C Am F G")
-                input_key = gr.Dropdown(choices=
+                input_primer = gr.Textbox(label="Input Primer", value="C Am F G", info="Supported types: dim, sus4, min7(m7), min(m), sus2, aug, dim7, maj6(M6), hdim7, 7, min6(m6), maj7(M7)")
+                input_key = gr.Dropdown(choices=all_key_names, value="C major", label="Input Key")
        with gr.Row():
            btn = gr.Button("Generate")
 
@@ -826,8 +866,8 @@ with gr.Blocks(css=css) as demo:
            input_video_yt = gr.Textbox(label="YouTube URL")
        with gr.Row():
            with gr.Row():
-                input_primer_yt = gr.Textbox(label="Input Primer", value="C Am F G")
-                input_key_yt = gr.Dropdown(choices=
+                input_primer_yt = gr.Textbox(label="Input Primer", value="C Am F G", info="Supported types: dim, sus4, min7(m7), min(m), sus2, aug, dim7, maj6(M6), hdim7, 7, min6(m6), maj7(M7)")
+                input_key_yt = gr.Dropdown(choices=all_key_names, value="C major", label="Input Key")
        with gr.Row():
            btn_yt = gr.Button("Generate")
 
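For context, here is a minimal standalone sketch of the primer-chord handling this commit adds to generate(): the normalized chord is first checked against the chord vocabulary, then its root is shifted by the key's transposition offset modulo 12. The function name transpose_primer_chord, the toy_vocab dictionary, and the use of ValueError are illustrative stand-ins; in app.py the vocabulary comes from dataset/vevo_meta/chord.json, the offset from traspose_key_dic[key], and the rejection is raised as gr.Error.

# Illustrative sketch, not the app's exact code path.
chordList = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']

def transpose_primer_chord(pChord, trans, chord_vocab):
    # Reject chord types the vocabulary does not cover (gr.Error in app.py).
    if pChord not in chord_vocab:
        raise ValueError("Not Supported Chord Type!")
    chord_arr = pChord.split(":")
    # Shift the root down by `trans` semitones, wrapping around the 12-tone cycle.
    trasindex = (chordList.index(chord_arr[0]) - trans) % 12
    if len(chord_arr) == 1:
        return chordList[trasindex]
    return chordList[trasindex] + ":" + chord_arr[1]

# Example: with a hypothetical offset of 2, "A:min7" maps to "G:min7".
toy_vocab = {"A:min7": 0, "G:min7": 1}
print(transpose_primer_chord("A:min7", 2, toy_vocab))  # -> G:min7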
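Similarly, a hedged sketch of the new 300-second gate on YouTube input: pytube exposes the video duration in seconds via YouTube.length, and the commit rejects anything at or above 300 seconds before fetching the 240p stream (uploaded files get the equivalent cap via gr.Video(max_length=299)). The helper name and constant below are illustrative, not from app.py.

# Sketch of the duration gate, assuming pytube's YouTube.length is in seconds.
from pytube import YouTube

MAX_DURATION_SEC = 300

def fetch_stream_if_short(url):
    yObject = YouTube(url)
    if yObject.length >= MAX_DURATION_SEC:
        # app.py raises gr.Error here so the message surfaces in the Gradio UI
        raise ValueError("We only support duration of video less than 300 seconds")
    return yObject.streams.get_by_resolution("240p")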