import binascii
import os

import gradio as gr
import librosa
import numpy as np
import pretty_midi
import torch
import yt_dlp
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

from utils import cli_to_api, mp3_write, normalize

yt_video_dir = "./yt_dir"
outputs_dir = "./midi_wav_outputs"
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(yt_video_dir, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
composers = list(model.generation_config.composer_to_feature_token.keys())


def get_audio_from_yt_video(yt_link: str):
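    """Download the audio track of a YouTube video as an .mp3 via yt-dlp; the path is returned twice to fill both the preview player and the upload slot."""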
    filename = binascii.hexlify(os.urandom(8)).decode() + ".mp3"
    filename = os.path.join(yt_video_dir, filename)
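    # cli_to_api (from the local utils module) translates yt-dlp CLI flags into API options.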
    yt_opt = cli_to_api(
        [
            "--extract-audio",
            "--audio-format",
            "mp3",
            "--restrict-filenames",
            "-o",
            filename,
        ]
    )
    with yt_dlp.YoutubeDL(yt_opt) as ydl:
        ydl.download([yt_link])

    return filename, filename


def inference(file_uploaded, composer):
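    """Generate a piano-cover MIDI of the uploaded audio with Pop2Piano, conditioned on the selected composer token."""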
    # sr=None preserves the file's native sampling rate, but this can cause silent issues where the
    # generated output is not up to the desired quality. If that happens, consider switching sr to 44100 Hz.
    pop_y, sr = librosa.load(file_uploaded, sr=None)

    inputs = processor(audio=pop_y, sampling_rate=sr, return_tensors="pt").to(device)
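    # Generate the piano-cover token sequence conditioned on the selected composer.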
    model_output = model.generate(input_features=inputs["input_features"], composer=composer)
    tokenizer_output = processor.batch_decode(
        token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
    )["pretty_midi_objects"]

    return prepare_output_file(tokenizer_output, sr, pop_y)


def prepare_output_file(tokenizer_output: list[pretty_midi.PrettyMIDI], sr: int, pop_y: np.ndarray):
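    """Write the generated MIDI to disk, render it to .mp3 with FluidSynth, and build a stereo mix against the original audio."""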
    # Add some random values so that no two file names are the same
    output_file_name = "p2p_" + binascii.hexlify(os.urandom(8)).decode()
    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")

    # write the .mid file and render it to an .mp3
    tokenizer_output[0].write(midi_output)
    midi_y: np.ndarray = tokenizer_output[0].fluidsynth(sr)
    midi_y_path: str = midi_output.replace(".mid", ".mp3")
    mp3_write(midi_y_path, sr, normalize(midi_y), normalized=True)

    # stack stereo audio
    if len(pop_y) > len(midi_y):
        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
    elif len(pop_y) < len(midi_y):
        pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
    stereo = np.stack((midi_y, pop_y * 0.5))

    # write stereo audio
    stereo_path = midi_output.replace(".mid", ".mix.mp3")
    mp3_write(stereo_path, sr, normalize(stereo.T), normalized=True)

    return midi_y_path, midi_y_path, midi_output, stereo_path, stereo_path


block = gr.Blocks()

with block:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 400px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Pop2Piano
                </h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                A demo for Pop2Piano: Pop Audio-based Piano Cover Generation.<br>
                Select a composer (arranger), then upload a pop audio file or enter a YouTube link, and click Generate.
              </p>
            </div>
        """
    )
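    # Input controls: upload an audio file or download one from a YouTube link, then pick the arranger.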
    with gr.Group():
        with gr.Column():
            with gr.Blocks() as audio_select:
                with gr.Tab("Upload Audio"):
                    file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path = gr.Audio(
                        label="Audio Extracted from the YouTube Video", interactive=False
                    )
                    yt_btn.click(
                        get_audio_from_yt_video,
                        inputs=[yt_link],
                        outputs=[yt_audio_path, file_uploaded],
                    )
            with gr.Column():
                composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
                generate_btn = gr.Button("Generate")

    with gr.Group():
        gr.HTML(
            """
            <div><h3><center>Listen to the generated MIDI.</center></h3></div>
            """
        )
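        # Playback and download widgets for the generated piano cover and the stereo mix.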
        with gr.Row(equal_height=True):
            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")

        with gr.Row():
            stereo_mix2 = gr.File(label="Download the Stereo Mix (.mp3)")
            wav_output2 = gr.File(label="Download the Generated MIDI (.mp3)")
            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
            generate_btn.click(
                inference,
                inputs=[file_uploaded, composer],
                outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
            )

    with gr.Group():
        gr.Examples(
            [
                ["./examples/custom_song.mp3", "composer1"],
            ],
            fn=inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
            cache_examples=True,
        )

        gr.HTML(
            """
        <div class="footer">
                    <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
                    | <a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">Hugging Face Model Docs</a>
                    | <a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">GitHub</a>
                    </p></center>
        </div>
        """
        )

block.launch(debug=False)