import binascii
import os

import gradio as gr
import librosa
import numpy as np
import pretty_midi
import torch
import yt_dlp
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

from utils import cli_to_api, mp3_write, normalize

yt_video_dir = "./yt_dir"
outputs_dir = "./midi_wav_outputs"
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(yt_video_dir, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
composers = list(model.generation_config.composer_to_feature_token.keys())


def get_audio_from_yt_video(yt_link: str):
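    """Download the audio track of a YouTube video as an .mp3 via yt-dlp; the path is returned twice to fill both the preview player and the upload slot."""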
    filename = binascii.hexlify(os.urandom(8)).decode() + ".mp3"
    filename = os.path.join(yt_video_dir, filename)
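    # cli_to_api (from the local utils module) translates yt-dlp CLI flags into API options.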
    yt_opt = cli_to_api(
        [
            "--extract-audio",
            "--audio-format",
            "mp3",
            "--restrict-filenames",
            "-o",
            filename,
        ]
    )
    with yt_dlp.YoutubeDL(yt_opt) as ydl:
        ydl.download([yt_link])

    return filename, filename


def inference(file_uploaded, composer):
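    """Generate a piano-cover MIDI of the uploaded audio with Pop2Piano, conditioned on the selected composer token."""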
    # sr=None preserves the file's native sampling rate, but this can cause silent issues where the
    # generated output is not up to the desired quality. If that happens, consider switching sr to 44100 Hz.
    pop_y, sr = librosa.load(file_uploaded, sr=None)

    inputs = processor(audio=pop_y, sampling_rate=sr, return_tensors="pt").to(device)
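    # Generate the piano-cover token sequence conditioned on the selected composer.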
    model_output = model.generate(input_features=inputs["input_features"], composer=composer)
    tokenizer_output = processor.batch_decode(
        token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
    )["pretty_midi_objects"]

    return prepare_output_file(tokenizer_output, sr, pop_y)


def prepare_output_file(tokenizer_output: list[pretty_midi.PrettyMIDI], sr: int, pop_y: np.ndarray):
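    """Write the generated MIDI to disk, render it to .mp3 with FluidSynth, and build a stereo mix against the original audio."""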
    # Add some random values so that no two file names are the same
    output_file_name = "p2p_" + binascii.hexlify(os.urandom(8)).decode()
    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")

    # write the .mid file and render it to an .mp3
    tokenizer_output[0].write(midi_output)
    midi_y: np.ndarray = tokenizer_output[0].fluidsynth(sr)
    midi_y_path: str = midi_output.replace(".mid", ".mp3")
    mp3_write(midi_y_path, sr, normalize(midi_y), normalized=True)

    # stack stereo audio
    if len(pop_y) > len(midi_y):
        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
    elif len(pop_y) < len(midi_y):
        pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
    stereo = np.stack((midi_y, pop_y * 0.5))

    # write stereo audio
    stereo_path = midi_output.replace(".mid", ".mix.mp3")
    mp3_write(stereo_path, sr, normalize(stereo.T), normalized=True)

    return midi_y_path, midi_y_path, midi_output, stereo_path, stereo_path


block = gr.Blocks()

with block:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 400px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Pop2Piano
                </h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                A demo for Pop2Piano: Pop Audio-based Piano Cover Generation.<br>
                Select a composer (arranger), then upload a pop audio file or enter a YouTube link, and click Generate.
              </p>
            </div>
        """
    )
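    # Input controls: upload an audio file or download one from a YouTube link, then pick the arranger.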
    with gr.Group():
        with gr.Column():
            with gr.Blocks() as audio_select:
                with gr.Tab("Upload Audio"):
                    file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path = gr.Audio(
                        label="Audio Extracted from the YouTube Video", interactive=False
                    )
                    yt_btn.click(
                        get_audio_from_yt_video,
                        inputs=[yt_link],
                        outputs=[yt_audio_path, file_uploaded],
                    )
            with gr.Column():
                composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
                generate_btn = gr.Button("Generate")

    with gr.Group():
        gr.HTML(
            """
            <div><h3><center>Listen to the generated MIDI.</center></h3></div>
            """
        )
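        # Playback and download widgets for the generated piano cover and the stereo mix.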
        with gr.Row(equal_height=True):
            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")

        with gr.Row():
            stereo_mix2 = gr.File(label="Download the Stereo Mix (.mp3)")
            wav_output2 = gr.File(label="Download the Generated MIDI (.mp3)")
            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
            generate_btn.click(
                inference,
                inputs=[file_uploaded, composer],
                outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
            )

    with gr.Group():
        gr.Examples(
            [
                ["./examples/custom_song.mp3", "composer1"],
            ],
            fn=inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
            cache_examples=True,
        )

        gr.HTML(
            """
        <div class="footer">
                    <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
                    | <a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">Hugging Face Model Docs</a>
                    | <a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">GitHub</a>
                    </p></center>
        </div>
        """
        )

block.launch(debug=False)