import os

import cv2
import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Load the tune recognition model
model = tf.keras.models.load_model('embdmodel_1.hdf5')
embedding_model = model.layers[2]
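# embedding_model (layers[2] of the loaded siamese network) is assumed to map a
# 150x150 spectrogram image to a fixed-length vector; songs are matched further
# below by comparing these vectors with L2 distance.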

DURATION = 10
WAVE_OUTPUT_FILE = "my_audio.wav"


# Define a function to preprocess the input audio.
# Convert the clip to a mel spectrogram image, since the siamese network
# does not operate on raw audio directly.
def create_spectrogram(clip, sample_rate, save_path):
    plt.interactive(False)
    fig = plt.figure(figsize=[0.72, 0.72])
    S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
    fig.savefig(save_path, dpi=400, bbox_inches='tight', pad_inches=0)
    # Close the figure explicitly so repeated calls don't leak matplotlib memory
    fig.clf()
    plt.close(fig)
    plt.close('all')
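# Example usage (hypothetical filename): render roughly the first 10 seconds of
# a clip (220500 samples at librosa's default 22050 Hz) to a spectrogram image:
#   clip, sr = librosa.load('some_song.wav')
#   create_spectrogram(clip[0:220500], sr, 'some_song.png')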
    
def load_img(path):
    # Read the saved spectrogram image and resize it to the 150x150 RGB input
    # used by the embedding model
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (150, 150))
    return img


import pickle
with open('dict.pickle', 'rb') as handle:
    songspecdict = pickle.load(handle)
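# dict.pickle is assumed to map reference songs, keyed as "Artist-Title.ext", to
# lists of embeddings precomputed with the same embedding_model. A minimal sketch
# of how such a dictionary could be built offline (the directory name and the
# one-embedding-per-song layout are assumptions, not part of the original app):
def build_song_embedding_dict(song_dir='seismese_net_songs', out_path='dict.pickle'):
    songdict = {}
    for fname in os.listdir(song_dir):
        clip, sr = librosa.load(os.path.join(song_dir, fname))
        create_spectrogram(clip[0:220500], sr, 'ref.png')
        img = np.expand_dims(load_img('ref.png'), axis=0)
        songdict[fname] = [embedding_model.predict(img)]
    with open(out_path, 'wb') as f:
        pickle.dump(songdict, f)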


def list_file_sizes():
    path = "."

    # Get the list of regular files in the given directory
    files_list = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

    # Print each file along with its size in MB
    for f in files_list:
        size = os.stat(os.path.join(path, f)).st_size
        print("{} : {}MB".format(f, round(size / (1024 * 1024), 3)))



def main(audio):

    # Gradio is configured with type="filepath", so `audio` is the path of the
    # recorded clip; copy its contents to WAVE_OUTPUT_FILE before processing.
    with open(audio, "rb") as src, open(WAVE_OUTPUT_FILE, "wb") as dst:
        dst.write(src.read())

    list_file_sizes()

    # Load the song to match
    song, sr = librosa.load(WAVE_OUTPUT_FILE)
    to_match = np.copy(song[0:220500])
    print("Loaded data into librosa...")

    # Create a spectrogram image of the song to match
    create_spectrogram(to_match, sr, 'test.png')
    print("Created spectrogram...")

    # Load the spectrogram image of the song to match
    to_match_img = load_img('test.png')
    to_match_img = np.expand_dims(to_match_img, axis=0)
    print("Loaded spectrogram image...")

    # Get the embedding of the song to match
    to_match_emb = embedding_model.predict(to_match_img)
    print("Got song embedding...")

    # Calculate the distances between the song to match and the songs in the database
    songsdistdict = {}
    for key, values in songspecdict.items():
        dist_array = []
        for embd in values:
            dist_array.append(np.linalg.norm(to_match_emb - embd))

        songsdistdict[key] = min(dist_array)
    song_titles = list(songsdistdict.keys())
    distances = list(songsdistdict.values())
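    # The nearest match is the dictionary key with the smallest embedding
    # distance; keys are assumed to be named "Artist-Title.ext" so the split
    # below recovers artist and title.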

    # Get the title and artist of the recognized song
    recognized_song_artist, recognized_song_title = song_titles[distances.index(min(distances))].split('-')
    recognized_song_title = os.path.splitext(recognized_song_title)[0]
    print(f'Artist: {recognized_song_artist}')
    print(f'Title: {recognized_song_title}')

    from musixmatch import Musixmatch

    # Initialize Musixmatch API
    musixmatch = Musixmatch(apikey='2b0d0615efa782e95598a0e99bda4a60')

    # Search for the recognized song
    track_search_results = musixmatch.track_search(q_track=recognized_song_title, q_artist=recognized_song_artist, page_size=1, page=1, s_track_rating='desc')

    if track_search_results['message']['header']['status_code'] == 200:
        # Get the track ID for the top result
        track_id = track_search_results['message']['body']['track_list'][0]['track']['track_id']

        # Get the lyrics for the recognized song
        lyrics_result = musixmatch.track_lyrics_get(track_id=track_id)

        if lyrics_result['message']['header']['status_code'] == 200:
            # Get the lyrics
            lyrics = lyrics_result['message']['body']['lyrics']['lyrics_body']
            # Remove the annotation tags from the lyrics
            lyrics = lyrics.replace('******* This Lyrics is NOT for Commercial use *******', '').strip()
            print("Lyrics:\n", lyrics)
    else:
        print("Couldn't find lyrics for the recognized song.")



    # Play the recognized song. This assumes the reference audio files are
    # available locally in the seismese_net_songs/ directory of this Space;
    # the huggingface.co "tree" URL cannot be opened directly as a file.
    recognized_song_file = os.path.join(
        'seismese_net_songs', song_titles[distances.index(min(distances))])

    with open(recognized_song_file, 'rb') as audio_file:
        audio_bytes = audio_file.read()

    return audio_bytes


css = """
footer {display:none !important}
.output-markdown{display:none !important}
button.primary {
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important; 
    background: none rgb(17, 20, 45) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: none !important;
}
button.primary:hover{
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important;
    background: none rgb(37, 56, 133) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}
button.gallery-item:hover {
    border-color: rgb(37 56 133) !important;
    background-color: rgb(229,225,255) !important;
}
"""


mf_transcribe = gr.Interface(
    fn=main,
    inputs=gr.inputs.Audio(source="microphone", type="filepath"),
    outputs="audio",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
    css=css,
)
mf_transcribe.launch()
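# Example (assumption, for local testing without the Gradio UI): call main()
# directly with the path of a short recording and save the returned bytes:
#   matched_bytes = main("my_recording.wav")
#   with open("match.wav", "wb") as f:
#       f.write(matched_bytes)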