Spaces:
Runtime error
Runtime error
add n_iter to Mel and update test_mel notebook
Browse files- audiodiffusion/mel.py +20 -17
- notebooks/test_mel.ipynb +7 -1
audiodiffusion/mel.py
CHANGED
@@ -9,15 +9,14 @@ from PIL import Image
|
|
9 |
|
10 |
class Mel:
|
11 |
|
12 |
-
def __init__(
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
):
|
21 |
"""Class to convert audio to mel spectrograms and vice versa.
|
22 |
|
23 |
Args:
|
@@ -27,6 +26,7 @@ class Mel:
|
|
27 |
n_fft (int): number of Fast Fourier Transforms
|
28 |
hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
|
29 |
top_db (int): loudest in decibels
|
|
|
30 |
"""
|
31 |
self.x_res = x_res
|
32 |
self.y_res = y_res
|
@@ -36,6 +36,7 @@ class Mel:
|
|
36 |
self.n_mels = self.y_res
|
37 |
self.slice_size = self.x_res * self.hop_length - 1
|
38 |
self.top_db = top_db
|
|
|
39 |
self.audio = None
|
40 |
|
41 |
def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
|
@@ -94,13 +95,11 @@ class Mel:
|
|
94 |
Returns:
|
95 |
PIL Image: grayscale image of x_res x y_res
|
96 |
"""
|
97 |
-
S = librosa.feature.melspectrogram(
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
n_mels=self.n_mels
|
103 |
-
)
|
104 |
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
|
105 |
bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
|
106 |
0.5).astype(np.uint8)
|
@@ -121,5 +120,9 @@ class Mel:
|
|
121 |
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
|
122 |
S = librosa.db_to_power(log_S)
|
123 |
audio = librosa.feature.inverse.mel_to_audio(
|
124 |
-
S,
|
|
|
|
|
|
|
|
|
125 |
return audio
|
|
|
9 |
|
10 |
class Mel:
|
11 |
|
12 |
+
def __init__(self,
|
13 |
+
x_res: int = 256,
|
14 |
+
y_res: int = 256,
|
15 |
+
sample_rate: int = 22050,
|
16 |
+
n_fft: int = 2048,
|
17 |
+
hop_length: int = 512,
|
18 |
+
top_db: int = 80,
|
19 |
+
n_iter: int = 32):
|
|
|
20 |
"""Class to convert audio to mel spectrograms and vice versa.
|
21 |
|
22 |
Args:
|
|
|
26 |
n_fft (int): number of Fast Fourier Transforms
|
27 |
hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
|
28 |
top_db (int): loudest in decibels
|
29 |
+
n_iter (int): number of iterations for Griffin-Lim mel inversion
|
30 |
"""
|
31 |
self.x_res = x_res
|
32 |
self.y_res = y_res
|
|
|
36 |
self.n_mels = self.y_res
|
37 |
self.slice_size = self.x_res * self.hop_length - 1
|
38 |
self.top_db = top_db
|
39 |
+
self.n_iter = n_iter
|
40 |
self.audio = None
|
41 |
|
42 |
def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
|
|
|
95 |
Returns:
|
96 |
PIL Image: grayscale image of x_res x y_res
|
97 |
"""
|
98 |
+
S = librosa.feature.melspectrogram(y=self.get_audio_slice(slice),
|
99 |
+
sr=self.sr,
|
100 |
+
n_fft=self.n_fft,
|
101 |
+
hop_length=self.hop_length,
|
102 |
+
n_mels=self.n_mels)
|
|
|
|
|
103 |
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
|
104 |
bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
|
105 |
0.5).astype(np.uint8)
|
|
|
120 |
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
|
121 |
S = librosa.db_to_power(log_S)
|
122 |
audio = librosa.feature.inverse.mel_to_audio(
|
123 |
+
S,
|
124 |
+
sr=self.sr,
|
125 |
+
n_fft=self.n_fft,
|
126 |
+
hop_length=self.hop_length,
|
127 |
+
n_iter=self.n_iter)
|
128 |
return audio
|
notebooks/test_mel.ipynb
CHANGED
@@ -41,7 +41,13 @@
|
|
41 |
"metadata": {},
|
42 |
"outputs": [],
|
43 |
"source": [
|
44 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
]
|
46 |
},
|
47 |
{
|
|
|
41 |
"metadata": {},
|
42 |
"outputs": [],
|
43 |
"source": [
|
44 |
+
"# These are the default parameters. If you change any of them, you may have to adjust the others.\n",
|
45 |
+
"mel = Mel(x_res=256,\n",
|
46 |
+
" y_res=256,\n",
|
47 |
+
" hop_length=512,\n",
|
48 |
+
" sample_rate=22050,\n",
|
49 |
+
" n_fft=2048,\n",
|
50 |
+
" n_iter=32)"
|
51 |
]
|
52 |
},
|
53 |
{
|