File size: 8,396 Bytes
06ba6ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
# Copyright 2023 The HuggingFace Team. All rights reserved.
import datetime
import platform
import subprocess
from typing import Optional, Tuple, Union
import numpy as np
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
"""
Helper function to read an audio file through ffmpeg.
"""
ar = f"{sampling_rate}"
ac = "1"
format_for_conversion = "f32le"
ffmpeg_command = [
"ffmpeg",
"-i",
"pipe:0",
"-ac",
ac,
"-ar",
ar,
"-f",
format_for_conversion,
"-hide_banner",
"-loglevel",
"quiet",
"pipe:1",
]
try:
with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
output_stream = ffmpeg_process.communicate(bpayload)
except FileNotFoundError as error:
raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
out_bytes = output_stream[0]
audio = np.frombuffer(out_bytes, np.float32)
if audio.shape[0] == 0:
raise ValueError(
"Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
"a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
"URL, ensure that the URL is the full address to **download** the audio file."
)
return audio
def ffmpeg_microphone(
sampling_rate: int,
chunk_length_s: float,
format_for_conversion: str = "f32le",
):
"""
Helper function ro read raw microphone data.
"""
ar = f"{sampling_rate}"
ac = "1"
if format_for_conversion == "s16le":
size_of_sample = 2
elif format_for_conversion == "f32le":
size_of_sample = 4
else:
raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
system = platform.system()
if system == "Linux":
format_ = "alsa"
input_ = "default"
elif system == "Darwin":
format_ = "avfoundation"
input_ = ":0"
elif system == "Windows":
format_ = "dshow"
input_ = "default"
ffmpeg_command = [
"ffmpeg",
"-f",
format_,
"-i",
input_,
"-ac",
ac,
"-ar",
ar,
"-f",
format_for_conversion,
"-fflags",
"nobuffer",
"-hide_banner",
"-loglevel",
"quiet",
"pipe:1",
]
chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
for item in iterator:
yield item
def ffmpeg_microphone_live(
sampling_rate: int,
chunk_length_s: float,
stream_chunk_s: Optional[int] = None,
stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
format_for_conversion: str = "f32le",
):
"""
Helper function to read audio from the microphone file through ffmpeg. This will output `partial` overlapping
chunks starting from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of
striding to avoid errors on the "sides" of the various chunks.
Arguments:
sampling_rate (`int`):
The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
avoid resampling later.
chunk_length_s (`float` or `int`):
The length of the maximum chunk of audio to be sent returned. This includes the eventual striding.
stream_chunk_s (`float` or `int`)
The length of the minimal temporary audio to be returned.
stride_length_s (`float` or `int` or `(float, float)`, *optional*, defaults to `None`)
The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
an audio sample but without using that part to actually make the prediction. Setting this does not change
the length of the chunk.
format_for_conversion (`str`, defalts to `f32le`)
The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
could also be used.
Return:
A generator yielding dictionaries of the following form
`{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionnally a `"stride" (int, int)` key if
`stride_length_s` is defined.
`stride` and `raw` are all expressed in `samples`, and `partial` is a boolean saying if the current yield item
is a whole chunk, or a partial temporary result to be later replaced by another larger chunk.
"""
if stream_chunk_s is not None:
chunk_s = stream_chunk_s
else:
chunk_s = chunk_length_s
microphone = ffmpeg_microphone(sampling_rate, chunk_s, format_for_conversion=format_for_conversion)
if format_for_conversion == "s16le":
dtype = np.int16
size_of_sample = 2
elif format_for_conversion == "f32le":
dtype = np.float32
size_of_sample = 4
else:
raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
if stride_length_s is None:
stride_length_s = chunk_length_s / 6
chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
if isinstance(stride_length_s, (int, float)):
stride_length_s = [stride_length_s, stride_length_s]
stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample
audio_time = datetime.datetime.now()
delta = datetime.timedelta(seconds=chunk_s)
for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
# Put everything back in numpy scale
item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
item["stride"] = (
item["stride"][0] // size_of_sample,
item["stride"][1] // size_of_sample,
)
item["sampling_rate"] = sampling_rate
audio_time += delta
if datetime.datetime.now() > audio_time + 10 * delta:
# We're late !! SKIP
continue
yield item
def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
"""
Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunks to
get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
"""
acc = b""
stride_left, stride_right = stride
if stride_left + stride_right >= chunk_len:
raise ValueError(
f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
)
_stride_left = 0
for raw in iterator:
acc += raw
if stream and len(acc) < chunk_len:
stride = (_stride_left, 0)
yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
else:
while len(acc) >= chunk_len:
# We are flushing the accumulator
stride = (_stride_left, stride_right)
item = {"raw": acc[:chunk_len], "stride": stride}
if stream:
item["partial"] = False
yield item
_stride_left = stride_left
acc = acc[chunk_len - stride_left - stride_right :]
# Last chunk
if len(acc) > stride_left:
item = {"raw": acc, "stride": (_stride_left, 0)}
if stream:
item["partial"] = False
yield item
def _ffmpeg_stream(ffmpeg_command, buflen: int):
"""
Internal function to create the generator of data through ffmpeg
"""
bufsize = 2**24 # 16Mo
try:
with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
while True:
raw = ffmpeg_process.stdout.read(buflen)
if raw == b"":
break
yield raw
except FileNotFoundError as error:
raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error
|