# File size: 18,307 bytes (revision 4c65bff)
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sequence feature extraction class for common feature extractors to preprocess sequences.
"""
from typing import Dict, List, Optional, Union
import numpy as np
from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy
# Module-level logger, named after this module, obtained from the package's `logging` utilities.
logger = logging.get_logger(__name__)
class SequenceFeatureExtractor(FeatureExtractionMixin):
    """
    This is a general feature extraction class for speech recognition.

    Args:
        feature_size (`int`):
            The feature dimension of the extracted features.
        sampling_rate (`int`):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`):
            The value that is used to fill the padding values / vectors.
    """

    def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs):
        self.feature_size = feature_size
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value

        # Optional settings are taken out of `kwargs` before the remainder is handed to the parent class.
        self.padding_side = kwargs.pop("padding_side", "right")
        self.return_attention_mask = kwargs.pop("return_attention_mask", True)

        super().__init__(**kwargs)

    def pad(
        self,
        processed_features: Union[
            BatchFeature,
            List[BatchFeature],
            Dict[str, BatchFeature],
            Dict[str, List[BatchFeature]],
            List[Dict[str, BatchFeature]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """
        Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the
        max sequence length in the batch.

        Padding side (left/right) padding values are defined at the feature extractor level (with `self.padding_side`,
        `self.padding_value`)

        <Tip>

        If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`):
                Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of
                input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str,
                List[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

                Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                see the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length`. A `ValueError` is
                  raised if `max_length` is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: The truncated / padded features, built with `tensor_type=return_tensors`. When the main
            input is empty, `processed_features` itself is returned unchanged (with an empty `attention_mask` added if
            one was requested).

        Raises:
            `ValueError`: If the main model input is missing from `processed_features`, if the type of the inputs
            cannot be recognized, or if the entries of `processed_features` do not share the same batch size.
        """
        main_input_name = self.model_input_names[0]

        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
            processed_features = {
                key: [example[key] for example in processed_features] for key in processed_features[0].keys()
            }

        # The model's main input name, usually `input_values`, has to be passed for padding
        if main_input_name not in processed_features:
            raise ValueError(
                "You should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature`"
                f" to this method that includes {self.model_input_names[0]}, but you provided"
                f" {list(processed_features.keys())}"
            )

        required_input = processed_features[main_input_name]
        if return_attention_mask is None:
            return_attention_mask = self.return_attention_mask

        # `len(...) == 0` rather than truthiness: `required_input` may be an array-like object.
        if len(required_input) == 0:
            if return_attention_mask:
                processed_features["attention_mask"] = []
            return processed_features

        # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch
        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            # The scan is bounded by the batch: if every sequence is empty, `first_element` stays the (empty)
            # list/tuple instead of indexing past the end of `required_input`.
            for candidate in required_input:
                if len(candidate) != 0:
                    first_element = candidate[0]
                    break

        if return_tensors is None:
            if is_tf_tensor(first_element):
                return_tensors = "tf"
            elif is_torch_tensor(first_element):
                return_tensors = "pt"
            elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
                return_tensors = "np"
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    "Should be one of a python, numpy, pytorch or tensorflow object."
                )

        # Existing keys are only re-assigned here, so updating the mapping while iterating over it is safe.
        for key, value in processed_features.items():
            if isinstance(value[0], (int, float)):
                processed_features[key] = to_numpy(value)
            else:
                processed_features[key] = [to_numpy(v) for v in value]

        # Convert padding_strategy in PaddingStrategy
        padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)

        required_input = processed_features[main_input_name]

        batch_size = len(required_input)
        if not all(len(v) == batch_size for v in processed_features.values()):
            raise ValueError("Some items in the output dictionary have a different batch size than others.")

        # truncation: every example is truncated individually before any padding length is computed
        truncated_inputs = [
            self._truncate(
                {k: v[i] for k, v in processed_features.items()},
                max_length=max_length,
                pad_to_multiple_of=pad_to_multiple_of,
                truncation=truncation,
            )
            for i in range(batch_size)
        ]

        if padding_strategy == PaddingStrategy.LONGEST:
            # make sure that `max_length` cannot be longer than the longest truncated length
            max_length = max(len(input_slice[main_input_name]) for input_slice in truncated_inputs)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for truncated_input in truncated_inputs:
            # padding
            outputs = self._pad(
                truncated_input,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                # Double precision values are down-cast to single precision. dtypes are compared by equality,
                # which does not depend on both sides being the very same dtype object.
                if value.dtype == np.dtype(np.float64):
                    value = value.astype(np.float32)
                batch_outputs.setdefault(key, []).append(value)

        return BatchFeature(batch_outputs, tensor_type=return_tensors)

    def _pad(
        self,
        processed_features: Union[Dict[str, np.ndarray], BatchFeature],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad inputs (on left/right and up to predefined length or max length in the batch)

        `processed_features` is modified in place and returned.

        Args:
            processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
                of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see below)
            padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
                PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the length of the given input (the batch-level longest length is
                  expected to be resolved by the caller)
                - PaddingStrategy.MAX_LENGTH: Pad to the max length
                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
                The feature_extractor padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of (`int`, *optional*):
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Set to False to avoid returning attention mask (default: set to model specifics)

        Raises:
            `ValueError`: If padding is needed and `self.padding_side` is neither `"left"` nor `"right"`.
        """
        main_input_name = self.model_input_names[0]
        required_input = processed_features[main_input_name]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round `max_length` up to the next multiple of `pad_to_multiple_of`
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) < max_length

        # An attention mask that is already present is kept; otherwise every existing position is marked with 1
        if return_attention_mask and "attention_mask" not in processed_features:
            processed_features["attention_mask"] = np.ones(len(required_input), dtype=np.int32)

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            # (before, after) amount of padding along the sequence axis
            if self.padding_side == "right":
                sequence_pad_width = (0, difference)
            elif self.padding_side == "left":
                sequence_pad_width = (difference, 0)
            else:
                raise ValueError("Invalid padding side: " + str(self.padding_side))

            if return_attention_mask:
                # `np.pad` pads with zeros by default, so padded positions are masked out
                processed_features["attention_mask"] = np.pad(
                    processed_features["attention_mask"], sequence_pad_width
                )

            # With `feature_size > 1` a second axis is present and is left unpadded
            padding_shape = (sequence_pad_width, (0, 0)) if self.feature_size > 1 else sequence_pad_width
            processed_features[main_input_name] = np.pad(
                required_input, padding_shape, "constant", constant_values=self.padding_value
            )

        return processed_features

    def _truncate(
        self,
        processed_features: Union[Dict[str, np.ndarray], BatchFeature],
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        truncation: Optional[bool] = None,
    ):
        """
        Truncate inputs to predefined length or max length in the batch

        `processed_features` is modified in place and returned.

        Args:
            processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
                of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                maximum length of the returned list and optionally padding length (see below)
            pad_to_multiple_of (`int`, *optional*) :
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.

        Raises:
            `ValueError`: If `truncation` is set but `max_length` is `None`.
        """
        if not truncation:
            return processed_features
        if max_length is None:
            raise ValueError("When setting ``truncation=True``, make sure that ``max_length`` is defined.")

        main_input_name = self.model_input_names[0]
        required_input = processed_features[main_input_name]

        # find `max_length` that fits `pad_to_multiple_of`
        # NOTE: the length is rounded *up*, so the kept length can exceed the requested `max_length`.
        if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_truncated = len(required_input) > max_length

        if needs_to_be_truncated:
            processed_features[main_input_name] = required_input[:max_length]
            if "attention_mask" in processed_features:
                processed_features["attention_mask"] = processed_features["attention_mask"][:max_length]

        return processed_features

    def _get_padding_strategies(self, padding=False, max_length=None):
        """
        Find the correct padding strategy

        Args:
            padding (`bool`, `str` or `PaddingStrategy`, defaults to `False`):
                `False` maps to `PaddingStrategy.DO_NOT_PAD`, `True` maps to `PaddingStrategy.LONGEST`, a
                `PaddingStrategy` is used as is and any other value is passed to the `PaddingStrategy` constructor.
            max_length (`int`, *optional*):
                Only checked for presence: it must be defined when the strategy is `PaddingStrategy.MAX_LENGTH`.

        Returns:
            `PaddingStrategy`: The resolved padding strategy.

        Raises:
            `ValueError`: If the strategy is `PaddingStrategy.MAX_LENGTH` but `max_length` is `None`, or if padding is
            requested while `self.padding_value` is `None`.
        """
        # Get padding strategy
        if padding is False:
            padding_strategy = PaddingStrategy.DO_NOT_PAD
        elif padding is True:
            padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
        elif isinstance(padding, PaddingStrategy):
            padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy(padding)

        # Set max length if needed
        if max_length is None and padding_strategy == PaddingStrategy.MAX_LENGTH:
            raise ValueError(
                f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined"
            )

        # Test if we have a padding value
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None):
            raise ValueError(
                "Asking to pad but the feature_extractor does not have a padding value. Please select a value to use"
                " as `padding_value`. For example: `feature_extractor.padding_value = 0.0`."
            )

        return padding_strategy
|