|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from ctypes import c_float, sizeof |
|
from enum import Enum |
|
from typing import TYPE_CHECKING, Optional, Union |
|
|
|
|
|
if TYPE_CHECKING: |
|
from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer |
|
|
|
|
|
class ParameterFormat(Enum):
    """
    Storage format used to serialize a model's parameters.

    Each member's value is the ctypes type the parameter is stored as,
    which lets `size` derive the per-parameter byte width directly.
    """

    Float = c_float

    @property
    def size(self) -> int:
        """
        Number of bytes required to store one parameter of this data type.

        Returns:
            Integer > 0
        """
        return sizeof(self.value)
|
|
|
|
|
def compute_effective_axis_dimension(dimension: int, fixed_dimension: int, num_token_to_add: int = 0) -> int:
    """
    Resolve the concrete size to use for an axis when generating dummy inputs.

    If `dimension` is not a usable value (<= 0, e.g. a dynamic-axis placeholder),
    it is replaced by `fixed_dimension`. The number of special tokens that the
    tokenizer will add is then subtracted, so that after tokenization the axis
    ends up at the requested size.

    Args:
        dimension: Requested axis size; any value <= 0 means "use the fallback".
        fixed_dimension: Fallback size used when `dimension` <= 0.
        num_token_to_add: Number of tokens (e.g. special tokens) to subtract
            from the resolved size. Defaults to 0.

    Returns:
        The effective axis dimension (resolved size minus `num_token_to_add`).
    """
    # <= 0 is treated as "unspecified / dynamic": fall back to the fixed size.
    if dimension <= 0:
        dimension = fixed_dimension

    dimension -= num_token_to_add
    return dimension
|
|
|
|
|
def compute_serialized_parameters_size(num_parameters: int, dtype: ParameterFormat) -> int: |
|
""" |
|
Compute the size taken by all the parameters in the given the storage format when serializing the model |
|
|
|
Args: |
|
num_parameters: Number of parameters to be saved |
|
dtype: The data format each parameter will be saved |
|
|
|
Returns: |
|
Size (in byte) taken to save all the parameters |
|
""" |
|
return num_parameters * dtype.size |
|
|
|
|
|
def get_preprocessor(model_name: str) -> Optional[Union["AutoTokenizer", "AutoFeatureExtractor", "AutoProcessor"]]:
    """
    Gets a preprocessor (tokenizer, feature extractor or processor) that is available for `model_name`.

    Args:
        model_name (`str`): Name of the model for which a preprocessor are loaded.

    Returns:
        `Optional[Union[AutoTokenizer, AutoFeatureExtractor, AutoProcessor]]`:
            If a processor is found, it is returned. Otherwise, if a tokenizer or a feature extractor exists, it is
            returned. If both a tokenizer and a feature extractor exist, an error is raised. The function returns
            `None` if no preprocessor is found.
    """
    # Imported lazily here (rather than at module top level) — presumably to
    # avoid a circular import with the package root; verify before moving.
    from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer

    # A full processor wins outright when one is available.
    try:
        return AutoProcessor.from_pretrained(model_name)
    except (ValueError, OSError, KeyError):
        pass

    def _load_or_none(auto_cls):
        # Best-effort load: a missing artifact is expected, not an error.
        try:
            return auto_cls.from_pretrained(model_name)
        except (OSError, KeyError):
            return None

    tokenizer = _load_or_none(AutoTokenizer)
    feature_extractor = _load_or_none(AutoFeatureExtractor)

    if tokenizer is not None and feature_extractor is not None:
        # Ambiguous: both kinds exist and neither can be preferred automatically.
        raise ValueError(
            f"Couldn't auto-detect preprocessor for {model_name}. Found both a tokenizer and a feature extractor."
        )
    if tokenizer is None and feature_extractor is None:
        return None
    return tokenizer if tokenizer is not None else feature_extractor
|
|