mart9992's picture
m
4c65bff
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.from typing import List, Union
from typing import List, Union
from ..utils import is_torch_available
from .base import Pipeline
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan
DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"
class TextToAudioPipeline(Pipeline):
"""
Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
pipeline generates an audio file from an input text and optional other conditional inputs.
Example:
```python
>>> from transformers import pipeline
>>> pipe = pipeline(model="suno/bark-small")
>>> output = pipe("Hey it's HuggingFace on the phone!")
>>> audio = output["audio"]
>>> sampling_rate = output["sampling_rate"]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
`"text-to-audio"`.
See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
"""
def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
super().__init__(*args, **kwargs)
if self.framework == "tf":
raise ValueError("The TextToAudioPipeline is only available in PyTorch.")
self.vocoder = None
if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values():
self.vocoder = (
SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device)
if vocoder is None
else vocoder
)
self.sampling_rate = sampling_rate
if self.vocoder is not None:
self.sampling_rate = self.vocoder.config.sampling_rate
if self.sampling_rate is None:
# get sampling_rate from config and generation config
config = self.model.config
gen_config = self.model.__dict__.get("generation_config", None)
if gen_config is not None:
config.update(gen_config.to_dict())
for sampling_rate_name in ["sample_rate", "sampling_rate"]:
sampling_rate = getattr(config, sampling_rate_name, None)
if sampling_rate is not None:
self.sampling_rate = sampling_rate
def preprocess(self, text, **kwargs):
if isinstance(text, str):
text = [text]
if self.model.config.model_type == "bark":
# bark Tokenizer is called with BarkProcessor which uses those kwargs
new_kwargs = {
"max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
"add_special_tokens": False,
"return_attention_mask": True,
"return_token_type_ids": False,
"padding": "max_length",
}
# priority is given to kwargs
new_kwargs.update(kwargs)
kwargs = new_kwargs
output = self.tokenizer(text, **kwargs, return_tensors="pt")
return output
def _forward(self, model_inputs, **kwargs):
# we expect some kwargs to be additional tensors which need to be on the right device
kwargs = self._ensure_tensor_on_device(kwargs, device=self.device)
if self.model.can_generate():
output = self.model.generate(**model_inputs, **kwargs)
else:
output = self.model(**model_inputs, **kwargs)[0]
if self.vocoder is not None:
# in that case, the output is a spectrogram that needs to be converted into a waveform
output = self.vocoder(output)
return output
def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
"""
Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.
Args:
text_inputs (`str` or `List[str]`):
The text(s) to generate.
forward_params (*optional*):
Parameters passed to the model generation/forward method.
Return:
A `dict` or a list of `dict`: The dictionaries have two keys:
- **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
- **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
"""
return super().__call__(text_inputs, **forward_params)
def _sanitize_parameters(
self,
preprocess_params=None,
forward_params=None,
):
if preprocess_params is None:
preprocess_params = {}
if forward_params is None:
forward_params = {}
postprocess_params = {}
return preprocess_params, forward_params, postprocess_params
def postprocess(self, waveform):
output_dict = {}
output_dict["audio"] = waveform.cpu().float().numpy()
output_dict["sampling_rate"] = self.sampling_rate
return output_dict