""" BARK model generation configuration""" |
|
|
|
import copy |
|
from typing import Dict |
|
|
|
from ...generation.configuration_utils import GenerationConfig |
|
from ...utils import logging |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class BarkSemanticGenerationConfig(GenerationConfig): |
|
model_type = "semantic" |
|
|
|
def __init__( |
|
self, |
|
eos_token_id=10_000, |
|
renormalize_logits=True, |
|
max_new_tokens=768, |
|
output_scores=False, |
|
return_dict_in_generate=False, |
|
output_hidden_states=False, |
|
output_attentions=False, |
|
temperature=1.0, |
|
do_sample=False, |
|
text_encoding_offset=10_048, |
|
text_pad_token=129_595, |
|
semantic_infer_token=129_599, |
|
semantic_vocab_size=10_000, |
|
max_input_semantic_length=256, |
|
semantic_rate_hz=49.9, |
|
**kwargs, |
|
): |
        """Class that holds a generation configuration for [`BarkSemanticModel`].

        This configuration inherits from [`GenerationConfig`] and can be used to control model generation. Read the
        documentation from [`GenerationConfig`] for more information.

        Args:
            eos_token_id (`int`, *optional*, defaults to 10_000):
                The id of the *end-of-sequence* token.
            renormalize_logits (`bool`, *optional*, defaults to `True`):
                Whether to renormalize the logits after applying all the logits processors or warpers (including the
                custom ones). It's highly recommended to set this flag to `True` as the search algorithms assume the
                score logits are normalized, but some logit processors or warpers break the normalization.
            max_new_tokens (`int`, *optional*, defaults to 768):
                The maximum number of tokens to generate, ignoring the number of tokens in the prompt.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            temperature (`float`, *optional*, defaults to 1.0):
                The value used to modulate the next token probabilities.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether or not to use sampling; use greedy decoding otherwise.
            text_encoding_offset (`int`, *optional*, defaults to 10_048):
                Offset added to the encoded text token ids.
            text_pad_token (`int`, *optional*, defaults to 129_595):
                Id of the token used to pad the encoded text input.
            semantic_infer_token (`int`, *optional*, defaults to 129_599):
                Id of the special token that marks the start of semantic inference.
            semantic_vocab_size (`int`, *optional*, defaults to 10_000):
                Size of the semantic vocabulary.
            max_input_semantic_length (`int`, *optional*, defaults to 256):
                Max length of the semantic input vector.
            semantic_rate_hz (`float`, *optional*, defaults to 49.9):
                Semantic rate in Hertz.
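
        Example (a minimal sketch; the import path below assumes this module lives at
        `transformers.models.bark.generation_configuration_bark` and is not part of the public top-level API):

        ```python
        >>> from transformers.models.bark.generation_configuration_bark import BarkSemanticGenerationConfig

        >>> # Enable sampling with a lower temperature; unspecified fields keep the defaults documented above
        >>> semantic_config = BarkSemanticGenerationConfig(do_sample=True, temperature=0.7)
        >>> semantic_config.semantic_vocab_size
        10000
        ```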
        """
        super().__init__(
            temperature=temperature,
            do_sample=do_sample,
            eos_token_id=eos_token_id,
            renormalize_logits=renormalize_logits,
            max_new_tokens=max_new_tokens,
            output_scores=output_scores,
            return_dict_in_generate=return_dict_in_generate,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            **kwargs,
        )

        self.text_encoding_offset = text_encoding_offset
        self.text_pad_token = text_pad_token
        self.semantic_pad_token = eos_token_id
        self.semantic_infer_token = semantic_infer_token
        self.semantic_vocab_size = semantic_vocab_size
        self.max_input_semantic_length = max_input_semantic_length
        self.semantic_rate_hz = semantic_rate_hz


class BarkCoarseGenerationConfig(GenerationConfig):
    model_type = "coarse_acoustics"

    def __init__(
        self,
        renormalize_logits=True,
        output_scores=False,
        return_dict_in_generate=False,
        output_hidden_states=False,
        output_attentions=False,
        temperature=1.0,
        do_sample=False,
        coarse_semantic_pad_token=12_048,
        coarse_rate_hz=75,
        n_coarse_codebooks=2,
        coarse_infer_token=12_050,
        max_coarse_input_length=256,
        max_coarse_history: int = 630,
        sliding_window_len: int = 60,
        **kwargs,
    ):
"""Class that holds a generation configuration for [`BarkCoarseModel`]. |
|
|
|
This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the |
|
documentation from [`GenerationConfig`] for more information. |
|
|
|
Args: |
|
renormalize_logits (`bool`, *optional*, defaults to `True`): |
|
Whether to renormalize the logits after applying all the logits processors or warpers (including the |
|
custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the |
|
score logits are normalized but some logit processors or warpers break the normalization. |
|
output_scores (`bool`, *optional*, defaults to `False`): |
|
Whether or not to return the prediction scores. See `scores` under returned tensors for more details. |
|
return_dict_in_generate (`bool`, *optional*, defaults to `False`): |
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. |
|
output_hidden_states (`bool`, *optional*, defaults to `False`): |
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors |
|
for more details. |
|
output_attentions (`bool`, *optional*, defaults to `False`): |
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under |
|
returned tensors for more details. |
|
temperature (`float`, *optional*, defaults to 1.0): |
|
The value used to modulate the next token probabilities. |
|
do_sample (`bool`, *optional*, defaults to `False`): |
|
Whether or not to use sampling ; use greedy decoding otherwise. |
|
coarse_semantic_pad_token (`int`, *optional*, defaults to 12_048): |
|
Coarse semantic pad token. |
|
coarse_rate_hz (`int`, *optional*, defaults to 75): |
|
Coarse rate in Hertz. |
|
n_coarse_codebooks (`int`, *optional*, defaults to 2): |
|
Number of coarse codebooks. |
|
coarse_infer_token (`int`, *optional*, defaults to 12_050): |
|
Coarse infer token. |
|
max_coarse_input_length (`int`, *optional*, defaults to 256): |
|
Max length of input coarse vector. |
|
max_coarse_history (`int`, *optional*, defaults to 630): |
|
Max length of the output of the coarse acoustics model used in the fine generation step. |
|
sliding_window_len (`int`, *optional*, defaults to 60): |
|
The coarse generation step uses a sliding window to generate raw audio. |
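
        Example (a minimal sketch; assumes the same hypothetical module path as above):

        ```python
        >>> from transformers.models.bark.generation_configuration_bark import BarkCoarseGenerationConfig

        >>> # By default the coarse stage produces 75 tokens per second across 2 codebooks
        >>> coarse_config = BarkCoarseGenerationConfig()
        >>> coarse_config.coarse_rate_hz, coarse_config.n_coarse_codebooks
        (75, 2)
        ```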
        """
        super().__init__(
            temperature=temperature,
            do_sample=do_sample,
            renormalize_logits=renormalize_logits,
            output_scores=output_scores,
            return_dict_in_generate=return_dict_in_generate,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            **kwargs,
        )

        self.coarse_semantic_pad_token = coarse_semantic_pad_token
        self.coarse_rate_hz = coarse_rate_hz
        self.n_coarse_codebooks = n_coarse_codebooks
        self.coarse_infer_token = coarse_infer_token
        self.max_coarse_input_length = max_coarse_input_length
        self.max_coarse_history = max_coarse_history
        self.sliding_window_len = sliding_window_len


class BarkFineGenerationConfig(GenerationConfig):
    model_type = "fine_acoustics"

    def __init__(
        self,
        temperature=1.0,
        max_fine_history_length=512,
        max_fine_input_length=1024,
        n_fine_codebooks=8,
        **kwargs,
    ):
"""Class that holds a generation configuration for [`BarkFineModel`]. |
|
|
|
[`BarkFineModel`] is an autoencoder model, so should not usually be used for generation. However, under the |
|
hood, it uses `temperature` when used by [`BarkModel`] |
|
|
|
This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the |
|
documentation from [`GenerationConfig`] for more information. |
|
|
|
Args: |
|
temperature (`float`, *optional*): |
|
The value used to modulate the next token probabilities. |
|
max_fine_history_length (`int`, *optional*, defaults to 512): |
|
Max length of the fine history vector. |
|
max_fine_input_length (`int`, *optional*, defaults to 1024): |
|
Max length of fine input vector. |
|
n_fine_codebooks (`int`, *optional*, defaults to 8): |
|
Number of codebooks used. |
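
        Example (a minimal sketch; assumes the same hypothetical module path as above):

        ```python
        >>> from transformers.models.bark.generation_configuration_bark import BarkFineGenerationConfig

        >>> # Only `temperature` is forwarded to the parent `GenerationConfig`
        >>> fine_config = BarkFineGenerationConfig(temperature=0.5)
        >>> fine_config.n_fine_codebooks
        8
        ```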
        """
        super().__init__(temperature=temperature)

        self.max_fine_history_length = max_fine_history_length
        self.max_fine_input_length = max_fine_input_length
        self.n_fine_codebooks = n_fine_codebooks

    def validate(self, **kwargs):
        """
        Overrides [`GenerationConfig.validate`] because [`BarkFineGenerationConfig`] doesn't use any parameters
        other than `temperature`.
        """
        pass


class BarkGenerationConfig(GenerationConfig):
    model_type = "bark"
    is_composition = True

    def __init__(
        self,
        semantic_config: Dict = None,
        coarse_acoustics_config: Dict = None,
        fine_acoustics_config: Dict = None,
        sample_rate=24_000,
        codebook_size=1024,
        **kwargs,
    ):
"""Class that holds a generation configuration for [`BarkModel`]. |
|
|
|
The [`BarkModel`] does not have a `generate` method, but uses this class to generate speeches with a nested |
|
[`BarkGenerationConfig`] which uses [`BarkSemanticGenerationConfig`], [`BarkCoarseGenerationConfig`], |
|
[`BarkFineGenerationConfig`]. |
|
|
|
This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the |
|
documentation from [`GenerationConfig`] for more information. |
|
|
|
Args: |
|
semantic_config (`Dict`, *optional*): |
|
Semantic generation configuration. |
|
coarse_acoustics_config (`Dict`, *optional*): |
|
Coarse generation configuration. |
|
fine_acoustics_config (`Dict`, *optional*): |
|
Fine generation configuration. |
|
sample_rate (`int`, *optional*, defaults to 24_000): |
|
Sample rate. |
|
codebook_size (`int`, *optional*, defaults to 1024): |
|
Vector length for each codebook. |
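
        Example (a minimal sketch; assumes the same hypothetical module path as above):

        ```python
        >>> from transformers.models.bark.generation_configuration_bark import BarkGenerationConfig

        >>> # Sub-model configurations are passed as plain dicts and instantiated internally
        >>> generation_config = BarkGenerationConfig(semantic_config={"temperature": 0.7})
        >>> generation_config.sample_rate
        24000
        >>> generation_config.semantic_config.temperature
        0.7
        ```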
        """
        if semantic_config is None:
            semantic_config = {}
            logger.info("semantic_config is None. initializing the semantic model with default values.")

        if coarse_acoustics_config is None:
            coarse_acoustics_config = {}
            logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.")

        if fine_acoustics_config is None:
            fine_acoustics_config = {}
            logger.info("fine_acoustics_config is None. initializing the fine model with default values.")

        self.semantic_config = BarkSemanticGenerationConfig(**semantic_config)
        self.coarse_acoustics_config = BarkCoarseGenerationConfig(**coarse_acoustics_config)
        self.fine_acoustics_config = BarkFineGenerationConfig(**fine_acoustics_config)

        self.sample_rate = sample_rate
        self.codebook_size = codebook_size

    @classmethod
    def from_sub_model_configs(
        cls,
        semantic_config: BarkSemanticGenerationConfig,
        coarse_acoustics_config: BarkCoarseGenerationConfig,
        fine_acoustics_config: BarkFineGenerationConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`BarkGenerationConfig`] (or a derived class) from Bark sub-model generation configurations.

        Returns:
            [`BarkGenerationConfig`]: An instance of a configuration object
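
        Example (a minimal sketch; assumes the config classes defined in this module are in scope and uses their
        default values):

        ```python
        >>> config = BarkGenerationConfig.from_sub_model_configs(
        ...     BarkSemanticGenerationConfig(), BarkCoarseGenerationConfig(), BarkFineGenerationConfig()
        ... )
        >>> config.codebook_size
        1024
        ```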
        """
        return cls(
            semantic_config=semantic_config.to_dict(),
            coarse_acoustics_config=coarse_acoustics_config.to_dict(),
            fine_acoustics_config=fine_acoustics_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
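
        Example (a minimal sketch; assumes a default-initialized [`BarkGenerationConfig`]):

        ```python
        >>> config_dict = BarkGenerationConfig().to_dict()
        >>> config_dict["model_type"]
        'bark'
        ```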
        """
        output = copy.deepcopy(self.__dict__)

        output["semantic_config"] = self.semantic_config.to_dict()
        output["coarse_acoustics_config"] = self.coarse_acoustics_config.to_dict()
        output["fine_acoustics_config"] = self.fine_acoustics_config.to_dict()

        output["model_type"] = self.__class__.model_type
        return output