|
"""TraVisionLM configuration"""
|
|
|
|
from transformers import PretrainedConfig
|
|
from transformers import logging, CONFIG_MAPPING
|
|
import warnings
|
|
|
|
logger = logging.get_logger(__name__)
|
|
|
|
class TraVisionLMConfig(PretrainedConfig):
    """Configuration class for the TraVisionLM vision-language model.

    Pairs a SigLIP vision encoder configuration with a GPT-2 text decoder
    configuration, plus the projection and special-token settings the
    multimodal wrapper needs.

    Args:
        vision_config (`dict` or vision config, *optional*):
            Configuration of the vision backbone. A `dict` is materialized
            through `CONFIG_MAPPING` (defaulting to `"siglip_vision_model"`
            when no `"model_type"` key is present); `None` builds a default
            SigLIP config (16x16 patches on 256x256 images).
        text_config (`dict` or text config, *optional*):
            Configuration of the text backbone. A `dict` is materialized
            through `CONFIG_MAPPING` (defaulting to `"gpt2"`); `None` builds
            a default GPT-2 config (36 layers, 1280 hidden size).
        ignore_index (`int`, *optional*, defaults to -100):
            Label index ignored by the loss function.
        image_token_idx (`int`, *optional*, defaults to 50257):
            Id of the placeholder token marking image positions; stored as
            `image_token_index`.
        vocab_size (`int`, *optional*, defaults to 51282):
            Vocabulary size. Deprecated on this class — prefer
            `text_config.vocab_size`.
        projection_dim (`int`, *optional*, defaults to 768):
            Dimension of the multimodal projection space; also written onto
            `vision_config.projection_dim`.
        hidden_size (`int`, *optional*, defaults to 1280):
            Hidden size of the language model.
    """

    model_type = "travisionlm"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_idx=50257,
        vocab_size=51282,
        projection_dim=768,
        hidden_size=1280,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_idx
        self.projection_dim = projection_dim
        self.hidden_size = hidden_size
        self.is_encoder_decoder = False

        self.vision_config = vision_config
        if isinstance(self.vision_config, dict):
            # Default to SigLIP when the dict does not name a model type.
            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            # Default SigLIP vision tower: base size, 16x16 patches, 256x256 images.
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
                attention_dropout=0.0,
                hidden_act="gelu_pytorch_tanh",
                hidden_size=768,
                image_size=256,
                intermediate_size=3072,
                layer_norm_eps=1e-06,
                num_attention_heads=12,
                num_channels=3,
                num_hidden_layers=12,
                patch_size=16,
            )

        # Routed through the property setter below, which stores `_vocab_size`.
        self.vocab_size = vocab_size

        self.text_config = text_config
        if isinstance(self.text_config, dict):
            # Default to GPT-2 when the dict does not name a model type.
            text_config["model_type"] = text_config.get("model_type", "gpt2")
            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            # Default GPT-2 decoder: 36 layers, 20 heads, 1280-dim embeddings.
            self.text_config = CONFIG_MAPPING["gpt2"](
                activation_function="gelu_new",
                attn_pdrop=0.1,
                embd_pdrop=0.1,
                initializer_range=0.02,
                layer_norm_epsilon=1e-05,
                n_ctx=1024,
                n_embd=1280,
                n_head=20,
                n_layer=36,
                n_positions=1024,
                reorder_and_upcast_attn=False,
                resid_pdrop=0.1,
                scale_attn_by_inverse_layer_idx=False,
                scale_attn_weights=True,
                vocab_size=vocab_size,
            )

        # One image token per (non-overlapping) patch of the input image.
        self.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
        self.pad_token_id = self.text_config.pad_token_id
        self.vision_config.projection_dim = projection_dim
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Deprecated accessor kept for backward compatibility."""
        warnings.warn(
            "The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.",
            FutureWarning,
        )
        return self._vocab_size

    @vocab_size.setter
    def vocab_size(self, value):
        # Stores silently (no deprecation warning) so `__init__` can assign it.
        self._vocab_size = value

    def to_dict(self):
        """Serialize the config, hiding the private `_vocab_size` backing field."""
        output = super().to_dict()
        output.pop("_vocab_size", None)
        return output