""" TF 2.0 Cvt model.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
import collections.abc |
|
from dataclasses import dataclass |
|
from typing import Optional, Tuple, Union |
|
|
|
import tensorflow as tf |
|
|
|
from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention |
|
from ...modeling_tf_utils import ( |
|
TFModelInputType, |
|
TFPreTrainedModel, |
|
TFSequenceClassificationLoss, |
|
get_initializer, |
|
keras_serializable, |
|
unpack_inputs, |
|
) |
|
from ...tf_utils import shape_list, stable_softmax |
|
from ...utils import ( |
|
ModelOutput, |
|
add_start_docstrings, |
|
add_start_docstrings_to_model_forward, |
|
logging, |
|
replace_return_docstrings, |
|
) |
|
from .configuration_cvt import CvtConfig |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
_CONFIG_FOR_DOC = "CvtConfig" |
|
|
|
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [ |
|
"microsoft/cvt-13", |
|
"microsoft/cvt-13-384", |
|
"microsoft/cvt-13-384-22k", |
|
"microsoft/cvt-21", |
|
"microsoft/cvt-21-384", |
|
"microsoft/cvt-21-384-22k", |
|
]
|
|
|
|
|
@dataclass |
|
class TFBaseModelOutputWithCLSToken(ModelOutput): |
|
""" |
|
Base class for model's outputs. |
|
|
|
Args: |
|
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): |
|
Sequence of hidden-states at the output of the last layer of the model. |
|
cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`): |
|
Classification token at the output of the last layer of the model. |
|
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): |
|
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape |
|
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus |
|
the initial embedding outputs. |
|
""" |
|
|
|
last_hidden_state: tf.Tensor = None |
|
cls_token_value: tf.Tensor = None |
|
hidden_states: Tuple[tf.Tensor] | None = None |
|
|
|
|
|
class TFCvtDropPath(tf.keras.layers.Layer): |
|
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). |
|
References: |
|
        (1) https://github.com/rwightman/pytorch-image-models
|
""" |
|
|
|
def __init__(self, drop_prob: float, **kwargs): |
|
super().__init__(**kwargs) |
|
self.drop_prob = drop_prob |
|
|
|
def call(self, x: tf.Tensor, training=None): |
|
if self.drop_prob == 0.0 or not training: |
|
return x |
|
keep_prob = 1 - self.drop_prob |
|
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) |
|
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype) |
|
random_tensor = tf.floor(random_tensor) |
|
return (x / keep_prob) * random_tensor |
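
    # A minimal usage sketch (hypothetical shapes) of the behaviour at training time:
    #
    #     layer = TFCvtDropPath(drop_prob=0.1)
    #     x = tf.ones((4, 196, 64))      # (batch_size, num_tokens, num_channels)
    #     out = layer(x, training=True)  # each sample is dropped with probability 0.1;
    #                                    # kept samples are rescaled by 1 / 0.9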
|
|
|
|
|
class TFCvtEmbeddings(tf.keras.layers.Layer): |
|
"""Construct the Convolutional Token Embeddings.""" |
|
|
|
def __init__( |
|
self, |
|
config: CvtConfig, |
|
patch_size: int, |
|
embed_dim: int, |
|
stride: int, |
|
padding: int, |
|
dropout_rate: float, |
|
**kwargs, |
|
): |
|
super().__init__(**kwargs) |
|
self.convolution_embeddings = TFCvtConvEmbeddings( |
|
config, |
|
patch_size=patch_size, |
|
embed_dim=embed_dim, |
|
stride=stride, |
|
padding=padding, |
|
name="convolution_embeddings", |
|
) |
|
self.dropout = tf.keras.layers.Dropout(dropout_rate) |
|
|
|
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: |
|
hidden_state = self.convolution_embeddings(pixel_values) |
|
hidden_state = self.dropout(hidden_state, training=training) |
|
return hidden_state |
|
|
|
|
|
class TFCvtConvEmbeddings(tf.keras.layers.Layer): |
|
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.""" |
|
|
|
def __init__(self, config: CvtConfig, patch_size: int, embed_dim: int, stride: int, padding: int, **kwargs): |
|
super().__init__(**kwargs) |
|
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) |
|
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) |
|
self.projection = tf.keras.layers.Conv2D( |
|
filters=embed_dim, |
|
kernel_size=patch_size, |
|
strides=stride, |
|
padding="valid", |
|
data_format="channels_last", |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
name="projection", |
|
) |
|
|
|
self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") |
|
|
|
def call(self, pixel_values: tf.Tensor) -> tf.Tensor: |
|
if isinstance(pixel_values, dict): |
|
pixel_values = pixel_values["pixel_values"] |
|
|
|
        pixel_values = self.projection(self.padding(pixel_values))

        # rearrange "b h w c -> b (h w) c" so layer normalization runs over the flattened tokens
        batch_size, height, width, num_channels = shape_list(pixel_values)
        hidden_size = height * width
        pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
        pixel_values = self.normalization(pixel_values)

        # rearrange "b (h w) c -> b h w c"
        pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
|
return pixel_values |
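
    # Worked example (assuming the stage-0 defaults of `CvtConfig`: patch_size=7,
    # stride=4, padding=2) for a 224x224 input:
    #
    #     output_size = (224 + 2 * 2 - 7) // 4 + 1  # = 56
    #
    # i.e. the first stage turns the image into a 56x56 grid of token embeddings.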
|
|
|
|
|
class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): |
|
"""Convolutional projection layer.""" |
|
|
|
def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs): |
|
super().__init__(**kwargs) |
|
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) |
|
self.convolution = tf.keras.layers.Conv2D( |
|
filters=embed_dim, |
|
kernel_size=kernel_size, |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
padding="valid", |
|
strides=stride, |
|
use_bias=False, |
|
name="convolution", |
|
groups=embed_dim, |
|
) |
|
|
|
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") |
|
|
|
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: |
|
hidden_state = self.convolution(self.padding(hidden_state)) |
|
hidden_state = self.normalization(hidden_state, training=training) |
|
return hidden_state |
|
|
|
|
|
class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): |
|
"""Linear projection layer used to flatten tokens into 1D.""" |
|
|
|
def call(self, hidden_state: tf.Tensor) -> tf.Tensor: |
|
|
|
        # rearrange "b h w c -> b (h w) c"
        batch_size, height, width, num_channels = shape_list(hidden_state)
        hidden_size = height * width
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
|
return hidden_state |
|
|
|
|
|
class TFCvtSelfAttentionProjection(tf.keras.layers.Layer): |
|
"""Convolutional Projection for Attention.""" |
|
|
|
def __init__( |
|
self, |
|
config: CvtConfig, |
|
embed_dim: int, |
|
kernel_size: int, |
|
stride: int, |
|
padding: int, |
|
projection_method: str = "dw_bn", |
|
**kwargs, |
|
): |
|
super().__init__(**kwargs) |
|
if projection_method == "dw_bn": |
|
self.convolution_projection = TFCvtSelfAttentionConvProjection( |
|
config, embed_dim, kernel_size, stride, padding, name="convolution_projection" |
|
) |
|
self.linear_projection = TFCvtSelfAttentionLinearProjection() |
|
|
|
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: |
|
hidden_state = self.convolution_projection(hidden_state, training=training) |
|
hidden_state = self.linear_projection(hidden_state) |
|
return hidden_state |
|
|
|
|
|
class TFCvtSelfAttention(tf.keras.layers.Layer): |
|
""" |
|
    Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection) is applied to the
    query, key, and value embeddings.
|
""" |
|
|
|
def __init__( |
|
self, |
|
config: CvtConfig, |
|
num_heads: int, |
|
embed_dim: int, |
|
kernel_size: int, |
|
stride_q: int, |
|
stride_kv: int, |
|
padding_q: int, |
|
padding_kv: int, |
|
qkv_projection_method: str, |
|
qkv_bias: bool, |
|
attention_drop_rate: float, |
|
with_cls_token: bool = True, |
|
**kwargs, |
|
): |
|
super().__init__(**kwargs) |
|
self.scale = embed_dim**-0.5 |
|
self.with_cls_token = with_cls_token |
|
self.embed_dim = embed_dim |
|
self.num_heads = num_heads |
|
|
|
self.convolution_projection_query = TFCvtSelfAttentionProjection( |
|
config, |
|
embed_dim, |
|
kernel_size, |
|
stride_q, |
|
padding_q, |
|
projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method, |
|
name="convolution_projection_query", |
|
) |
|
self.convolution_projection_key = TFCvtSelfAttentionProjection( |
|
config, |
|
embed_dim, |
|
kernel_size, |
|
stride_kv, |
|
padding_kv, |
|
projection_method=qkv_projection_method, |
|
name="convolution_projection_key", |
|
) |
|
self.convolution_projection_value = TFCvtSelfAttentionProjection( |
|
config, |
|
embed_dim, |
|
kernel_size, |
|
stride_kv, |
|
padding_kv, |
|
projection_method=qkv_projection_method, |
|
name="convolution_projection_value", |
|
) |
|
|
|
self.projection_query = tf.keras.layers.Dense( |
|
units=embed_dim, |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
use_bias=qkv_bias, |
|
bias_initializer="zeros", |
|
name="projection_query", |
|
) |
|
self.projection_key = tf.keras.layers.Dense( |
|
units=embed_dim, |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
use_bias=qkv_bias, |
|
bias_initializer="zeros", |
|
name="projection_key", |
|
) |
|
self.projection_value = tf.keras.layers.Dense( |
|
units=embed_dim, |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
use_bias=qkv_bias, |
|
bias_initializer="zeros", |
|
name="projection_value", |
|
) |
|
self.dropout = tf.keras.layers.Dropout(attention_drop_rate) |
|
|
|
def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor: |
|
batch_size, hidden_size, _ = shape_list(hidden_state) |
|
head_dim = self.embed_dim // self.num_heads |
|
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim)) |
|
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) |
|
return hidden_state |
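
    # Shape sketch (hypothetical values, e.g. embed_dim=192 and num_heads=3):
    #
    #     (batch_size, num_tokens, 192)
    #         -> reshape   -> (batch_size, num_tokens, 3, 64)
    #         -> transpose -> (batch_size, 3, num_tokens, 64)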
|
|
|
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: |
|
if self.with_cls_token: |
|
cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1) |
|
|
|
|
|
        # rearrange "b (h w) c -> b h w c"
        batch_size, hidden_size, num_channels = shape_list(hidden_state)
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
|
|
|
key = self.convolution_projection_key(hidden_state, training=training) |
|
query = self.convolution_projection_query(hidden_state, training=training) |
|
value = self.convolution_projection_value(hidden_state, training=training) |
|
|
|
if self.with_cls_token: |
|
query = tf.concat((cls_token, query), axis=1) |
|
key = tf.concat((cls_token, key), axis=1) |
|
value = tf.concat((cls_token, value), axis=1) |
|
|
|
head_dim = self.embed_dim // self.num_heads |
|
|
|
query = self.rearrange_for_multi_head_attention(self.projection_query(query)) |
|
key = self.rearrange_for_multi_head_attention(self.projection_key(key)) |
|
value = self.rearrange_for_multi_head_attention(self.projection_value(value)) |
|
|
|
attention_score = tf.matmul(query, key, transpose_b=True) * self.scale |
|
attention_probs = stable_softmax(logits=attention_score, axis=-1) |
|
attention_probs = self.dropout(attention_probs, training=training) |
|
|
|
context = tf.matmul(attention_probs, value) |
|
|
|
        # rearrange "b h t d -> b t (h d)"
        _, _, hidden_size, _ = shape_list(context)
        context = tf.transpose(context, perm=(0, 2, 1, 3))
        context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
|
return context |
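
    # End-to-end shape sketch for one call (assuming with_cls_token=True and the
    # `CvtConfig` defaults stride_q=1, stride_kv=2, kernel_size=3, padding=1):
    #
    #     input:        (batch_size, 1 + h*w, c)
    #     query:        (batch_size, 1 + h*w, c)           # full-resolution grid
    #     key / value:  (batch_size, 1 + (h/2)*(w/2), c)   # grid subsampled by stride_kv
    #     output:       (batch_size, 1 + h*w, c)
    #
    # Subsampling the keys and values is what keeps the attention cost manageable
    # at high token counts.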
|
|
|
|
|
class TFCvtSelfOutput(tf.keras.layers.Layer): |
|
"""Output of the Attention layer .""" |
|
|
|
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs): |
|
super().__init__(**kwargs) |
|
self.dense = tf.keras.layers.Dense( |
|
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" |
|
) |
|
self.dropout = tf.keras.layers.Dropout(drop_rate) |
|
|
|
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: |
|
hidden_state = self.dense(inputs=hidden_state) |
|
hidden_state = self.dropout(inputs=hidden_state, training=training) |
|
return hidden_state |
|
|
|
|
|
class TFCvtAttention(tf.keras.layers.Layer): |
|
"""Attention layer. First chunk of the convolutional transformer block.""" |
|
|
|
def __init__( |
|
self, |
|
config: CvtConfig, |
|
num_heads: int, |
|
embed_dim: int, |
|
kernel_size: int, |
|
stride_q: int, |
|
stride_kv: int, |
|
padding_q: int, |
|
padding_kv: int, |
|
qkv_projection_method: str, |
|
qkv_bias: bool, |
|
attention_drop_rate: float, |
|
drop_rate: float, |
|
with_cls_token: bool = True, |
|
**kwargs, |
|
): |
|
super().__init__(**kwargs) |
|
self.attention = TFCvtSelfAttention( |
|
config, |
|
num_heads, |
|
embed_dim, |
|
kernel_size, |
|
stride_q, |
|
stride_kv, |
|
padding_q, |
|
padding_kv, |
|
qkv_projection_method, |
|
qkv_bias, |
|
attention_drop_rate, |
|
with_cls_token, |
|
name="attention", |
|
) |
|
self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output") |
|
|
|
def prune_heads(self, heads): |
|
raise NotImplementedError |
|
|
|
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False): |
|
self_output = self.attention(hidden_state, height, width, training=training) |
|
attention_output = self.dense_output(self_output, training=training) |
|
return attention_output |
|
|
|
|
|
class TFCvtIntermediate(tf.keras.layers.Layer): |
|
"""Intermediate dense layer. Second chunk of the convolutional transformer block.""" |
|
|
|
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs): |
|
super().__init__(**kwargs) |
|
self.dense = tf.keras.layers.Dense( |
|
units=int(embed_dim * mlp_ratio), |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
activation="gelu", |
|
name="dense", |
|
) |
|
|
|
def call(self, hidden_state: tf.Tensor) -> tf.Tensor: |
|
hidden_state = self.dense(hidden_state) |
|
return hidden_state |
|
|
|
|
|
class TFCvtOutput(tf.keras.layers.Layer): |
|
""" |
|
Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection. |
|
""" |
|
|
|
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs): |
|
super().__init__(**kwargs) |
|
self.dense = tf.keras.layers.Dense( |
|
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" |
|
) |
|
self.dropout = tf.keras.layers.Dropout(drop_rate) |
|
|
|
def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: |
|
hidden_state = self.dense(inputs=hidden_state) |
|
hidden_state = self.dropout(inputs=hidden_state, training=training) |
|
hidden_state = hidden_state + input_tensor |
|
return hidden_state |
|
|
|
|
|
class TFCvtLayer(tf.keras.layers.Layer): |
|
""" |
|
    Convolutional transformer block composed of attention layers, normalization and multi-layer perceptrons (MLPs). It
    consists of three chunks: an attention layer, an intermediate dense layer and an output layer. This corresponds to
    the `Block` class in the original implementation.
|
""" |
|
|
|
def __init__( |
|
self, |
|
config: CvtConfig, |
|
num_heads: int, |
|
embed_dim: int, |
|
kernel_size: int, |
|
stride_q: int, |
|
stride_kv: int, |
|
padding_q: int, |
|
padding_kv: int, |
|
qkv_projection_method: str, |
|
qkv_bias: bool, |
|
attention_drop_rate: float, |
|
drop_rate: float, |
|
mlp_ratio: float, |
|
drop_path_rate: float, |
|
with_cls_token: bool = True, |
|
**kwargs, |
|
): |
|
super().__init__(**kwargs) |
|
self.attention = TFCvtAttention( |
|
config, |
|
num_heads, |
|
embed_dim, |
|
kernel_size, |
|
stride_q, |
|
stride_kv, |
|
padding_q, |
|
padding_kv, |
|
qkv_projection_method, |
|
qkv_bias, |
|
attention_drop_rate, |
|
drop_rate, |
|
with_cls_token, |
|
name="attention", |
|
) |
|
self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate") |
|
self.dense_output = TFCvtOutput(config, embed_dim, drop_rate, name="output") |
|
|
|
self.drop_path = ( |
|
TFCvtDropPath(drop_path_rate, name="drop_path") |
|
if drop_path_rate > 0.0 |
|
else tf.keras.layers.Activation("linear", name="drop_path") |
|
) |
|
|
|
self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") |
|
self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") |
|
|
|
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: |
|
|
|
        # in Cvt, layernorm is applied before self-attention
        attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
        attention_output = self.drop_path(attention_output, training=training)

        # first residual connection
        hidden_state = attention_output + hidden_state

        # in Cvt, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_state)
        layer_output = self.intermediate(layer_output)

        # second residual connection (applied inside the dense output layer)
        layer_output = self.dense_output(layer_output, hidden_state)
        layer_output = self.drop_path(layer_output, training=training)
|
return layer_output |
|
|
|
|
|
class TFCvtStage(tf.keras.layers.Layer): |
|
""" |
|
    Cvt stage (encoder block). Each stage has two parts:
|
- (1) A Convolutional Token Embedding layer |
|
- (2) A Convolutional Transformer Block (layer). |
|
The classification token is added only in the last stage. |
|
|
|
Args: |
|
config ([`CvtConfig`]): Model configuration class. |
|
stage (`int`): Stage number. |
|
""" |
|
|
|
def __init__(self, config: CvtConfig, stage: int, **kwargs): |
|
super().__init__(**kwargs) |
|
self.config = config |
|
self.stage = stage |
|
        if self.config.cls_token[self.stage]:
            # Note: the explicit weight name matches the corresponding weight in the pretrained checkpoints
            self.cls_token = self.add_weight(
|
shape=(1, 1, self.config.embed_dim[-1]), |
|
initializer=get_initializer(self.config.initializer_range), |
|
trainable=True, |
|
name="cvt.encoder.stages.2.cls_token", |
|
) |
|
|
|
self.embedding = TFCvtEmbeddings( |
|
self.config, |
|
patch_size=config.patch_sizes[self.stage], |
|
stride=config.patch_stride[self.stage], |
|
embed_dim=config.embed_dim[self.stage], |
|
padding=config.patch_padding[self.stage], |
|
dropout_rate=config.drop_rate[self.stage], |
|
name="embedding", |
|
) |
|
|
|
        # Stochastic depth: the drop path rate increases linearly over the blocks of the stage
        drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
        drop_path_rates = [x.numpy().item() for x in drop_path_rates]
|
self.layers = [ |
|
TFCvtLayer( |
|
config, |
|
num_heads=config.num_heads[self.stage], |
|
embed_dim=config.embed_dim[self.stage], |
|
kernel_size=config.kernel_qkv[self.stage], |
|
stride_q=config.stride_q[self.stage], |
|
stride_kv=config.stride_kv[self.stage], |
|
padding_q=config.padding_q[self.stage], |
|
padding_kv=config.padding_kv[self.stage], |
|
qkv_projection_method=config.qkv_projection_method[self.stage], |
|
qkv_bias=config.qkv_bias[self.stage], |
|
attention_drop_rate=config.attention_drop_rate[self.stage], |
|
drop_rate=config.drop_rate[self.stage], |
|
mlp_ratio=config.mlp_ratio[self.stage], |
|
drop_path_rate=drop_path_rates[self.stage], |
|
with_cls_token=config.cls_token[self.stage], |
|
name=f"layers.{j}", |
|
) |
|
for j in range(config.depth[self.stage]) |
|
] |
|
|
|
def call(self, hidden_state: tf.Tensor, training: bool = False): |
|
cls_token = None |
|
        hidden_state = self.embedding(hidden_state, training=training)
|
|
|
|
|
        # rearrange "b h w c -> b (h w) c"
        batch_size, height, width, num_channels = shape_list(hidden_state)
        hidden_size = height * width
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
|
|
|
if self.config.cls_token[self.stage]: |
|
cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0) |
|
hidden_state = tf.concat((cls_token, hidden_state), axis=1) |
|
|
|
for layer in self.layers: |
|
layer_outputs = layer(hidden_state, height, width, training=training) |
|
hidden_state = layer_outputs |
|
|
|
if self.config.cls_token[self.stage]: |
|
cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1) |
|
|
|
|
|
        # rearrange "b (h w) c -> b h w c"
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
|
return hidden_state, cls_token |
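
    # Per-stage data flow (sketch): an NHWC feature map is re-embedded, flattened
    # into tokens (a cls token is prepended in the last stage only), run through
    # the transformer blocks and reshaped back into an NHWC map:
    #
    #     (b, h, w, c_in) -> embedding -> (b, h', w', c_out)
    #                     -> flatten   -> (b, [1 +] h'*w', c_out)
    #                     -> blocks    -> (b, [1 +] h'*w', c_out)
    #                     -> reshape   -> (b, h', w', c_out), cls_token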
|
|
|
|
|
class TFCvtEncoder(tf.keras.layers.Layer): |
|
""" |
|
    Convolutional Vision Transformer encoder. Cvt has 3 stages of encoder blocks; with the default configuration their
    respective number of layers (depth) is 1, 2 and 10.
|
|
|
Args: |
|
config ([`CvtConfig`]): Model configuration class. |
|
""" |
|
|
|
config_class = CvtConfig |
|
|
|
def __init__(self, config: CvtConfig, **kwargs): |
|
super().__init__(**kwargs) |
|
self.config = config |
|
self.stages = [ |
|
TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth)) |
|
] |
|
|
|
def call( |
|
self, |
|
pixel_values: TFModelInputType, |
|
output_hidden_states: Optional[bool] = False, |
|
return_dict: Optional[bool] = True, |
|
training: Optional[bool] = False, |
|
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: |
|
all_hidden_states = () if output_hidden_states else None |
|
hidden_state = pixel_values |
|
|
|
|
|
        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support the NCHW format,
        # so the input is converted from (batch_size, num_channels, height, width) to NHWC
        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
|
|
|
cls_token = None |
|
        for stage_module in self.stages:
|
hidden_state, cls_token = stage_module(hidden_state, training=training) |
|
if output_hidden_states: |
|
all_hidden_states = all_hidden_states + (hidden_state,) |
|
|
|
|
|
        # Change back to the NCHW format so the outputs match the original PyTorch implementation
        hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
        if output_hidden_states:
            all_hidden_states = tuple(tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states)
|
|
|
if not return_dict: |
|
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) |
|
|
|
return TFBaseModelOutputWithCLSToken( |
|
last_hidden_state=hidden_state, |
|
cls_token_value=cls_token, |
|
hidden_states=all_hidden_states, |
|
) |
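
    # Shape walkthrough (assuming a 224x224 input and the `CvtConfig` defaults
    # embed_dim=[64, 192, 384], patch_stride=[4, 2, 2]):
    #
    #     pixel_values:  (batch_size, 3, 224, 224)   # NCHW in
    #     after stage 0: (batch_size, 56, 56, 64)
    #     after stage 1: (batch_size, 28, 28, 192)
    #     after stage 2: (batch_size, 14, 14, 384)   # plus a (batch_size, 1, 384) cls token
    #     returned as:   (batch_size, 384, 14, 14)   # transposed back to NCHW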
|
|
|
|
|
@keras_serializable |
|
class TFCvtMainLayer(tf.keras.layers.Layer): |
|
"""Construct the Cvt model.""" |
|
|
|
config_class = CvtConfig |
|
|
|
def __init__(self, config: CvtConfig, **kwargs): |
|
super().__init__(**kwargs) |
|
self.config = config |
|
self.encoder = TFCvtEncoder(config, name="encoder") |
|
|
|
@unpack_inputs |
|
def call( |
|
self, |
|
pixel_values: TFModelInputType | None = None, |
|
output_hidden_states: Optional[bool] = None, |
|
return_dict: Optional[bool] = None, |
|
training: Optional[bool] = False, |
|
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: |
|
if pixel_values is None: |
|
raise ValueError("You have to specify pixel_values") |
|
|
|
encoder_outputs = self.encoder( |
|
pixel_values, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
training=training, |
|
) |
|
|
|
sequence_output = encoder_outputs[0] |
|
|
|
if not return_dict: |
|
return (sequence_output,) + encoder_outputs[1:] |
|
|
|
return TFBaseModelOutputWithCLSToken( |
|
last_hidden_state=sequence_output, |
|
cls_token_value=encoder_outputs.cls_token_value, |
|
hidden_states=encoder_outputs.hidden_states, |
|
) |
|
|
|
|
|
class TFCvtPreTrainedModel(TFPreTrainedModel): |
|
""" |
|
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained |
|
models. |
|
""" |
|
|
|
config_class = CvtConfig |
|
base_model_prefix = "cvt" |
|
main_input_name = "pixel_values" |
|
|
|
|
|
TFCVT_START_DOCSTRING = r""" |
|
|
|
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
    etc.)
|
|
|
    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
    behavior.
|
|
|
<Tip> |
|
|
|
    TF 2.0 models accept two formats as inputs:
|
|
|
- having all inputs as keyword arguments (like PyTorch models), or |
|
    - having all inputs as a list, tuple or dict in the first positional argument.
|
|
|
    This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.
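
    For example (assuming `pixel_values` has already been prepared by an image processor):

    ```python
    # all inputs as keyword arguments
    outputs = model(pixel_values=pixel_values)

    # all inputs packed into a dict passed as the first positional argument
    outputs = model({"pixel_values": pixel_values})
    ```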
|
|
|
</Tip> |
|
|
|
Args: |
|
config ([`CvtConfig`]): Model configuration class with all the parameters of the model. |
|
Initializing with a config file does not load the weights associated with the model, only the |
|
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. |
|
""" |
|
|
|
TFCVT_INPUTS_DOCSTRING = r""" |
|
Args: |
|
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, with each example of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can only be used in eager mode; in graph mode, the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode; in graph mode, the value will always be set to `True`.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
|
""" |
|
|
|
|
|
@add_start_docstrings( |
|
"The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.", |
|
TFCVT_START_DOCSTRING, |
|
) |
|
class TFCvtModel(TFCvtPreTrainedModel): |
|
def __init__(self, config: CvtConfig, *inputs, **kwargs): |
|
super().__init__(config, *inputs, **kwargs) |
|
|
|
self.cvt = TFCvtMainLayer(config, name="cvt") |
|
|
|
@unpack_inputs |
|
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) |
|
@replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC) |
|
def call( |
|
self, |
|
pixel_values: tf.Tensor | None = None, |
|
output_hidden_states: Optional[bool] = None, |
|
return_dict: Optional[bool] = None, |
|
training: Optional[bool] = False, |
|
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: |
|
r""" |
|
Returns: |
|
|
|
Examples: |
|
|
|
```python |
|
>>> from transformers import AutoImageProcessor, TFCvtModel |
|
>>> from PIL import Image |
|
>>> import requests |
|
|
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" |
|
>>> image = Image.open(requests.get(url, stream=True).raw) |
|
|
|
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13") |
|
>>> model = TFCvtModel.from_pretrained("microsoft/cvt-13") |
|
|
|
>>> inputs = image_processor(images=image, return_tensors="tf") |
|
>>> outputs = model(**inputs) |
|
>>> last_hidden_states = outputs.last_hidden_state |
|
```""" |
|
|
|
if pixel_values is None: |
|
raise ValueError("You have to specify pixel_values") |
|
|
|
outputs = self.cvt( |
|
pixel_values=pixel_values, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
training=training, |
|
) |
|
|
|
if not return_dict: |
|
return (outputs[0],) + outputs[1:] |
|
|
|
return TFBaseModelOutputWithCLSToken( |
|
last_hidden_state=outputs.last_hidden_state, |
|
cls_token_value=outputs.cls_token_value, |
|
hidden_states=outputs.hidden_states, |
|
) |
|
|
|
|
|
@add_start_docstrings( |
|
""" |
|
Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of |
|
the [CLS] token) e.g. for ImageNet. |
|
""", |
|
TFCVT_START_DOCSTRING, |
|
) |
|
class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss): |
|
def __init__(self, config: CvtConfig, *inputs, **kwargs): |
|
super().__init__(config, *inputs, **kwargs) |
|
|
|
self.num_labels = config.num_labels |
|
self.cvt = TFCvtMainLayer(config, name="cvt") |
|
|
|
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm") |
|
|
|
|
|
        # Classifier head
        self.classifier = tf.keras.layers.Dense(
|
units=config.num_labels, |
|
kernel_initializer=get_initializer(config.initializer_range), |
|
use_bias=True, |
|
bias_initializer="zeros", |
|
name="classifier", |
|
) |
|
|
|
@unpack_inputs |
|
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) |
|
@replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC) |
|
def call( |
|
self, |
|
pixel_values: tf.Tensor | None = None, |
|
labels: tf.Tensor | None = None, |
|
output_hidden_states: Optional[bool] = None, |
|
return_dict: Optional[bool] = None, |
|
training: Optional[bool] = False, |
|
) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]: |
|
r""" |
|
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): |
|
Labels for computing the image classification/regression loss. Indices should be in `[0, ..., |
|
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If |
|
`config.num_labels > 1` a classification loss is computed (Cross-Entropy). |
|
|
|
Returns: |
|
|
|
Examples: |
|
|
|
```python |
|
>>> from transformers import AutoImageProcessor, TFCvtForImageClassification |
|
>>> import tensorflow as tf |
|
>>> from PIL import Image |
|
>>> import requests |
|
|
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" |
|
>>> image = Image.open(requests.get(url, stream=True).raw) |
|
|
|
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13") |
|
>>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13") |
|
|
|
>>> inputs = image_processor(images=image, return_tensors="tf") |
|
>>> outputs = model(**inputs) |
|
>>> logits = outputs.logits |
|
>>> # model predicts one of the 1000 ImageNet classes |
|
>>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] |
|
>>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) |
|
```""" |
|
|
|
outputs = self.cvt( |
|
pixel_values, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
training=training, |
|
) |
|
|
|
sequence_output = outputs[0] |
|
cls_token = outputs[1] |
|
if self.config.cls_token[-1]: |
|
sequence_output = self.layernorm(cls_token) |
|
else: |
|
|
|
            # rearrange "b c h w -> b (h w) c"
            batch_size, num_channels, height, width = shape_list(sequence_output)
|
sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width)) |
|
sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1)) |
|
sequence_output = self.layernorm(sequence_output) |
|
|
|
sequence_output_mean = tf.reduce_mean(sequence_output, axis=1) |
|
logits = self.classifier(sequence_output_mean) |
|
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) |
|
|
|
if not return_dict: |
|
output = (logits,) + outputs[2:] |
|
return ((loss,) + output) if loss is not None else output |
|
|
|
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) |
|
|