Pedro Cavalcanti committed on
Commit 07050ac
1 Parent(s): a2f0be7

changed: file

Files changed (1)
  1. modeling_florence2.py +581 -280
modeling_florence2.py CHANGED
@@ -23,7 +23,7 @@ import torch.utils.checkpoint
 from torch import nn
 import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss
 from collections import OrderedDict
 from einops import rearrange
 from timm.models.layers import DropPath, trunc_normal_
@@ -39,7 +39,7 @@ from transformers.utils import (
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
 )
-from .configuration_florence2 import Florence2Config
+from .configuration_florence2 import Florence2Config
 from .configuration_florence2 import Florence2LanguageConfig
 from .configuration_florence2 import Florence2VisionConfig

@@ -66,6 +66,7 @@ logger = logging.get_logger(__name__)

 _CONFIG_FOR_DOC = "Florence2Config"

+
 class LearnedAbsolutePositionEmbedding2D(nn.Module):
     """
     This module learns positional embeddings up to a fixed maximum size.
@@ -74,22 +75,30 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
     def __init__(self, embedding_dim=256, num_pos=50):
         super().__init__()
         self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
-        self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
+        self.column_embeddings = nn.Embedding(
+            num_pos, embedding_dim - (embedding_dim // 2)
+        )

     def forward(self, pixel_values):
         """
-        pixel_values: (batch_size, height, width, num_channels)
+        pixel_values: (batch_size, height, width, num_channels)
         returns: (batch_size, height, width, embedding_dim * 2)
         """
         if len(pixel_values.shape) != 4:
-            raise ValueError('pixel_values must be a 4D tensor')
+            raise ValueError("pixel_values must be a 4D tensor")
         height, width = pixel_values.shape[1:3]
         width_values = torch.arange(width, device=pixel_values.device)
         height_values = torch.arange(height, device=pixel_values.device)
         x_emb = self.column_embeddings(width_values)
         y_emb = self.row_embeddings(height_values)
         # (height, width, embedding_dim * 2)
-        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+        pos = torch.cat(
+            [
+                x_emb.unsqueeze(0).repeat(height, 1, 1),
+                y_emb.unsqueeze(1).repeat(1, width, 1),
+            ],
+            dim=-1,
+        )
         # (embedding_dim * 2, height, width)
         pos = pos.permute(2, 0, 1)
         pos = pos.unsqueeze(0)
@@ -99,6 +108,7 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
         pos = pos.permute(0, 2, 3, 1)
         return pos

+
 class PositionalEmbeddingCosine1D(nn.Module):
     """
     This class implements a very simple positional encoding. It follows closely
@@ -110,22 +120,21 @@ class PositionalEmbeddingCosine1D(nn.Module):
     dropout_prob: The dropout probability.
     max_seq_len: The maximum length to precompute the positional encodings.
     """
-    def __init__(
-            self,
-            embed_dim: int = 512,
-            max_seq_len: int = 1024) -> None:
+
+    def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
         super(PositionalEmbeddingCosine1D, self).__init__()
         self.embed_dim = embed_dim
         self.max_seq_len = max_seq_len
         # Generate the sinusoidal arrays.
         factor = math.log(10000)
         denominator = torch.exp(
-            -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim)
+            -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim
+        )
         # Matrix where rows correspond to a positional embedding as a function
         # of the position index (i.e., the row index).
-        frequencies = \
-            torch.arange(0, self.max_seq_len) \
-            .reshape(self.max_seq_len, 1) * denominator
+        frequencies = (
+            torch.arange(0, self.max_seq_len).reshape(self.max_seq_len, 1) * denominator
+        )
         pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
         # Populate uneven entries.
         pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
@@ -149,11 +158,10 @@ class PositionalEmbeddingCosine1D(nn.Module):
         assert 2 <= shape_len <= 3
         len_seq = seq_embeds.size(-2)
         assert len_seq <= self.max_seq_len
-        pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
+        pos_embeds = self.pos_idx_to_embed[0 : seq_embeds.size(-2), :]
         # Adapt pre-computed positional embeddings to the input.
         if shape_len == 3:
-            pos_embeds = pos_embeds.view(
-                (1, pos_embeds.size(0), pos_embeds.size(1)))
+            pos_embeds = pos_embeds.view((1, pos_embeds.size(0), pos_embeds.size(1)))
         return pos_embeds


@@ -165,10 +173,8 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
     embed_dim: The dimension of the embeddings.
     max_seq_len: The maximum length to precompute the positional encodings.
     """
-    def __init__(
-            self,
-            embedding_dim: int = 512,
-            num_pos: int = 1024) -> None:
+
+    def __init__(self, embedding_dim: int = 512, num_pos: int = 1024) -> None:
         super(LearnedAbsolutePositionEmbedding1D, self).__init__()
         self.embeddings = nn.Embedding(num_pos, embedding_dim)
         self.num_pos = num_pos
@@ -193,12 +199,10 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
         pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
         # Adapt pre-computed positional embeddings to the input.
         if shape_len == 3:
-            pos_embeds = pos_embeds.view(
-                (1, pos_embeds.size(0), pos_embeds.size(1)))
+            pos_embeds = pos_embeds.view((1, pos_embeds.size(0), pos_embeds.size(1)))
         return pos_embeds


-
 class MySequential(nn.Sequential):
     def forward(self, *inputs):
         for module in self._modules.values():
@@ -242,11 +246,15 @@ class Mlp(nn.Module):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
-        self.net = nn.Sequential(OrderedDict([
-            ("fc1", nn.Linear(in_features, hidden_features)),
-            ("act", act_layer()),
-            ("fc2", nn.Linear(hidden_features, out_features))
-        ]))
+        self.net = nn.Sequential(
+            OrderedDict(
+                [
+                    ("fc1", nn.Linear(in_features, hidden_features)),
+                    ("act", act_layer()),
+                    ("fc2", nn.Linear(hidden_features, out_features)),
+                ]
+            )
+        )

     def forward(self, x, size):
         return self.net(x), size
@@ -263,12 +271,13 @@ class DepthWiseConv2d(nn.Module):
     ):
         super().__init__()
         self.dw = nn.Conv2d(
-            dim_in, dim_in,
+            dim_in,
+            dim_in,
             kernel_size=kernel_size,
             padding=padding,
             groups=dim_in,
             stride=stride,
-            bias=bias
+            bias=bias,
         )

     def forward(self, x, size):
@@ -283,8 +292,7 @@


 class ConvEmbed(nn.Module):
-    """ Image to Patch Embedding
-    """
+    """Image to Patch Embedding"""

     def __init__(
         self,
@@ -294,16 +302,13 @@ class ConvEmbed(nn.Module):
294
  stride=4,
295
  padding=2,
296
  norm_layer=None,
297
- pre_norm=True
298
  ):
299
  super().__init__()
300
  self.patch_size = patch_size
301
 
302
  self.proj = nn.Conv2d(
303
- in_chans, embed_dim,
304
- kernel_size=patch_size,
305
- stride=stride,
306
- padding=padding
307
  )
308
 
309
  dim_norm = in_chans if pre_norm else embed_dim
@@ -316,15 +321,12 @@ class ConvEmbed(nn.Module):
316
  if len(x.size()) == 3:
317
  if self.norm and self.pre_norm:
318
  x = self.norm(x)
319
- x = rearrange(
320
- x, 'b (h w) c -> b c h w',
321
- h=H, w=W
322
- )
323
 
324
  x = self.proj(x)
325
 
326
  _, _, H, W = x.shape
327
- x = rearrange(x, 'b c h w -> b (h w) c')
328
  if self.norm and not self.pre_norm:
329
  x = self.norm(x)
330
 
@@ -343,7 +345,11 @@ class ChannelAttention(nn.Module):
343
  def forward(self, x, size):
344
  B, N, C = x.shape
345
 
346
- qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
 
 
 
 
347
  q, k, v = qkv[0], qkv[1], qkv[2]
348
 
349
  q = q * (float(N) ** -0.5)
@@ -357,24 +363,41 @@ class ChannelAttention(nn.Module):
357
 
358
  class ChannelBlock(nn.Module):
359
 
360
- def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
361
- drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
362
- conv_at_attn=True, conv_at_ffn=True):
 
 
 
 
 
 
 
 
 
363
  super().__init__()
364
 
365
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
366
 
367
- self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
 
 
368
  self.channel_attn = PreNorm(
369
  norm_layer(dim),
370
  ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
371
- drop_path
 
 
 
372
  )
373
- self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
374
  self.ffn = PreNorm(
375
  norm_layer(dim),
376
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
377
- drop_path
 
 
 
 
378
  )
379
 
380
  def forward(self, x, size):
@@ -392,15 +415,19 @@ class ChannelBlock(nn.Module):
392
  def window_partition(x, window_size: int):
393
  B, H, W, C = x.shape
394
  x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
395
- windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
 
 
396
  return windows
397
 
398
 
399
  def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
400
- B = batch_size
401
  # this will cause onnx conversion failed for dynamic axis, because treated as constant
402
- # int(windows.shape[0] / (H * W / window_size / window_size))
403
- x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
 
 
404
  x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
405
  return x
406
 
@@ -441,20 +468,22 @@ class WindowAttention(nn.Module):
441
  # attn_windows = self.attn(x_windows)
442
 
443
  B_, N, C = x.shape
444
- qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
 
 
 
 
445
  q, k, v = qkv[0], qkv[1], qkv[2]
446
 
447
  q = q * self.scale
448
- attn = (q @ k.transpose(-2, -1))
449
  attn = self.softmax(attn)
450
 
451
  x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
452
  x = self.proj(x)
453
 
454
  # merge windows
455
- x = x.view(
456
- -1, self.window_size, self.window_size, C
457
- )
458
  x = window_reverse(x, B, self.window_size, Hp, Wp)
459
 
460
  if pad_r > 0 or pad_b > 0:
@@ -467,24 +496,42 @@ class WindowAttention(nn.Module):
467
 
468
  class SpatialBlock(nn.Module):
469
 
470
- def __init__(self, dim, num_heads, window_size,
471
- mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
472
- norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
 
 
 
 
 
 
 
 
 
 
473
  super().__init__()
474
 
475
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
476
 
477
- self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
 
 
478
  self.window_attn = PreNorm(
479
  norm_layer(dim),
480
  WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
481
- drop_path
 
 
 
482
  )
483
- self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
484
  self.ffn = PreNorm(
485
  norm_layer(dim),
486
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
487
- drop_path
 
 
 
 
488
  )
489
 
490
  def forward(self, x, size):
@@ -499,7 +546,7 @@ class SpatialBlock(nn.Module):
499
 
500
 
501
  class DaViT(nn.Module):
502
- """ DaViT: Dual-Attention Transformer
503
 
504
  Args:
505
  in_chans (int): Number of input image channels. Default: 3.
@@ -534,14 +581,14 @@ class DaViT(nn.Module):
534
  num_heads=(3, 6, 12, 24),
535
  num_groups=(3, 6, 12, 24),
536
  window_size=7,
537
- mlp_ratio=4.,
538
  qkv_bias=True,
539
  drop_path_rate=0.1,
540
  norm_layer=nn.LayerNorm,
541
  enable_checkpoint=False,
542
  conv_at_attn=True,
543
  conv_at_ffn=True,
544
- ):
545
  super().__init__()
546
 
547
  self.num_classes = num_classes
@@ -553,7 +600,7 @@ class DaViT(nn.Module):
553
  assert self.num_stages == len(self.num_heads) == len(self.num_groups)
554
 
555
  num_stages = len(embed_dims)
556
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
557
 
558
  depth_offset = 0
559
  convs = []
@@ -566,48 +613,59 @@ class DaViT(nn.Module):
566
  in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
567
  embed_dim=self.embed_dims[i],
568
  norm_layer=norm_layer,
569
- pre_norm=patch_prenorm[i]
570
  )
571
  convs.append(conv_embed)
572
 
573
  block = MySequential(
574
  *[
575
- MySequential(OrderedDict([
576
- (
577
- 'spatial_block', SpatialBlock(
578
- embed_dims[i],
579
- num_heads[i],
580
- window_size,
581
- drop_path_rate=dpr[depth_offset+j*2],
582
- qkv_bias=qkv_bias,
583
- mlp_ratio=mlp_ratio,
584
- conv_at_attn=conv_at_attn,
585
- conv_at_ffn=conv_at_ffn,
586
- )
587
- ),
588
- (
589
- 'channel_block', ChannelBlock(
590
- embed_dims[i],
591
- num_groups[i],
592
- drop_path_rate=dpr[depth_offset+j*2+1],
593
- qkv_bias=qkv_bias,
594
- mlp_ratio=mlp_ratio,
595
- conv_at_attn=conv_at_attn,
596
- conv_at_ffn=conv_at_ffn,
597
- )
 
 
 
 
 
 
598
  )
599
- ])) for j in range(depths[i])
 
600
  ]
601
  )
602
  blocks.append(block)
603
- depth_offset += depths[i]*2
604
 
605
  self.convs = nn.ModuleList(convs)
606
  self.blocks = nn.ModuleList(blocks)
607
 
608
  self.norms = norm_layer(self.embed_dims[-1])
609
  self.avgpool = nn.AdaptiveAvgPool1d(1)
610
- self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
 
 
 
 
611
 
612
  self.apply(self._init_weights)
613
 
@@ -623,7 +681,7 @@ class DaViT(nn.Module):
623
  elif isinstance(m, nn.Conv2d):
624
  nn.init.normal_(m.weight, std=0.02)
625
  for name, _ in m.named_parameters():
626
- if name in ['bias']:
627
  nn.init.constant_(m.bias, 0)
628
  elif isinstance(m, nn.LayerNorm):
629
  nn.init.constant_(m.weight, 1.0)
@@ -634,7 +692,7 @@ class DaViT(nn.Module):
634
 
635
  def forward_features_unpool(self, x):
636
  """
637
- forward until avg pooling
638
  Args:
639
  x (_type_): input image tensor
640
  """
@@ -662,7 +720,7 @@ class DaViT(nn.Module):
662
  x = self.forward_features(x)
663
  x = self.head(x)
664
  return x
665
-
666
  @classmethod
667
  def from_config(cls, config):
668
  return cls(
@@ -679,12 +737,11 @@ class DaViT(nn.Module):
679
  )
680
 
681
 
682
-
683
-
684
  if is_flash_attn_2_available():
685
  from flash_attn import flash_attn_func, flash_attn_varlen_func
686
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
687
 
 
688
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
689
  def _get_unpad_data(attention_mask):
690
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -698,7 +755,9 @@ def _get_unpad_data(attention_mask):
698
  )
699
 
700
 
701
- def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
 
 
702
  """
703
  Shift input ids one token to the right.
704
  """
@@ -730,7 +789,10 @@ class Florence2LearnedPositionalEmbedding(nn.Embedding):
730
 
731
  bsz, seq_len = input_ids.shape[:2]
732
  positions = torch.arange(
733
- past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
 
 
 
734
  ).expand(bsz, -1)
735
 
736
  return super().forward(positions + self.offset)
@@ -741,7 +803,13 @@ class Florence2ScaledWordEmbedding(nn.Embedding):
741
  This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
742
  """
743
 
744
- def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
 
 
 
 
 
 
745
  super().__init__(num_embeddings, embedding_dim, padding_idx)
746
  self.embed_scale = embed_scale
747
 
@@ -784,7 +852,11 @@ class Florence2Attention(nn.Module):
784
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
785
 
786
  def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
787
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
 
 
 
788
 
789
  def forward(
790
  self,
@@ -861,7 +933,10 @@ class Florence2Attention(nn.Module):
861
  raise ValueError(
862
  f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
863
  )
864
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
 
 
 
865
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
866
 
867
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)
@@ -872,7 +947,9 @@ class Florence2Attention(nn.Module):
872
  f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
873
  f" {layer_head_mask.size()}"
874
  )
875
- attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
 
 
876
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
877
 
878
  if output_attentions:
@@ -880,12 +957,18 @@ class Florence2Attention(nn.Module):
880
  # make sure that attn_weights keeps its gradient.
881
  # In order to do so, attn_weights have to be reshaped
882
  # twice and have to be reused in the following
883
- attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
884
- attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
 
 
 
 
885
  else:
886
  attn_weights_reshaped = None
887
 
888
- attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
 
889
 
890
  attn_output = torch.bmm(attn_probs, value_states)
891
 
@@ -937,7 +1020,9 @@ class Florence2FlashAttention2(Florence2Attention):
937
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
938
  # Florence2FlashAttention2 attention does not support output_attentions
939
  if output_attentions:
940
- raise ValueError("Florence2FlashAttention2 attention does not support output_attentions")
 
 
941
 
942
  # if key_value_states are provided this layer is used as a cross-attention layer
943
  # for the decoder
@@ -967,8 +1052,12 @@ class Florence2FlashAttention2(Florence2Attention):
967
  # reuse k, v, self_attention
968
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
969
  value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
970
- key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
971
- value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
 
 
 
 
972
  else:
973
  # self_attention
974
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
@@ -1015,7 +1104,12 @@ class Florence2FlashAttention2(Florence2Attention):
1015
  value_states = value_states.to(target_dtype)
1016
 
1017
  attn_output = self._flash_attention_forward(
1018
- query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout
 
 
 
 
 
1019
  )
1020
 
1021
  attn_output = attn_output.reshape(bsz, q_len, -1)
@@ -1028,7 +1122,14 @@ class Florence2FlashAttention2(Florence2Attention):
1028
 
1029
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
1030
  def _flash_attention_forward(
1031
- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
 
 
 
 
 
 
 
1032
  ):
1033
  """
1034
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
@@ -1058,7 +1159,14 @@ class Florence2FlashAttention2(Florence2Attention):
1058
  # Contains at least one padding token in the sequence
1059
  if attention_mask is not None:
1060
  batch_size = query_states.shape[0]
1061
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
 
 
 
 
 
 
 
1062
  query_states, key_states, value_states, attention_mask, query_length
1063
  )
1064
 
@@ -1078,28 +1186,40 @@ class Florence2FlashAttention2(Florence2Attention):
1078
  causal=causal,
1079
  )
1080
 
1081
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
 
 
1082
  else:
1083
  attn_output = flash_attn_func(
1084
- query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
 
 
 
 
 
1085
  )
1086
 
1087
  return attn_output
1088
 
1089
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
1090
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
 
 
1091
  indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
1092
  batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
1093
 
1094
  key_layer = index_first_axis(
1095
- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
 
1096
  )
1097
  value_layer = index_first_axis(
1098
- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
 
1099
  )
1100
  if query_length == kv_seq_len:
1101
  query_layer = index_first_axis(
1102
- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
 
1103
  )
1104
  cu_seqlens_q = cu_seqlens_k
1105
  max_seqlen_in_batch_q = max_seqlen_in_batch_k
@@ -1114,7 +1234,9 @@ class Florence2FlashAttention2(Florence2Attention):
1114
  else:
1115
  # The -q_len: slice assumes left padding.
1116
  attention_mask = attention_mask[:, -query_length:]
1117
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
 
 
1118
 
1119
  return (
1120
  query_layer,
@@ -1202,7 +1324,9 @@ class Florence2SdpaAttention(Florence2Attention):
1202
  # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
1203
  # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
1204
  # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
1205
- is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
 
 
1206
 
1207
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
1208
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
@@ -1283,15 +1407,21 @@ class Florence2EncoderLayer(nn.Module):
1283
  layer_head_mask=layer_head_mask,
1284
  output_attentions=output_attentions,
1285
  )
1286
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
1287
  hidden_states = residual + hidden_states
1288
  hidden_states = self.self_attn_layer_norm(hidden_states)
1289
 
1290
  residual = hidden_states
1291
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1292
- hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
 
 
1293
  hidden_states = self.fc2(hidden_states)
1294
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
1295
  hidden_states = residual + hidden_states
1296
  hidden_states = self.final_layer_norm(hidden_states)
1297
 
@@ -1299,7 +1429,9 @@ class Florence2EncoderLayer(nn.Module):
1299
  torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
1300
  ):
1301
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1302
- hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
 
 
1303
 
1304
  outputs = (hidden_states,)
1305
 
@@ -1350,7 +1482,9 @@ class Florence2DecoderLayer(nn.Module):
1350
  past_key_value: Optional[Tuple[torch.Tensor]] = None,
1351
  output_attentions: Optional[bool] = False,
1352
  use_cache: Optional[bool] = True,
1353
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
 
 
1354
  """
1355
  Args:
1356
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -1373,7 +1507,9 @@ class Florence2DecoderLayer(nn.Module):
1373
 
1374
  # Self Attention
1375
  # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
1376
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
 
 
1377
  # add present self-attn cache to positions 1,2 of present_key_value tuple
1378
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
1379
  hidden_states=hidden_states,
@@ -1382,7 +1518,9 @@ class Florence2DecoderLayer(nn.Module):
1382
  layer_head_mask=layer_head_mask,
1383
  output_attentions=output_attentions,
1384
  )
1385
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
1386
  hidden_states = residual + hidden_states
1387
  hidden_states = self.self_attn_layer_norm(hidden_states)
1388
 
@@ -1393,16 +1531,22 @@ class Florence2DecoderLayer(nn.Module):
1393
  residual = hidden_states
1394
 
1395
  # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
1396
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
1397
- hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
1398
- hidden_states=hidden_states,
1399
- key_value_states=encoder_hidden_states,
1400
- attention_mask=encoder_attention_mask,
1401
- layer_head_mask=cross_attn_layer_head_mask,
1402
- past_key_value=cross_attn_past_key_value,
1403
- output_attentions=output_attentions,
 
 
 
 
 
 
 
1404
  )
1405
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1406
  hidden_states = residual + hidden_states
1407
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
1408
 
@@ -1412,9 +1556,13 @@ class Florence2DecoderLayer(nn.Module):
1412
  # Fully Connected
1413
  residual = hidden_states
1414
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1415
- hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
 
 
1416
  hidden_states = self.fc2(hidden_states)
1417
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
1418
  hidden_states = residual + hidden_states
1419
  hidden_states = self.final_layer_norm(hidden_states)
1420
 
@@ -1429,7 +1577,6 @@ class Florence2DecoderLayer(nn.Module):
1429
  return outputs
1430
 
1431
 
1432
-
1433
  class Florence2LanguagePreTrainedModel(PreTrainedModel):
1434
  config_class = Florence2LanguageConfig
1435
  base_model_prefix = "model"
@@ -1454,7 +1601,9 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
1454
  @property
1455
  def dummy_inputs(self):
1456
  pad_token = self.config.pad_token_id
1457
- input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
 
 
1458
  dummy_inputs = {
1459
  "attention_mask": input_ids.ne(pad_token),
1460
  "input_ids": input_ids,
@@ -1472,7 +1621,11 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1472
  embed_tokens (nn.Embedding): output embedding
1473
  """
1474
 
1475
- def __init__(self, config: Florence2LanguageConfig, embed_tokens: Optional[nn.Embedding] = None):
 
 
 
 
1476
  super().__init__(config)
1477
 
1478
  self.dropout = config.dropout
@@ -1494,7 +1647,9 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1494
  config.max_position_embeddings,
1495
  embed_dim,
1496
  )
1497
- self.layers = nn.ModuleList([Florence2EncoderLayer(config) for _ in range(config.encoder_layers)])
 
 
1498
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1499
  self._use_sdpa = config._attn_implementation == "sdpa"
1500
  self.layernorm_embedding = nn.LayerNorm(embed_dim)
@@ -1555,15 +1710,25 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1555
  return_dict (`bool`, *optional*):
1556
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1557
  """
1558
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
1559
  output_hidden_states = (
1560
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
 
 
 
1561
  )
1562
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1563
 
1564
  # retrieve input_ids and inputs_embeds
1565
  if input_ids is not None and inputs_embeds is not None:
1566
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
 
 
1567
  elif input_ids is not None:
1568
  input = input_ids
1569
  input_ids = input_ids.view(-1, input_ids.shape[-1])
@@ -1580,7 +1745,9 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1580
 
1581
  hidden_states = inputs_embeds + embed_pos
1582
  hidden_states = self.layernorm_embedding(hidden_states)
1583
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
1584
 
1585
  # expand attention_mask
1586
  if attention_mask is not None:
@@ -1590,10 +1757,14 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1590
  # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
1591
  # the manual implementation that requires a 4D causal mask in all cases.
1592
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1593
- attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
 
 
1594
  else:
1595
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1596
- attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
 
 
1597
 
1598
  encoder_states = () if output_hidden_states else None
1599
  all_attentions = () if output_attentions else None
@@ -1631,7 +1802,9 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1631
  layer_outputs = encoder_layer(
1632
  hidden_states,
1633
  attention_mask,
1634
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
 
 
1635
  output_attentions=output_attentions,
1636
  )
1637
 
@@ -1644,9 +1817,15 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1644
  encoder_states = encoder_states + (hidden_states,)
1645
 
1646
  if not return_dict:
1647
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
 
 
 
 
1648
  return BaseModelOutput(
1649
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
 
 
1650
  )
1651
 
1652
 
@@ -1659,7 +1838,11 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1659
  embed_tokens (nn.Embedding): output embedding
1660
  """
1661
 
1662
- def __init__(self, config: Florence2LanguageConfig, embed_tokens: Optional[nn.Embedding] = None):
 
 
 
 
1663
  super().__init__(config)
1664
  self.dropout = config.dropout
1665
  self.layerdrop = config.decoder_layerdrop
@@ -1678,7 +1861,9 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1678
  config.max_position_embeddings,
1679
  config.d_model,
1680
  )
1681
- self.layers = nn.ModuleList([Florence2DecoderLayer(config) for _ in range(config.decoder_layers)])
 
 
1682
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1683
  self._use_sdpa = config._attn_implementation == "sdpa"
1684
 
@@ -1774,16 +1959,26 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1774
  return_dict (`bool`, *optional*):
1775
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1776
  """
1777
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
1778
  output_hidden_states = (
1779
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
1780
  )
1781
  use_cache = use_cache if use_cache is not None else self.config.use_cache
1782
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
1783
 
1784
  # retrieve input_ids and inputs_embeds
1785
  if input_ids is not None and inputs_embeds is not None:
1786
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
 
 
1787
  elif input_ids is not None:
1788
  input = input_ids
1789
  input_shape = input.shape
@@ -1792,17 +1987,25 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1792
  input_shape = inputs_embeds.size()[:-1]
1793
  input = inputs_embeds[:, :, -1]
1794
  else:
1795
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
 
1796
 
1797
  # past_key_values_length
1798
- past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
 
1799
 
1800
  if inputs_embeds is None:
1801
  inputs_embeds = self.embed_tokens(input)
1802
 
1803
  if self._use_flash_attention_2:
1804
  # 2d mask is passed through the layers
1805
- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
 
 
 
 
1806
  elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
1807
  # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1808
  # the manual implementation that requires a 4D causal mask in all cases.
@@ -1821,8 +2024,14 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1821
  # expand encoder attention mask
1822
  if encoder_hidden_states is not None and encoder_attention_mask is not None:
1823
  if self._use_flash_attention_2:
1824
- encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
1825
- elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
 
 
 
 
 
 
1826
  # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1827
  # the manual implementation that requires a 4D causal mask in all cases.
1828
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -1844,7 +2053,9 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1844
  hidden_states = inputs_embeds + positions
1845
  hidden_states = self.layernorm_embedding(hidden_states)
1846
 
1847
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
1848
 
1849
  if self.gradient_checkpointing and self.training:
1850
  if use_cache:
@@ -1856,11 +2067,15 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1856
  # decoder layers
1857
  all_hidden_states = () if output_hidden_states else None
1858
  all_self_attns = () if output_attentions else None
1859
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
 
 
1860
  next_decoder_cache = () if use_cache else None
1861
 
1862
  # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
1863
- for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
 
 
1864
  if attn_mask is not None:
1865
  if attn_mask.size()[0] != (len(self.layers)):
1866
  raise ValueError(
@@ -1877,7 +2092,9 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1877
  if dropout_probability < self.layerdrop:
1878
  continue
1879
 
1880
- past_key_value = past_key_values[idx] if past_key_values is not None else None
 
 
1881
 
1882
  if self.gradient_checkpointing and self.training:
1883
  layer_outputs = self._gradient_checkpointing_func(
@@ -1887,7 +2104,11 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1887
  encoder_hidden_states,
1888
  encoder_attention_mask,
1889
  head_mask[idx] if head_mask is not None else None,
1890
- cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
 
 
 
 
1891
  None,
1892
  output_attentions,
1893
  use_cache,
@@ -1900,7 +2121,9 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1900
  encoder_attention_mask=encoder_attention_mask,
1901
  layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1902
  cross_attn_layer_head_mask=(
1903
- cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
 
 
1904
  ),
1905
  past_key_value=past_key_value,
1906
  output_attentions=output_attentions,
@@ -1925,7 +2148,13 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1925
  if not return_dict:
1926
  return tuple(
1927
  v
1928
- for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
 
 
 
 
 
 
1929
  if v is not None
1930
  )
1931
  return BaseModelOutputWithPastAndCrossAttentions(
@@ -2003,12 +2232,20 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
2003
  input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
2004
  )
2005
 
2006
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
2007
  output_hidden_states = (
2008
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
2009
  )
2010
  use_cache = use_cache if use_cache is not None else self.config.use_cache
2011
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
2012
 
2013
  if encoder_outputs is None:
2014
  encoder_outputs = self.encoder(
@@ -2061,14 +2298,22 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
2061
 
2062
  class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
2063
  base_model_prefix = "model"
2064
- _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
 
 
 
 
2065
  _keys_to_ignore_on_load_missing = ["final_logits_bias"]
2066
 
2067
  def __init__(self, config: Florence2LanguageConfig):
2068
  super().__init__(config)
2069
  self.model = Florence2LanguageModel(config)
2070
- self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
2071
- self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
 
 
 
 
2072
 
2073
  # Initialize weights and apply final processing
2074
  self.post_init()
@@ -2079,8 +2324,12 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2079
  def get_decoder(self):
2080
  return self.model.get_decoder()
2081
 
2082
- def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
2083
- new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
 
 
 
 
2084
  self._resize_final_logits_bias(new_embeddings.weight.shape[0])
2085
  return new_embeddings
2086
 
@@ -2089,7 +2338,10 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2089
  if new_num_tokens <= old_num_tokens:
2090
  new_bias = self.final_logits_bias[:, :new_num_tokens]
2091
  else:
2092
- extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
 
 
 
2093
  new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
2094
  self.register_buffer("final_logits_bias", new_bias)
2095
 
@@ -2126,11 +2378,15 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2126
 
2127
  Returns:
2128
  """
2129
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
2130
 
2131
  if labels is not None:
2132
  if use_cache:
2133
- logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
 
 
2134
  use_cache = False
2135
  if decoder_input_ids is None and decoder_inputs_embeds is None:
2136
  decoder_input_ids = shift_tokens_right(
@@ -2162,11 +2418,15 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2162
  if labels is not None:
2163
  labels = labels.to(lm_logits.device)
2164
  loss_fct = CrossEntropyLoss()
2165
- masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
 
 
2166
 
2167
  if not return_dict:
2168
  output = (lm_logits,) + outputs[1:]
2169
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
 
 
2170
 
2171
  return Seq2SeqLMOutput(
2172
  loss=masked_lm_loss,
@@ -2220,7 +2480,9 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2220
  }
2221
 
2222
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
2223
- return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
 
 
2224
 
2225
  @staticmethod
2226
  def _reorder_cache(past_key_values, beam_idx):
@@ -2228,11 +2490,15 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2228
  for layer_past in past_key_values:
2229
  # cached cross_attention states don't have to be reordered -> they are always the same
2230
  reordered_past += (
2231
- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
 
 
 
2232
  + layer_past[2:],
2233
  )
2234
  return reordered_past
2235
 
 
2236
  @dataclass
2237
  class Florence2Seq2SeqLMOutput(ModelOutput):
2238
  """
@@ -2289,6 +2555,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
2289
  image_hidden_states of the model produced by the vision encoder
2290
  """
2291
 
 
2292
  last_hidden_state: torch.FloatTensor = None
2293
  past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
2294
  decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@@ -2408,6 +2675,7 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
2408
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2409
  """
2410
 
 
2411
  @add_start_docstrings(
2412
  """The FLORENCE2 vision model without any head""",
2413
  FLORENCE2_START_DOCSTRING,
@@ -2415,16 +2683,16 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
2415
  class Florence2VisionModel(Florence2PreTrainedModel):
2416
  def __init__(self, config: Florence2VisionConfig):
2417
  super().__init__(config)
2418
- assert config.model_type == 'davit', 'only DaViT is supported for now'
2419
  self.vision_tower = DaViT.from_config(config=config)
2420
 
2421
  self.post_init()
2422
-
2423
  def forward(self, pixel_values):
2424
  if len(pixel_values.shape) == 4:
2425
  x = self.vision_tower.forward_features_unpool(pixel_values)
2426
  else:
2427
- raise ValueError(f'invalid image shape {pixel_values.shape}')
2428
  return x
2429
 
2430
 
@@ -2435,40 +2703,38 @@ class Florence2VisionModel(Florence2PreTrainedModel):
2435
  class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2436
  def __init__(self, config: Florence2VisionConfig):
2437
  super().__init__(config)
2438
- assert config.model_type == 'davit', 'only DaViT is supported for now'
2439
  self.vision_tower = DaViT.from_config(config=config)
2440
 
2441
  self._build_image_projection_layers(config)
2442
 
2443
  self.post_init()
2444
-
2445
  def _build_image_projection_layers(self, config):
2446
  image_dim_out = config.dim_embed[-1]
2447
  dim_projection = config.projection_dim
2448
- self.image_projection = nn.Parameter(
2449
- torch.empty(image_dim_out, dim_projection)
2450
- )
2451
  self.image_proj_norm = nn.LayerNorm(dim_projection)
2452
  image_pos_embed_config = config.image_pos_embed
2453
- if image_pos_embed_config['type'] == 'learned_abs_2d':
2454
  self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
2455
  embedding_dim=image_dim_out,
2456
- num_pos=image_pos_embed_config['max_pos_embeddings']
2457
  )
2458
  else:
2459
- raise NotImplementedError('Not implemented yet')
2460
 
2461
  self.image_feature_source = config.image_feature_source
2462
 
2463
  # temporal embedding
2464
  visual_temporal_embedding_config = config.visual_temporal_embedding
2465
- if visual_temporal_embedding_config['type'] == 'COSINE':
2466
  self.visual_temporal_embed = PositionalEmbeddingCosine1D(
2467
  embed_dim=image_dim_out,
2468
- max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
2469
  )
2470
  else:
2471
- raise NotImplementedError('Not implemented yet')
2472
 
2473
  def forward(self, pixel_values):
2474
  if len(pixel_values.shape) == 4:
@@ -2476,37 +2742,43 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2476
  T = 1
2477
  x = self.vision_tower.forward_features_unpool(pixel_values)
2478
  else:
2479
- raise ValueError(f'invalid image shape {pixel_values.shape}')
2480
-
2481
  if self.image_pos_embed is not None:
2482
  x = x.view(batch_size * T, -1, x.shape[-1])
2483
  num_tokens = x.shape[-2]
2484
- h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
2485
- assert h * w == num_tokens, 'only support square feature maps for now'
2486
  x = x.view(batch_size * T, h, w, x.shape[-1])
2487
  pos_embed = self.image_pos_embed(x)
2488
  x = x + pos_embed
2489
- x = x.view(batch_size, T * h*w, x.shape[-1])
2490
 
2491
  if self.visual_temporal_embed is not None:
2492
- visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
2493
- x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
 
 
 
 
2494
 
2495
  x_feat_dict = {}
2496
 
2497
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2498
- x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
2499
 
2500
  temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
2501
- x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
2502
 
2503
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
2504
- x_feat_dict['last_frame'] = x
2505
 
2506
  new_x = []
2507
  for _image_feature_source in self.image_feature_source:
2508
  if _image_feature_source not in x_feat_dict:
2509
- raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
 
 
2510
  new_x.append(x_feat_dict[_image_feature_source])
2511
 
2512
  x = torch.cat(new_x, dim=1)
@@ -2514,11 +2786,9 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2514
  x = x @ self.image_projection
2515
  x = self.image_proj_norm(x)
2516
 
2517
-
2518
  return x
2519
 
2520
 
2521
-
2522
  @add_start_docstrings(
2523
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2524
  FLORENCE2_START_DOCSTRING,
@@ -2526,10 +2796,12 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2526
  class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2527
  def __init__(self, config: Florence2Config):
2528
  super().__init__(config)
2529
- assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
 
 
2530
  del config.vision_config.model_type
2531
  self.vision_tower = DaViT.from_config(config=config.vision_config)
2532
- # remove unused layers
2533
  del self.vision_tower.head
2534
  del self.vision_tower.norms
2535
 
@@ -2537,42 +2809,48 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2537
  self._attn_implementation = config._attn_implementation
2538
  self._build_image_projection_layers(config)
2539
 
2540
- language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
 
 
2541
 
2542
  if language_model._tied_weights_keys is not None:
2543
- self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
 
 
2544
  self.language_model = language_model
2545
 
2546
- self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
 
 
2547
  self.post_init()
2548
-
2549
  def _build_image_projection_layers(self, config):
2550
  image_dim_out = config.vision_config.dim_embed[-1]
2551
  dim_projection = config.vision_config.projection_dim
2552
- self.image_projection = nn.Parameter(
2553
- torch.empty(image_dim_out, dim_projection)
2554
- )
2555
  self.image_proj_norm = nn.LayerNorm(dim_projection)
2556
  image_pos_embed_config = config.vision_config.image_pos_embed
2557
- if image_pos_embed_config['type'] == 'learned_abs_2d':
2558
  self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
2559
  embedding_dim=image_dim_out,
2560
- num_pos=image_pos_embed_config['max_pos_embeddings']
2561
  )
2562
  else:
2563
- raise NotImplementedError('Not implemented yet')
2564
 
2565
  self.image_feature_source = config.vision_config.image_feature_source
2566
 
2567
  # temporal embedding
2568
- visual_temporal_embedding_config = config.vision_config.visual_temporal_embedding
2569
- if visual_temporal_embedding_config['type'] == 'COSINE':
 
 
2570
  self.visual_temporal_embed = PositionalEmbeddingCosine1D(
2571
  embed_dim=image_dim_out,
2572
- max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
2573
  )
2574
  else:
2575
- raise NotImplementedError('Not implemented yet')
2576
 
2577
  def get_encoder(self):
2578
  return self.language_model.get_encoder()
@@ -2583,51 +2861,61 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2583
  def get_input_embeddings(self):
2584
  return self.language_model.get_input_embeddings()
2585
 
2586
- def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
2587
- model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
 
 
 
 
2588
  # update vocab size
2589
  self.config.text_config.vocab_size = model_embeds.num_embeddings
2590
  self.config.vocab_size = model_embeds.num_embeddings
2591
  self.vocab_size = model_embeds.num_embeddings
2592
  return model_embeds
2593
-
2594
  def _encode_image(self, pixel_values):
2595
  if len(pixel_values.shape) == 4:
2596
  batch_size, C, H, W = pixel_values.shape
2597
  T = 1
2598
  x = self.vision_tower.forward_features_unpool(pixel_values)
2599
  else:
2600
- raise ValueError(f'invalid image shape {pixel_values.shape}')
2601
-
2602
  if self.image_pos_embed is not None:
2603
  x = x.view(batch_size * T, -1, x.shape[-1])
2604
  num_tokens = x.shape[-2]
2605
- h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
2606
- assert h * w == num_tokens, 'only support square feature maps for now'
2607
  x = x.view(batch_size * T, h, w, x.shape[-1])
2608
  pos_embed = self.image_pos_embed(x)
2609
  x = x + pos_embed
2610
- x = x.view(batch_size, T * h*w, x.shape[-1])
2611
 
2612
  if self.visual_temporal_embed is not None:
2613
- visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
2614
- x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
 
 
 
 
2615
 
2616
  x_feat_dict = {}
2617
 
2618
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2619
- x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
2620
 
2621
  temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
2622
- x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
2623
 
2624
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
2625
- x_feat_dict['last_frame'] = x
2626
 
2627
  new_x = []
2628
  for _image_feature_source in self.image_feature_source:
2629
  if _image_feature_source not in x_feat_dict:
2630
- raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
 
 
2631
  new_x.append(x_feat_dict[_image_feature_source])
2632
 
2633
  x = torch.cat(new_x, dim=1)
@@ -2635,11 +2923,9 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2635
  x = x @ self.image_projection
2636
  x = self.image_proj_norm(x)
2637
 
2638
- return x
2639
 
2640
- def _merge_input_ids_with_image_features(
2641
- self, image_features, inputs_embeds
2642
- ):
2643
  batch_size, image_token_length = image_features.size()[:-1]
2644
  device = image_features.device
2645
  image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
@@ -2650,20 +2936,25 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2650
  return image_features, image_attention_mask
2651
 
2652
  task_prefix_embeds = inputs_embeds
2653
- task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
 
 
2654
 
2655
  if len(task_prefix_attention_mask.shape) == 3:
2656
  task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
2657
 
2658
  # concat [image embeds, task prefix embeds]
2659
  inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
2660
- attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
 
 
2661
 
2662
  return inputs_embeds, attention_mask
2663
 
2664
-
2665
  @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
2666
- @replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
 
 
2667
  def forward(
2668
  self,
2669
  input_ids: torch.LongTensor = None,
@@ -2714,11 +3005,19 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2714
  >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
2715
  "A green car parked in front of a yellow building."
2716
  ```"""
2717
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
2718
  output_hidden_states = (
2719
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
 
 
 
2720
  )
2721
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2722
 
2723
  image_features = None
2724
  if inputs_embeds is None:
@@ -2729,7 +3028,11 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2729
  if pixel_values is not None:
2730
  # (batch_size, num_image_tokens, hidden_size)
2731
  image_features = self._encode_image(pixel_values)
2732
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
 
 
 
 
2733
 
2734
  attention_mask = attention_mask.to(inputs_embeds.dtype)
2735
  outputs = self.language_model(
@@ -2757,6 +3060,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2757
  output = (logits,) + outputs[1:]
2758
  return (loss,) + output if loss is not None else output
2759
 
 
 
2760
  return Florence2Seq2SeqLMOutput(
2761
  loss=loss,
2762
  logits=logits,
@@ -2767,16 +3072,10 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2767
  encoder_last_hidden_state=outputs.encoder_last_hidden_state,
2768
  encoder_hidden_states=outputs.encoder_hidden_states,
2769
  encoder_attentions=outputs.encoder_attentions,
2770
- image_hidden_states=image_features
2771
  )
2772
 
2773
- def generate(
2774
- self,
2775
- input_ids,
2776
- inputs_embeds=None,
2777
- pixel_values=None,
2778
- **kwargs
2779
- ):
2780
 
2781
  if inputs_embeds is None:
2782
  # 1. Extra the input embeddings
@@ -2785,12 +3084,14 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2785
  # 2. Merge text and images
2786
  if pixel_values is not None:
2787
  image_features = self._encode_image(pixel_values)
2788
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
2789
-
 
 
 
 
2790
  return self.language_model.generate(
2791
- input_ids=None,
2792
- inputs_embeds=inputs_embeds,
2793
- **kwargs
2794
  )
2795
 
2796
  def prepare_inputs_for_generation(
@@ -2819,7 +3120,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2819
  remove_prefix_length = decoder_input_ids.shape[1] - 1
2820
 
2821
  decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
2822
-
2823
  return {
2824
  "input_ids": None, # encoder_outputs is defined. input_ids not needed
2825
  "encoder_outputs": encoder_outputs,
@@ -2833,9 +3134,9 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2833
  "cross_attn_head_mask": cross_attn_head_mask,
2834
  "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
2835
  }
2836
-
2837
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
2838
  return self.language_model.shift_tokens_right(labels)
2839
 
2840
  def _reorder_cache(self, *args, **kwargs):
2841
- return self.language_model._reorder_cache(*args, **kwargs)
 
23
  from torch import nn
24
  import torch.nn.functional as F
25
  import torch.utils.checkpoint as checkpoint
26
+ from torch.nn import CrossEntropyLoss
27
  from collections import OrderedDict
28
  from einops import rearrange
29
  from timm.models.layers import DropPath, trunc_normal_
 
39
  is_flash_attn_2_available,
40
  is_flash_attn_greater_or_equal_2_10,
41
  )
42
+ from .configuration_florence2 import Florence2Config
43
  from .configuration_florence2 import Florence2LanguageConfig
44
  from .configuration_florence2 import Florence2VisionConfig
45
 
 
66
 
67
  _CONFIG_FOR_DOC = "Florence2Config"
68
 
69
+
70
  class LearnedAbsolutePositionEmbedding2D(nn.Module):
71
  """
72
  This module learns positional embeddings up to a fixed maximum size.
 
75
  def __init__(self, embedding_dim=256, num_pos=50):
76
  super().__init__()
77
  self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
78
+ self.column_embeddings = nn.Embedding(
79
+ num_pos, embedding_dim - (embedding_dim // 2)
80
+ )
81
 
82
  def forward(self, pixel_values):
83
  """
84
+ pixel_values: (batch_size, height, width, num_channels)
85
  returns: (batch_size, height, width, embedding_dim * 2)
86
  """
87
  if len(pixel_values.shape) != 4:
88
+ raise ValueError("pixel_values must be a 4D tensor")
89
  height, width = pixel_values.shape[1:3]
90
  width_values = torch.arange(width, device=pixel_values.device)
91
  height_values = torch.arange(height, device=pixel_values.device)
92
  x_emb = self.column_embeddings(width_values)
93
  y_emb = self.row_embeddings(height_values)
94
  # (height, width, embedding_dim * 2)
95
+ pos = torch.cat(
96
+ [
97
+ x_emb.unsqueeze(0).repeat(height, 1, 1),
98
+ y_emb.unsqueeze(1).repeat(1, width, 1),
99
+ ],
100
+ dim=-1,
101
+ )
102
  # (embedding_dim, height, width)
103
  pos = pos.permute(2, 0, 1)
104
  pos = pos.unsqueeze(0)
 
108
  pos = pos.permute(0, 2, 3, 1)
109
  return pos
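
A minimal standalone sketch of the row/column split above (illustrative sizes, not tied to any checkpoint): each axis gets half of embedding_dim, and the concatenation restores the full channel dimension.

    import torch
    from torch import nn

    embedding_dim, num_pos = 256, 50
    row = nn.Embedding(num_pos, embedding_dim // 2)
    col = nn.Embedding(num_pos, embedding_dim - embedding_dim // 2)

    height, width = 12, 12
    y_emb = row(torch.arange(height))              # (12, 128)
    x_emb = col(torch.arange(width))               # (12, 128)
    pos = torch.cat(
        [x_emb.unsqueeze(0).repeat(height, 1, 1),  # (12, 12, 128)
         y_emb.unsqueeze(1).repeat(1, width, 1)],  # (12, 12, 128)
        dim=-1,
    )
    print(pos.shape)                               # torch.Size([12, 12, 256])
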
110
 
111
+
112
  class PositionalEmbeddingCosine1D(nn.Module):
113
  """
114
  This class implements a very simple positional encoding. It follows closely
 
120
  dropout_prob: The dropout probability.
121
  max_seq_len: The maximum length to precompute the positional encodings.
122
  """
123
+
124
+ def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
 
 
125
  super(PositionalEmbeddingCosine1D, self).__init__()
126
  self.embed_dim = embed_dim
127
  self.max_seq_len = max_seq_len
128
  # Generate the sinusoidal arrays.
129
  factor = math.log(10000)
130
  denominator = torch.exp(
131
+ -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim
132
+ )
133
  # Matrix where rows correspond to a positional embedding as a function
134
  # of the position index (i.e., the row index).
135
+ frequencies = (
136
+ torch.arange(0, self.max_seq_len).reshape(self.max_seq_len, 1) * denominator
137
+ )
138
  pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
139
  # Populate uneven entries.
140
  pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
 
158
  assert 2 <= shape_len <= 3
159
  len_seq = seq_embeds.size(-2)
160
  assert len_seq <= self.max_seq_len
161
+ pos_embeds = self.pos_idx_to_embed[0 : seq_embeds.size(-2), :]
162
  # Adapt pre-computed positional embeddings to the input.
163
  if shape_len == 3:
164
+ pos_embeds = pos_embeds.view((1, pos_embeds.size(0), pos_embeds.size(1)))
 
165
  return pos_embeds
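
For reference, a self-contained sketch of the sin/cos table built in __init__ above, with toy sizes (the cos half of the table is filled the same way in the lines not shown in this hunk):

    import math
    import torch

    embed_dim, max_seq_len = 8, 5
    factor = math.log(10000)
    denominator = torch.exp(-factor * torch.arange(0, embed_dim, 2) / embed_dim)
    frequencies = torch.arange(0, max_seq_len).reshape(max_seq_len, 1) * denominator
    table = torch.zeros((max_seq_len, embed_dim))
    table[:, 0::2] = torch.sin(frequencies)   # even columns
    table[:, 1::2] = torch.cos(frequencies)   # odd columns
    print(table.shape)                        # torch.Size([5, 8])
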
166
 
167
 
 
173
  embed_dim: The dimension of the embeddings.
174
  max_seq_len: The maximum length to precompute the positional encodings.
175
  """
176
+
177
+ def __init__(self, embedding_dim: int = 512, num_pos: int = 1024) -> None:
 
 
178
  super(LearnedAbsolutePositionEmbedding1D, self).__init__()
179
  self.embeddings = nn.Embedding(num_pos, embedding_dim)
180
  self.num_pos = num_pos
 
199
  pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
200
  # Adapt pre-computed positional embeddings to the input.
201
  if shape_len == 3:
202
+ pos_embeds = pos_embeds.view((1, pos_embeds.size(0), pos_embeds.size(1)))
 
203
  return pos_embeds
204
 
205
 
 
206
  class MySequential(nn.Sequential):
207
  def forward(self, *inputs):
208
  for module in self._modules.values():
 
246
  super().__init__()
247
  out_features = out_features or in_features
248
  hidden_features = hidden_features or in_features
249
+ self.net = nn.Sequential(
250
+ OrderedDict(
251
+ [
252
+ ("fc1", nn.Linear(in_features, hidden_features)),
253
+ ("act", act_layer()),
254
+ ("fc2", nn.Linear(hidden_features, out_features)),
255
+ ]
256
+ )
257
+ )
258
 
259
  def forward(self, x, size):
260
  return self.net(x), size
 
271
  ):
272
  super().__init__()
273
  self.dw = nn.Conv2d(
274
+ dim_in,
275
+ dim_in,
276
  kernel_size=kernel_size,
277
  padding=padding,
278
  groups=dim_in,
279
  stride=stride,
280
+ bias=bias,
281
  )
282
 
283
  def forward(self, x, size):
 
292
 
293
 
294
  class ConvEmbed(nn.Module):
295
+ """Image to Patch Embedding"""
 
296
 
297
  def __init__(
298
  self,
 
302
  stride=4,
303
  padding=2,
304
  norm_layer=None,
305
+ pre_norm=True,
306
  ):
307
  super().__init__()
308
  self.patch_size = patch_size
309
 
310
  self.proj = nn.Conv2d(
311
+ in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding
 
 
 
312
  )
313
 
314
  dim_norm = in_chans if pre_norm else embed_dim
 
321
  if len(x.size()) == 3:
322
  if self.norm and self.pre_norm:
323
  x = self.norm(x)
324
+ x = rearrange(x, "b (h w) c -> b c h w", h=H, w=W)
 
 
 
325
 
326
  x = self.proj(x)
327
 
328
  _, _, H, W = x.shape
329
+ x = rearrange(x, "b c h w -> b (h w) c")
330
  if self.norm and not self.pre_norm:
331
  x = self.norm(x)
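
Shape-wise, the strided projection above turns an image into a token sequence. A rough sketch with illustrative channel sizes (the real patch_size, stride, and embed_dim come from the vision config):

    import torch
    from torch import nn
    from einops import rearrange

    proj = nn.Conv2d(3, 128, kernel_size=7, stride=4, padding=2)   # illustrative dims
    image = torch.randn(1, 3, 224, 224)
    feat = proj(image)                                             # (1, 128, 56, 56)
    tokens = rearrange(feat, "b c h w -> b (h w) c")
    print(tokens.shape)                                            # torch.Size([1, 3136, 128])
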
332
 
 
345
  def forward(self, x, size):
346
  B, N, C = x.shape
347
 
348
+ qkv = (
349
+ self.qkv(x)
350
+ .reshape(B, N, 3, self.groups, C // self.groups)
351
+ .permute(2, 0, 3, 1, 4)
352
+ )
353
  q, k, v = qkv[0], qkv[1], qkv[2]
354
 
355
  q = q * (float(N) ** -0.5)
 
363
 
364
  class ChannelBlock(nn.Module):
365
 
366
+ def __init__(
367
+ self,
368
+ dim,
369
+ groups,
370
+ mlp_ratio=4.0,
371
+ qkv_bias=True,
372
+ drop_path_rate=0.0,
373
+ act_layer=nn.GELU,
374
+ norm_layer=nn.LayerNorm,
375
+ conv_at_attn=True,
376
+ conv_at_ffn=True,
377
+ ):
378
  super().__init__()
379
 
380
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
381
 
382
+ self.conv1 = (
383
+ PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
384
+ )
385
  self.channel_attn = PreNorm(
386
  norm_layer(dim),
387
  ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
388
+ drop_path,
389
+ )
390
+ self.conv2 = (
391
+ PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
392
  )
 
393
  self.ffn = PreNorm(
394
  norm_layer(dim),
395
+ Mlp(
396
+ in_features=dim,
397
+ hidden_features=int(dim * mlp_ratio),
398
+ act_layer=act_layer,
399
+ ),
400
+ drop_path,
401
  )
402
 
403
  def forward(self, x, size):
 
415
  def window_partition(x, window_size: int):
416
  B, H, W, C = x.shape
417
  x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
418
+ windows = (
419
+ x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
420
+ )
421
  return windows
422
 
423
 
424
  def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
425
+ B = batch_size
426
  # this will cause onnx conversion failed for dynamic axis, because treated as constant
427
+ # int(windows.shape[0] / (H * W / window_size / window_size))
428
+ x = windows.view(
429
+ B, H // window_size, W // window_size, window_size, window_size, -1
430
+ )
431
  x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
432
  return x
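
The two helpers above are inverses of each other; a quick round-trip check, re-stating the two reshapes so the snippet runs on its own:

    import torch

    def window_partition(x, window_size):
        B, H, W, C = x.shape
        x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
        return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)

    def window_reverse(windows, batch_size, window_size, H, W):
        x = windows.view(batch_size, H // window_size, W // window_size, window_size, window_size, -1)
        return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, H, W, -1)

    x = torch.randn(2, 14, 14, 32)
    wins = window_partition(x, 7)          # (8, 7, 7, 32): 2 batches * 2 * 2 windows
    y = window_reverse(wins, 2, 7, 14, 14)
    print(torch.equal(x, y))               # True
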
433
 
 
468
  # attn_windows = self.attn(x_windows)
469
 
470
  B_, N, C = x.shape
471
+ qkv = (
472
+ self.qkv(x)
473
+ .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
474
+ .permute(2, 0, 3, 1, 4)
475
+ )
476
  q, k, v = qkv[0], qkv[1], qkv[2]
477
 
478
  q = q * self.scale
479
+ attn = q @ k.transpose(-2, -1)
480
  attn = self.softmax(attn)
481
 
482
  x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
483
  x = self.proj(x)
484
 
485
  # merge windows
486
+ x = x.view(-1, self.window_size, self.window_size, C)
 
 
487
  x = window_reverse(x, B, self.window_size, Hp, Wp)
488
 
489
  if pad_r > 0 or pad_b > 0:
 
496
 
497
  class SpatialBlock(nn.Module):
498
 
499
+ def __init__(
500
+ self,
501
+ dim,
502
+ num_heads,
503
+ window_size,
504
+ mlp_ratio=4.0,
505
+ qkv_bias=True,
506
+ drop_path_rate=0.0,
507
+ act_layer=nn.GELU,
508
+ norm_layer=nn.LayerNorm,
509
+ conv_at_attn=True,
510
+ conv_at_ffn=True,
511
+ ):
512
  super().__init__()
513
 
514
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
515
 
516
+ self.conv1 = (
517
+ PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
518
+ )
519
  self.window_attn = PreNorm(
520
  norm_layer(dim),
521
  WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
522
+ drop_path,
523
+ )
524
+ self.conv2 = (
525
+ PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
526
  )
 
527
  self.ffn = PreNorm(
528
  norm_layer(dim),
529
+ Mlp(
530
+ in_features=dim,
531
+ hidden_features=int(dim * mlp_ratio),
532
+ act_layer=act_layer,
533
+ ),
534
+ drop_path,
535
  )
536
 
537
  def forward(self, x, size):
 
546
 
547
 
548
  class DaViT(nn.Module):
549
+ """DaViT: Dual-Attention Transformer
550
 
551
  Args:
552
  in_chans (int): Number of input image channels. Default: 3.
 
581
  num_heads=(3, 6, 12, 24),
582
  num_groups=(3, 6, 12, 24),
583
  window_size=7,
584
+ mlp_ratio=4.0,
585
  qkv_bias=True,
586
  drop_path_rate=0.1,
587
  norm_layer=nn.LayerNorm,
588
  enable_checkpoint=False,
589
  conv_at_attn=True,
590
  conv_at_ffn=True,
591
+ ):
592
  super().__init__()
593
 
594
  self.num_classes = num_classes
 
600
  assert self.num_stages == len(self.num_heads) == len(self.num_groups)
601
 
602
  num_stages = len(embed_dims)
603
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
604
 
605
  depth_offset = 0
606
  convs = []
 
613
  in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
614
  embed_dim=self.embed_dims[i],
615
  norm_layer=norm_layer,
616
+ pre_norm=patch_prenorm[i],
617
  )
618
  convs.append(conv_embed)
619
 
620
  block = MySequential(
621
  *[
622
+ MySequential(
623
+ OrderedDict(
624
+ [
625
+ (
626
+ "spatial_block",
627
+ SpatialBlock(
628
+ embed_dims[i],
629
+ num_heads[i],
630
+ window_size,
631
+ drop_path_rate=dpr[depth_offset + j * 2],
632
+ qkv_bias=qkv_bias,
633
+ mlp_ratio=mlp_ratio,
634
+ conv_at_attn=conv_at_attn,
635
+ conv_at_ffn=conv_at_ffn,
636
+ ),
637
+ ),
638
+ (
639
+ "channel_block",
640
+ ChannelBlock(
641
+ embed_dims[i],
642
+ num_groups[i],
643
+ drop_path_rate=dpr[depth_offset + j * 2 + 1],
644
+ qkv_bias=qkv_bias,
645
+ mlp_ratio=mlp_ratio,
646
+ conv_at_attn=conv_at_attn,
647
+ conv_at_ffn=conv_at_ffn,
648
+ ),
649
+ ),
650
+ ]
651
  )
652
+ )
653
+ for j in range(depths[i])
654
  ]
655
  )
656
  blocks.append(block)
657
+ depth_offset += depths[i] * 2
658
 
659
  self.convs = nn.ModuleList(convs)
660
  self.blocks = nn.ModuleList(blocks)
661
 
662
  self.norms = norm_layer(self.embed_dims[-1])
663
  self.avgpool = nn.AdaptiveAvgPool1d(1)
664
+ self.head = (
665
+ nn.Linear(self.embed_dims[-1], num_classes)
666
+ if num_classes > 0
667
+ else nn.Identity()
668
+ )
669
 
670
  self.apply(self._init_weights)
671
 
 
681
  elif isinstance(m, nn.Conv2d):
682
  nn.init.normal_(m.weight, std=0.02)
683
  for name, _ in m.named_parameters():
684
+ if name in ["bias"]:
685
  nn.init.constant_(m.bias, 0)
686
  elif isinstance(m, nn.LayerNorm):
687
  nn.init.constant_(m.weight, 1.0)
 
692
 
693
  def forward_features_unpool(self, x):
694
  """
695
+ forward until avg pooling
696
  Args:
697
  x (_type_): input image tensor
698
  """
 
720
  x = self.forward_features(x)
721
  x = self.head(x)
722
  return x
723
+
724
  @classmethod
725
  def from_config(cls, config):
726
  return cls(
 
737
  )
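
One detail worth calling out from DaViT.__init__ above: every depth unit expands into a spatial block plus a channel block, so the drop-path schedule is built over sum(depths) * 2 entries. A small sketch with illustrative values:

    import torch

    depths, drop_path_rate = (1, 1, 9, 1), 0.1      # illustrative values
    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
    print(len(dpr), round(dpr[0], 3), round(dpr[-1], 3))   # 24 0.0 0.1
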
738
 
739
 
 
 
740
  if is_flash_attn_2_available():
741
  from flash_attn import flash_attn_func, flash_attn_varlen_func
742
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
743
 
744
+
745
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
746
  def _get_unpad_data(attention_mask):
747
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
 
755
  )
756
 
757
 
758
+ def shift_tokens_right(
759
+ input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int
760
+ ):
761
  """
762
  Shift input ids one token to the right.
763
  """
 
789
 
790
  bsz, seq_len = input_ids.shape[:2]
791
  positions = torch.arange(
792
+ past_key_values_length,
793
+ past_key_values_length + seq_len,
794
+ dtype=torch.long,
795
+ device=self.weight.device,
796
  ).expand(bsz, -1)
797
 
798
  return super().forward(positions + self.offset)
 
803
  This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
804
  """
805
 
806
+ def __init__(
807
+ self,
808
+ num_embeddings: int,
809
+ embedding_dim: int,
810
+ padding_idx: int,
811
+ embed_scale: Optional[float] = 1.0,
812
+ ):
813
  super().__init__(num_embeddings, embedding_dim, padding_idx)
814
  self.embed_scale = embed_scale
815
 
 
852
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
853
 
854
  def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
855
+ return (
856
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
857
+ .transpose(1, 2)
858
+ .contiguous()
859
+ )
860
 
861
  def forward(
862
  self,
 
933
  raise ValueError(
934
  f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
935
  )
936
+ attn_weights = (
937
+ attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
938
+ + attention_mask
939
+ )
940
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
941
 
942
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
947
  f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
948
  f" {layer_head_mask.size()}"
949
  )
950
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
951
+ bsz, self.num_heads, tgt_len, src_len
952
+ )
953
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
954
 
955
  if output_attentions:
 
957
  # make sure that attn_weights keeps its gradient.
958
  # In order to do so, attn_weights have to be reshaped
959
  # twice and have to be reused in the following
960
+ attn_weights_reshaped = attn_weights.view(
961
+ bsz, self.num_heads, tgt_len, src_len
962
+ )
963
+ attn_weights = attn_weights_reshaped.view(
964
+ bsz * self.num_heads, tgt_len, src_len
965
+ )
966
  else:
967
  attn_weights_reshaped = None
968
 
969
+ attn_probs = nn.functional.dropout(
970
+ attn_weights, p=self.dropout, training=self.training
971
+ )
972
 
973
  attn_output = torch.bmm(attn_probs, value_states)
974
 
 
1020
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1021
  # Florence2FlashAttention2 attention does not support output_attentions
1022
  if output_attentions:
1023
+ raise ValueError(
1024
+ "Florence2FlashAttention2 attention does not support output_attentions"
1025
+ )
1026
 
1027
  # if key_value_states are provided this layer is used as a cross-attention layer
1028
  # for the decoder
 
1052
  # reuse k, v, self_attention
1053
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
1054
  value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
1055
+ key_states = torch.cat(
1056
+ [past_key_value[0].transpose(1, 2), key_states], dim=1
1057
+ )
1058
+ value_states = torch.cat(
1059
+ [past_key_value[1].transpose(1, 2), value_states], dim=1
1060
+ )
1061
  else:
1062
  # self_attention
1063
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
 
1104
  value_states = value_states.to(target_dtype)
1105
 
1106
  attn_output = self._flash_attention_forward(
1107
+ query_states,
1108
+ key_states,
1109
+ value_states,
1110
+ attention_mask,
1111
+ q_len,
1112
+ dropout=self.dropout,
1113
  )
1114
 
1115
  attn_output = attn_output.reshape(bsz, q_len, -1)
 
1122
 
1123
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
1124
  def _flash_attention_forward(
1125
+ self,
1126
+ query_states,
1127
+ key_states,
1128
+ value_states,
1129
+ attention_mask,
1130
+ query_length,
1131
+ dropout=0.0,
1132
+ softmax_scale=None,
1133
  ):
1134
  """
1135
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
 
1159
  # Contains at least one padding token in the sequence
1160
  if attention_mask is not None:
1161
  batch_size = query_states.shape[0]
1162
+ (
1163
+ query_states,
1164
+ key_states,
1165
+ value_states,
1166
+ indices_q,
1167
+ cu_seq_lens,
1168
+ max_seq_lens,
1169
+ ) = self._upad_input(
1170
  query_states, key_states, value_states, attention_mask, query_length
1171
  )
1172
 
 
1186
  causal=causal,
1187
  )
1188
 
1189
+ attn_output = pad_input(
1190
+ attn_output_unpad, indices_q, batch_size, query_length
1191
+ )
1192
  else:
1193
  attn_output = flash_attn_func(
1194
+ query_states,
1195
+ key_states,
1196
+ value_states,
1197
+ dropout,
1198
+ softmax_scale=softmax_scale,
1199
+ causal=causal,
1200
  )
1201
 
1202
  return attn_output
1203
 
1204
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
1205
+ def _upad_input(
1206
+ self, query_layer, key_layer, value_layer, attention_mask, query_length
1207
+ ):
1208
  indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
1209
  batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
1210
 
1211
  key_layer = index_first_axis(
1212
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
1213
+ indices_k,
1214
  )
1215
  value_layer = index_first_axis(
1216
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
1217
+ indices_k,
1218
  )
1219
  if query_length == kv_seq_len:
1220
  query_layer = index_first_axis(
1221
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
1222
+ indices_k,
1223
  )
1224
  cu_seqlens_q = cu_seqlens_k
1225
  max_seqlen_in_batch_q = max_seqlen_in_batch_k
 
1234
  else:
1235
  # The -q_len: slice assumes left padding.
1236
  attention_mask = attention_mask[:, -query_length:]
1237
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
1238
+ query_layer, attention_mask
1239
+ )
1240
 
1241
  return (
1242
  query_layer,
 
1324
  # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
1325
  # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
1326
  # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
1327
+ is_causal = (
1328
+ True if self.is_causal and attention_mask is None and tgt_len > 1 else False
1329
+ )
1330
 
1331
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
1332
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
 
1407
  layer_head_mask=layer_head_mask,
1408
  output_attentions=output_attentions,
1409
  )
1410
+ hidden_states = nn.functional.dropout(
1411
+ hidden_states, p=self.dropout, training=self.training
1412
+ )
1413
  hidden_states = residual + hidden_states
1414
  hidden_states = self.self_attn_layer_norm(hidden_states)
1415
 
1416
  residual = hidden_states
1417
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1418
+ hidden_states = nn.functional.dropout(
1419
+ hidden_states, p=self.activation_dropout, training=self.training
1420
+ )
1421
  hidden_states = self.fc2(hidden_states)
1422
+ hidden_states = nn.functional.dropout(
1423
+ hidden_states, p=self.dropout, training=self.training
1424
+ )
1425
  hidden_states = residual + hidden_states
1426
  hidden_states = self.final_layer_norm(hidden_states)
1427
 
 
1429
  torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
1430
  ):
1431
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1432
+ hidden_states = torch.clamp(
1433
+ hidden_states, min=-clamp_value, max=clamp_value
1434
+ )
1435
 
1436
  outputs = (hidden_states,)
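
The clamp above is a float16 overflow guard; in isolation it behaves like this:

    import torch

    hidden = torch.tensor([1.0, float("inf")], dtype=torch.float16)
    clamp_value = torch.finfo(hidden.dtype).max - 1000
    hidden = torch.clamp(hidden, min=-clamp_value, max=clamp_value)
    print(torch.isfinite(hidden).all())   # tensor(True)
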
1437
 
 
1482
  past_key_value: Optional[Tuple[torch.Tensor]] = None,
1483
  output_attentions: Optional[bool] = False,
1484
  use_cache: Optional[bool] = True,
1485
+ ) -> Tuple[
1486
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
1487
+ ]:
1488
  """
1489
  Args:
1490
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
 
1507
 
1508
  # Self Attention
1509
  # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
1510
+ self_attn_past_key_value = (
1511
+ past_key_value[:2] if past_key_value is not None else None
1512
+ )
1513
  # add present self-attn cache to positions 1,2 of present_key_value tuple
1514
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
1515
  hidden_states=hidden_states,
 
1518
  layer_head_mask=layer_head_mask,
1519
  output_attentions=output_attentions,
1520
  )
1521
+ hidden_states = nn.functional.dropout(
1522
+ hidden_states, p=self.dropout, training=self.training
1523
+ )
1524
  hidden_states = residual + hidden_states
1525
  hidden_states = self.self_attn_layer_norm(hidden_states)
1526
 
 
1531
  residual = hidden_states
1532
 
1533
  # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
1534
+ cross_attn_past_key_value = (
1535
+ past_key_value[-2:] if past_key_value is not None else None
1536
+ )
1537
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = (
1538
+ self.encoder_attn(
1539
+ hidden_states=hidden_states,
1540
+ key_value_states=encoder_hidden_states,
1541
+ attention_mask=encoder_attention_mask,
1542
+ layer_head_mask=cross_attn_layer_head_mask,
1543
+ past_key_value=cross_attn_past_key_value,
1544
+ output_attentions=output_attentions,
1545
+ )
1546
+ )
1547
+ hidden_states = nn.functional.dropout(
1548
+ hidden_states, p=self.dropout, training=self.training
1549
  )
 
1550
  hidden_states = residual + hidden_states
1551
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
1552
 
 
1556
  # Fully Connected
1557
  residual = hidden_states
1558
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1559
+ hidden_states = nn.functional.dropout(
1560
+ hidden_states, p=self.activation_dropout, training=self.training
1561
+ )
1562
  hidden_states = self.fc2(hidden_states)
1563
+ hidden_states = nn.functional.dropout(
1564
+ hidden_states, p=self.dropout, training=self.training
1565
+ )
1566
  hidden_states = residual + hidden_states
1567
  hidden_states = self.final_layer_norm(hidden_states)
1568
 
 
1577
  return outputs
1578
 
1579
 
 
1580
  class Florence2LanguagePreTrainedModel(PreTrainedModel):
1581
  config_class = Florence2LanguageConfig
1582
  base_model_prefix = "model"
 
1601
  @property
1602
  def dummy_inputs(self):
1603
  pad_token = self.config.pad_token_id
1604
+ input_ids = torch.tensor(
1605
+ [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device
1606
+ )
1607
  dummy_inputs = {
1608
  "attention_mask": input_ids.ne(pad_token),
1609
  "input_ids": input_ids,
 
1621
  embed_tokens (nn.Embedding): output embedding
1622
  """
1623
 
1624
+ def __init__(
1625
+ self,
1626
+ config: Florence2LanguageConfig,
1627
+ embed_tokens: Optional[nn.Embedding] = None,
1628
+ ):
1629
  super().__init__(config)
1630
 
1631
  self.dropout = config.dropout
 
1647
  config.max_position_embeddings,
1648
  embed_dim,
1649
  )
1650
+ self.layers = nn.ModuleList(
1651
+ [Florence2EncoderLayer(config) for _ in range(config.encoder_layers)]
1652
+ )
1653
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1654
  self._use_sdpa = config._attn_implementation == "sdpa"
1655
  self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
1710
  return_dict (`bool`, *optional*):
1711
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1712
  """
1713
+ output_attentions = (
1714
+ output_attentions
1715
+ if output_attentions is not None
1716
+ else self.config.output_attentions
1717
+ )
1718
  output_hidden_states = (
1719
+ output_hidden_states
1720
+ if output_hidden_states is not None
1721
+ else self.config.output_hidden_states
1722
+ )
1723
+ return_dict = (
1724
+ return_dict if return_dict is not None else self.config.use_return_dict
1725
  )
 
1726
 
1727
  # retrieve input_ids and inputs_embeds
1728
  if input_ids is not None and inputs_embeds is not None:
1729
+ raise ValueError(
1730
+ "You cannot specify both input_ids and inputs_embeds at the same time"
1731
+ )
1732
  elif input_ids is not None:
1733
  input = input_ids
1734
  input_ids = input_ids.view(-1, input_ids.shape[-1])
 
1745
 
1746
  hidden_states = inputs_embeds + embed_pos
1747
  hidden_states = self.layernorm_embedding(hidden_states)
1748
+ hidden_states = nn.functional.dropout(
1749
+ hidden_states, p=self.dropout, training=self.training
1750
+ )
1751
 
1752
  # expand attention_mask
1753
  if attention_mask is not None:
 
1757
  # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
1758
  # the manual implementation that requires a 4D causal mask in all cases.
1759
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1760
+ attention_mask = _prepare_4d_attention_mask_for_sdpa(
1761
+ attention_mask, inputs_embeds.dtype
1762
+ )
1763
  else:
1764
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1765
+ attention_mask = _prepare_4d_attention_mask(
1766
+ attention_mask, inputs_embeds.dtype
1767
+ )
1768
 
1769
  encoder_states = () if output_hidden_states else None
1770
  all_attentions = () if output_attentions else None
 
1802
  layer_outputs = encoder_layer(
1803
  hidden_states,
1804
  attention_mask,
1805
+ layer_head_mask=(
1806
+ head_mask[idx] if head_mask is not None else None
1807
+ ),
1808
  output_attentions=output_attentions,
1809
  )
1810
 
 
1817
  encoder_states = encoder_states + (hidden_states,)
1818
 
1819
  if not return_dict:
1820
+ return tuple(
1821
+ v
1822
+ for v in [hidden_states, encoder_states, all_attentions]
1823
+ if v is not None
1824
+ )
1825
  return BaseModelOutput(
1826
+ last_hidden_state=hidden_states,
1827
+ hidden_states=encoder_states,
1828
+ attentions=all_attentions,
1829
  )
1830
 
1831
 
 
1838
  embed_tokens (nn.Embedding): output embedding
1839
  """
1840
 
1841
+ def __init__(
1842
+ self,
1843
+ config: Florence2LanguageConfig,
1844
+ embed_tokens: Optional[nn.Embedding] = None,
1845
+ ):
1846
  super().__init__(config)
1847
  self.dropout = config.dropout
1848
  self.layerdrop = config.decoder_layerdrop
 
1861
  config.max_position_embeddings,
1862
  config.d_model,
1863
  )
1864
+ self.layers = nn.ModuleList(
1865
+ [Florence2DecoderLayer(config) for _ in range(config.decoder_layers)]
1866
+ )
1867
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1868
  self._use_sdpa = config._attn_implementation == "sdpa"
1869
 
 
1959
  return_dict (`bool`, *optional*):
1960
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1961
  """
1962
+ output_attentions = (
1963
+ output_attentions
1964
+ if output_attentions is not None
1965
+ else self.config.output_attentions
1966
+ )
1967
  output_hidden_states = (
1968
+ output_hidden_states
1969
+ if output_hidden_states is not None
1970
+ else self.config.output_hidden_states
1971
  )
1972
  use_cache = use_cache if use_cache is not None else self.config.use_cache
1973
+ return_dict = (
1974
+ return_dict if return_dict is not None else self.config.use_return_dict
1975
+ )
1976
 
1977
  # retrieve input_ids and inputs_embeds
1978
  if input_ids is not None and inputs_embeds is not None:
1979
+ raise ValueError(
1980
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
1981
+ )
1982
  elif input_ids is not None:
1983
  input = input_ids
1984
  input_shape = input.shape
 
1987
  input_shape = inputs_embeds.size()[:-1]
1988
  input = inputs_embeds[:, :, -1]
1989
  else:
1990
+ raise ValueError(
1991
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
1992
+ )
1993
 
1994
  # past_key_values_length
1995
+ past_key_values_length = (
1996
+ past_key_values[0][0].shape[2] if past_key_values is not None else 0
1997
+ )
1998
 
1999
  if inputs_embeds is None:
2000
  inputs_embeds = self.embed_tokens(input)
2001
 
2002
  if self._use_flash_attention_2:
2003
  # 2d mask is passed through the layers
2004
+ attention_mask = (
2005
+ attention_mask
2006
+ if (attention_mask is not None and 0 in attention_mask)
2007
+ else None
2008
+ )
2009
  elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
2010
  # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
2011
  # the manual implementation that requires a 4D causal mask in all cases.
 
2024
  # expand encoder attention mask
2025
  if encoder_hidden_states is not None and encoder_attention_mask is not None:
2026
  if self._use_flash_attention_2:
2027
+ encoder_attention_mask = (
2028
+ encoder_attention_mask if 0 in encoder_attention_mask else None
2029
+ )
2030
+ elif (
2031
+ self._use_sdpa
2032
+ and cross_attn_head_mask is None
2033
+ and not output_attentions
2034
+ ):
2035
  # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
2036
  # the manual implementation that requires a 4D causal mask in all cases.
2037
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
 
2053
  hidden_states = inputs_embeds + positions
2054
  hidden_states = self.layernorm_embedding(hidden_states)
2055
 
2056
+ hidden_states = nn.functional.dropout(
2057
+ hidden_states, p=self.dropout, training=self.training
2058
+ )
2059
 
2060
  if self.gradient_checkpointing and self.training:
2061
  if use_cache:
 
2067
  # decoder layers
2068
  all_hidden_states = () if output_hidden_states else None
2069
  all_self_attns = () if output_attentions else None
2070
+ all_cross_attentions = (
2071
+ () if (output_attentions and encoder_hidden_states is not None) else None
2072
+ )
2073
  next_decoder_cache = () if use_cache else None
2074
 
2075
  # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
2076
+ for attn_mask, mask_name in zip(
2077
+ [head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
2078
+ ):
2079
  if attn_mask is not None:
2080
  if attn_mask.size()[0] != (len(self.layers)):
2081
  raise ValueError(
 
2092
  if dropout_probability < self.layerdrop:
2093
  continue
2094
 
2095
+ past_key_value = (
2096
+ past_key_values[idx] if past_key_values is not None else None
2097
+ )
2098
 
2099
  if self.gradient_checkpointing and self.training:
2100
  layer_outputs = self._gradient_checkpointing_func(
 
2104
  encoder_hidden_states,
2105
  encoder_attention_mask,
2106
  head_mask[idx] if head_mask is not None else None,
2107
+ (
2108
+ cross_attn_head_mask[idx]
2109
+ if cross_attn_head_mask is not None
2110
+ else None
2111
+ ),
2112
  None,
2113
  output_attentions,
2114
  use_cache,
 
2121
  encoder_attention_mask=encoder_attention_mask,
2122
  layer_head_mask=(head_mask[idx] if head_mask is not None else None),
2123
  cross_attn_layer_head_mask=(
2124
+ cross_attn_head_mask[idx]
2125
+ if cross_attn_head_mask is not None
2126
+ else None
2127
  ),
2128
  past_key_value=past_key_value,
2129
  output_attentions=output_attentions,
 
2148
  if not return_dict:
2149
  return tuple(
2150
  v
2151
+ for v in [
2152
+ hidden_states,
2153
+ next_cache,
2154
+ all_hidden_states,
2155
+ all_self_attns,
2156
+ all_cross_attentions,
2157
+ ]
2158
  if v is not None
2159
  )
2160
  return BaseModelOutputWithPastAndCrossAttentions(
 
2232
  input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
2233
  )
2234
 
2235
+ output_attentions = (
2236
+ output_attentions
2237
+ if output_attentions is not None
2238
+ else self.config.output_attentions
2239
+ )
2240
  output_hidden_states = (
2241
+ output_hidden_states
2242
+ if output_hidden_states is not None
2243
+ else self.config.output_hidden_states
2244
  )
2245
  use_cache = use_cache if use_cache is not None else self.config.use_cache
2246
+ return_dict = (
2247
+ return_dict if return_dict is not None else self.config.use_return_dict
2248
+ )
2249
 
2250
  if encoder_outputs is None:
2251
  encoder_outputs = self.encoder(
 
2298
 
2299
  class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
2300
  base_model_prefix = "model"
2301
+ _tied_weights_keys = [
2302
+ "encoder.embed_tokens.weight",
2303
+ "decoder.embed_tokens.weight",
2304
+ "lm_head.weight",
2305
+ ]
2306
  _keys_to_ignore_on_load_missing = ["final_logits_bias"]
2307
 
2308
  def __init__(self, config: Florence2LanguageConfig):
2309
  super().__init__(config)
2310
  self.model = Florence2LanguageModel(config)
2311
+ self.register_buffer(
2312
+ "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))
2313
+ )
2314
+ self.lm_head = nn.Linear(
2315
+ config.d_model, self.model.shared.num_embeddings, bias=False
2316
+ )
2317
 
2318
  # Initialize weights and apply final processing
2319
  self.post_init()
 
2324
  def get_decoder(self):
2325
  return self.model.get_decoder()
2326
 
2327
+ def resize_token_embeddings(
2328
+ self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None
2329
+ ) -> nn.Embedding:
2330
+ new_embeddings = super().resize_token_embeddings(
2331
+ new_num_tokens, pad_to_multiple_of
2332
+ )
2333
  self._resize_final_logits_bias(new_embeddings.weight.shape[0])
2334
  return new_embeddings
2335
 
 
2338
  if new_num_tokens <= old_num_tokens:
2339
  new_bias = self.final_logits_bias[:, :new_num_tokens]
2340
  else:
2341
+ extra_bias = torch.zeros(
2342
+ (1, new_num_tokens - old_num_tokens),
2343
+ device=self.final_logits_bias.device,
2344
+ )
2345
  new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
2346
  self.register_buffer("final_logits_bias", new_bias)
2347
 
 
2378
 
2379
  Returns:
2380
  """
2381
+ return_dict = (
2382
+ return_dict if return_dict is not None else self.config.use_return_dict
2383
+ )
2384
 
2385
  if labels is not None:
2386
  if use_cache:
2387
+ logger.warning(
2388
+ "The `use_cache` argument is changed to `False` since `labels` is provided."
2389
+ )
2390
  use_cache = False
2391
  if decoder_input_ids is None and decoder_inputs_embeds is None:
2392
  decoder_input_ids = shift_tokens_right(
 
2418
  if labels is not None:
2419
  labels = labels.to(lm_logits.device)
2420
  loss_fct = CrossEntropyLoss()
2421
+ masked_lm_loss = loss_fct(
2422
+ lm_logits.view(-1, self.config.vocab_size), labels.view(-1)
2423
+ )
2424
 
2425
  if not return_dict:
2426
  output = (lm_logits,) + outputs[1:]
2427
+ return (
2428
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
2429
+ )
2430
 
2431
  return Seq2SeqLMOutput(
2432
  loss=masked_lm_loss,
 
2480
  }
2481
 
2482
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
2483
+ return shift_tokens_right(
2484
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
2485
+ )
2486
 
2487
  @staticmethod
2488
  def _reorder_cache(past_key_values, beam_idx):
 
2490
  for layer_past in past_key_values:
2491
  # cached cross_attention states don't have to be reordered -> they are always the same
2492
  reordered_past += (
2493
+ tuple(
2494
+ past_state.index_select(0, beam_idx.to(past_state.device))
2495
+ for past_state in layer_past[:2]
2496
+ )
2497
  + layer_past[2:],
2498
  )
2499
  return reordered_past
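
index_select on the batch axis is what keeps cached keys/values aligned with the surviving beams; a toy example:

    import torch

    past_state = torch.arange(6.0).view(3, 2)   # 3 beams, one tiny cached tensor
    beam_idx = torch.tensor([2, 0, 0])          # new beams take their cache from old beams 2, 0, 0
    print(past_state.index_select(0, beam_idx))
    # tensor([[4., 5.],
    #         [0., 1.],
    #         [0., 1.]])
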
2500
 
2501
+
2502
  @dataclass
2503
  class Florence2Seq2SeqLMOutput(ModelOutput):
2504
  """
 
2555
  image_hidden_states of the model produced by the vision encoder
2556
  """
2557
 
2558
+ loss: torch.FloatTensor = None
2559
  last_hidden_state: torch.FloatTensor = None
2560
  past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
2561
  decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
 
2675
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2676
  """
2677
 
2678
+
2679
  @add_start_docstrings(
2680
  """The FLORENCE2 vision model without any head""",
2681
  FLORENCE2_START_DOCSTRING,
 
2683
  class Florence2VisionModel(Florence2PreTrainedModel):
2684
  def __init__(self, config: Florence2VisionConfig):
2685
  super().__init__(config)
2686
+ assert config.model_type == "davit", "only DaViT is supported for now"
2687
  self.vision_tower = DaViT.from_config(config=config)
2688
 
2689
  self.post_init()
2690
+
2691
  def forward(self, pixel_values):
2692
  if len(pixel_values.shape) == 4:
2693
  x = self.vision_tower.forward_features_unpool(pixel_values)
2694
  else:
2695
+ raise ValueError(f"invalid image shape {pixel_values.shape}")
2696
  return x
2697
 
2698
 
 
2703
  class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2704
  def __init__(self, config: Florence2VisionConfig):
2705
  super().__init__(config)
2706
+ assert config.model_type == "davit", "only DaViT is supported for now"
2707
  self.vision_tower = DaViT.from_config(config=config)
2708
 
2709
  self._build_image_projection_layers(config)
2710
 
2711
  self.post_init()
2712
+
2713
  def _build_image_projection_layers(self, config):
2714
  image_dim_out = config.dim_embed[-1]
2715
  dim_projection = config.projection_dim
2716
+ self.image_projection = nn.Parameter(torch.empty(image_dim_out, dim_projection))
 
 
2717
  self.image_proj_norm = nn.LayerNorm(dim_projection)
2718
  image_pos_embed_config = config.image_pos_embed
2719
+ if image_pos_embed_config["type"] == "learned_abs_2d":
2720
  self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
2721
  embedding_dim=image_dim_out,
2722
+ num_pos=image_pos_embed_config["max_pos_embeddings"],
2723
  )
2724
  else:
2725
+ raise NotImplementedError("Not implemented yet")
2726
 
2727
  self.image_feature_source = config.image_feature_source
2728
 
2729
  # temporal embedding
2730
  visual_temporal_embedding_config = config.visual_temporal_embedding
2731
+ if visual_temporal_embedding_config["type"] == "COSINE":
2732
  self.visual_temporal_embed = PositionalEmbeddingCosine1D(
2733
  embed_dim=image_dim_out,
2734
+ max_seq_len=visual_temporal_embedding_config["max_temporal_embeddings"],
2735
  )
2736
  else:
2737
+ raise NotImplementedError("Not implemented yet")
2738
 
2739
  def forward(self, pixel_values):
2740
  if len(pixel_values.shape) == 4:
 
2742
  T = 1
2743
  x = self.vision_tower.forward_features_unpool(pixel_values)
2744
  else:
2745
+ raise ValueError(f"invalid image shape {pixel_values.shape}")
2746
+
2747
  if self.image_pos_embed is not None:
2748
  x = x.view(batch_size * T, -1, x.shape[-1])
2749
  num_tokens = x.shape[-2]
2750
+ h, w = int(num_tokens**0.5), int(num_tokens**0.5)
2751
+ assert h * w == num_tokens, "only support square feature maps for now"
2752
  x = x.view(batch_size * T, h, w, x.shape[-1])
2753
  pos_embed = self.image_pos_embed(x)
2754
  x = x + pos_embed
2755
+ x = x.view(batch_size, T * h * w, x.shape[-1])
2756
 
2757
  if self.visual_temporal_embed is not None:
2758
+ visual_temporal_embed = self.visual_temporal_embed(
2759
+ x.view(batch_size, T, -1, x.shape[-1])[:, :, 0]
2760
+ )
2761
+ x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(
2762
+ 1, T, 1, x.shape[-1]
2763
+ )
2764
 
2765
  x_feat_dict = {}
2766
 
2767
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2768
+ x_feat_dict["spatial_avg_pool"] = spatial_avg_pool_x
2769
 
2770
  temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
2771
+ x_feat_dict["temporal_avg_pool"] = temporal_avg_pool_x
2772
 
2773
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
2774
+ x_feat_dict["last_frame"] = x
2775
 
2776
  new_x = []
2777
  for _image_feature_source in self.image_feature_source:
2778
  if _image_feature_source not in x_feat_dict:
2779
+ raise ValueError(
2780
+ "invalid image feature source: {}".format(_image_feature_source)
2781
+ )
2782
  new_x.append(x_feat_dict[_image_feature_source])
2783
 
2784
  x = torch.cat(new_x, dim=1)
 
2786
  x = x @ self.image_projection
2787
  x = self.image_proj_norm(x)
2788
 
 
2789
  return x
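
A shape-level sketch of the pooling and feature-source selection above, with toy sizes; the dictionary keys match the ones built in the code, and the selected sources are just an example of what image_feature_source may contain:

    import torch

    batch_size, T, num_tokens, dim = 1, 1, 576, 1024      # toy sizes
    x = torch.randn(batch_size, T, num_tokens, dim)
    x_feat_dict = {
        "spatial_avg_pool": x.mean(dim=2),                # (1, T, dim): one token per frame
        "temporal_avg_pool": x.mean(dim=1),               # (1, num_tokens, dim)
        "last_frame": x[:, -1],                           # (1, num_tokens, dim)
    }
    selected = torch.cat(
        [x_feat_dict[s] for s in ("spatial_avg_pool", "temporal_avg_pool")], dim=1
    )
    print(selected.shape)                                 # torch.Size([1, 577, 1024])
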
2790
 
2791
 
 
2792
  @add_start_docstrings(
2793
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2794
  FLORENCE2_START_DOCSTRING,
 
2796
  class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2797
  def __init__(self, config: Florence2Config):
2798
  super().__init__(config)
2799
+ assert (
2800
+ config.vision_config.model_type == "davit"
2801
+ ), "only DaViT is supported for now"
2802
  del config.vision_config.model_type
2803
  self.vision_tower = DaViT.from_config(config=config.vision_config)
2804
+ # remove unused layers
2805
  del self.vision_tower.head
2806
  del self.vision_tower.norms
2807
 
 
2809
  self._attn_implementation = config._attn_implementation
2810
  self._build_image_projection_layers(config)
2811
 
2812
+ language_model = Florence2LanguageForConditionalGeneration(
2813
+ config=config.text_config
2814
+ )
2815
 
2816
  if language_model._tied_weights_keys is not None:
2817
+ self._tied_weights_keys = [
2818
+ f"language_model.{k}" for k in language_model._tied_weights_keys
2819
+ ]
2820
  self.language_model = language_model
2821
 
2822
+ self.pad_token_id = (
2823
+ self.config.pad_token_id if self.config.pad_token_id is not None else -1
2824
+ )
2825
  self.post_init()
2826
+
2827
  def _build_image_projection_layers(self, config):
2828
  image_dim_out = config.vision_config.dim_embed[-1]
2829
  dim_projection = config.vision_config.projection_dim
2830
+ self.image_projection = nn.Parameter(torch.empty(image_dim_out, dim_projection))
 
 
2831
  self.image_proj_norm = nn.LayerNorm(dim_projection)
2832
  image_pos_embed_config = config.vision_config.image_pos_embed
2833
+ if image_pos_embed_config["type"] == "learned_abs_2d":
2834
  self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
2835
  embedding_dim=image_dim_out,
2836
+ num_pos=image_pos_embed_config["max_pos_embeddings"],
2837
  )
2838
  else:
2839
+ raise NotImplementedError("Not implemented yet")
2840
 
2841
  self.image_feature_source = config.vision_config.image_feature_source
2842
 
2843
  # temporal embedding
2844
+ visual_temporal_embedding_config = (
2845
+ config.vision_config.visual_temporal_embedding
2846
+ )
2847
+ if visual_temporal_embedding_config["type"] == "COSINE":
2848
  self.visual_temporal_embed = PositionalEmbeddingCosine1D(
2849
  embed_dim=image_dim_out,
2850
+ max_seq_len=visual_temporal_embedding_config["max_temporal_embeddings"],
2851
  )
2852
  else:
2853
+ raise NotImplementedError("Not implemented yet")
2854
 
2855
  def get_encoder(self):
2856
  return self.language_model.get_encoder()
 
2861
  def get_input_embeddings(self):
2862
  return self.language_model.get_input_embeddings()
2863
 
2864
+ def resize_token_embeddings(
2865
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None
2866
+ ) -> nn.Embedding:
2867
+ model_embeds = self.language_model.resize_token_embeddings(
2868
+ new_num_tokens, pad_to_multiple_of
2869
+ )
2870
  # update vocab size
2871
  self.config.text_config.vocab_size = model_embeds.num_embeddings
2872
  self.config.vocab_size = model_embeds.num_embeddings
2873
  self.vocab_size = model_embeds.num_embeddings
2874
  return model_embeds
2875
+
2876
  def _encode_image(self, pixel_values):
2877
  if len(pixel_values.shape) == 4:
2878
  batch_size, C, H, W = pixel_values.shape
2879
  T = 1
2880
  x = self.vision_tower.forward_features_unpool(pixel_values)
2881
  else:
2882
+ raise ValueError(f"invalid image shape {pixel_values.shape}")
2883
+
2884
  if self.image_pos_embed is not None:
2885
  x = x.view(batch_size * T, -1, x.shape[-1])
2886
  num_tokens = x.shape[-2]
2887
+ h, w = int(num_tokens**0.5), int(num_tokens**0.5)
2888
+ assert h * w == num_tokens, "only support square feature maps for now"
2889
  x = x.view(batch_size * T, h, w, x.shape[-1])
2890
  pos_embed = self.image_pos_embed(x)
2891
  x = x + pos_embed
2892
+ x = x.view(batch_size, T * h * w, x.shape[-1])
2893
 
2894
  if self.visual_temporal_embed is not None:
2895
+ visual_temporal_embed = self.visual_temporal_embed(
2896
+ x.view(batch_size, T, -1, x.shape[-1])[:, :, 0]
2897
+ )
2898
+ x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(
2899
+ 1, T, 1, x.shape[-1]
2900
+ )
2901
 
2902
  x_feat_dict = {}
2903
 
2904
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2905
+ x_feat_dict["spatial_avg_pool"] = spatial_avg_pool_x
2906
 
2907
  temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
2908
+ x_feat_dict["temporal_avg_pool"] = temporal_avg_pool_x
2909
 
2910
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
2911
+ x_feat_dict["last_frame"] = x
2912
 
2913
  new_x = []
2914
  for _image_feature_source in self.image_feature_source:
2915
  if _image_feature_source not in x_feat_dict:
2916
+ raise ValueError(
2917
+ "invalid image feature source: {}".format(_image_feature_source)
2918
+ )
2919
  new_x.append(x_feat_dict[_image_feature_source])
2920
 
2921
  x = torch.cat(new_x, dim=1)
 
2923
  x = x @ self.image_projection
2924
  x = self.image_proj_norm(x)
2925
 
2926
+ return x
2927
 
2928
+ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds):
 
 
2929
  batch_size, image_token_length = image_features.size()[:-1]
2930
  device = image_features.device
2931
  image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
 
2936
  return image_features, image_attention_mask
2937
 
2938
  task_prefix_embeds = inputs_embeds
2939
+ task_prefix_attention_mask = torch.ones(
2940
+ batch_size, task_prefix_embeds.size(1), device=device
2941
+ )
2942
 
2943
  if len(task_prefix_attention_mask.shape) == 3:
2944
  task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
2945
 
2946
  # concat [image embeds, task prefix embeds]
2947
  inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
2948
+ attention_mask = torch.cat(
2949
+ [image_attention_mask, task_prefix_attention_mask], dim=1
2950
+ )
2951
 
2952
  return inputs_embeds, attention_mask
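
At the tensor level, the merge above just prepends the image tokens to the prompt embeddings and gives them an all-ones mask; with toy sizes:

    import torch

    batch_size, num_image_tokens, prompt_len, d_model = 2, 577, 12, 1024   # toy sizes
    image_features = torch.randn(batch_size, num_image_tokens, d_model)
    task_prefix_embeds = torch.randn(batch_size, prompt_len, d_model)
    inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
    attention_mask = torch.cat(
        [torch.ones(batch_size, num_image_tokens), torch.ones(batch_size, prompt_len)],
        dim=1,
    )
    print(inputs_embeds.shape, attention_mask.shape)      # (2, 589, 1024) (2, 589)
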
2953
 
 
2954
  @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
2955
+ @replace_return_docstrings(
2956
+ output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
2957
+ )
2958
  def forward(
2959
  self,
2960
  input_ids: torch.LongTensor = None,
 
3005
  >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
3006
  "A green car parked in front of a yellow building."
3007
  ```"""
3008
+ output_attentions = (
3009
+ output_attentions
3010
+ if output_attentions is not None
3011
+ else self.config.output_attentions
3012
+ )
3013
  output_hidden_states = (
3014
+ output_hidden_states
3015
+ if output_hidden_states is not None
3016
+ else self.config.output_hidden_states
3017
+ )
3018
+ return_dict = (
3019
+ return_dict if return_dict is not None else self.config.use_return_dict
3020
  )
 
3021
 
3022
  image_features = None
3023
  if inputs_embeds is None:
 
3028
  if pixel_values is not None:
3029
  # (batch_size, num_image_tokens, hidden_size)
3030
  image_features = self._encode_image(pixel_values)
3031
+ inputs_embeds, attention_mask = (
3032
+ self._merge_input_ids_with_image_features(
3033
+ image_features, inputs_embeds
3034
+ )
3035
+ )
3036
 
3037
  attention_mask = attention_mask.to(inputs_embeds.dtype)
3038
  outputs = self.language_model(
 
3060
  output = (logits,) + outputs[1:]
3061
  return (loss,) + output if loss is not None else output
3062
 
3064
+
3065
  return Florence2Seq2SeqLMOutput(
3066
  loss=loss,
3067
  logits=logits,
 
3072
  encoder_last_hidden_state=outputs.encoder_last_hidden_state,
3073
  encoder_hidden_states=outputs.encoder_hidden_states,
3074
  encoder_attentions=outputs.encoder_attentions,
3075
+ image_hidden_states=image_features,
3076
  )
3077
 
3078
+ def generate(self, input_ids, inputs_embeds=None, pixel_values=None, **kwargs):
3079
 
3080
  if inputs_embeds is None:
3081
  # 1. Extract the input embeddings
 
3084
  # 2. Merge text and images
3085
  if pixel_values is not None:
3086
  image_features = self._encode_image(pixel_values)
3087
+ inputs_embeds, attention_mask = (
3088
+ self._merge_input_ids_with_image_features(
3089
+ image_features, inputs_embeds
3090
+ )
3091
+ )
3092
+
3093
  return self.language_model.generate(
3094
+ input_ids=None, inputs_embeds=inputs_embeds, **kwargs
 
 
3095
  )
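
For context, the way this generate is normally reached from user code (a sketch; the model id and task prompt are examples, and the checkpoint has to be loaded with trust_remote_code=True so this file is used):

    import torch
    from PIL import Image
    from transformers import AutoModelForCausalLM, AutoProcessor

    model_id = "microsoft/Florence-2-base"                # example checkpoint
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    image = Image.new("RGB", (640, 480))                  # any RGB image works here
    inputs = processor(text="<CAPTION>", images=image, return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=64,
            num_beams=3,
        )
    print(processor.batch_decode(generated_ids, skip_special_tokens=False)[0])
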
3096
 
3097
  def prepare_inputs_for_generation(
 
3120
  remove_prefix_length = decoder_input_ids.shape[1] - 1
3121
 
3122
  decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
3123
+
3124
  return {
3125
  "input_ids": None, # encoder_outputs is defined. input_ids not needed
3126
  "encoder_outputs": encoder_outputs,
 
3134
  "cross_attn_head_mask": cross_attn_head_mask,
3135
  "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
3136
  }
3137
+
3138
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
3139
  return self.language_model.shift_tokens_right(labels)
3140
 
3141
  def _reorder_cache(self, *args, **kwargs):
3142
+ return self.language_model._reorder_cache(*args, **kwargs)