bapatra committed
Commit 25ca391
1 Parent(s): 4443628

Upload 3 files

Fixing the AutoTokenizer issue, changing docstrings

config.json CHANGED
@@ -7,10 +7,7 @@
   "auto_map": {
     "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
     "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
-    "AutoTokenizer": [
-      "tokenization_phi3_small.Phi3SmallTokenizer",
-      "tokenization_phi3_small.Phi3SmallTokenizer"
-    ]
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
   },
   "blocksparse_block_size": 64,
   "blocksparse_homo_head_pattern": false,
configuration_phi3_small.py CHANGED
@@ -29,49 +29,89 @@ def next_mult(x, y):
 
 class Phi3SmallConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
-    instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the GPT-2
-    [gpt2](https://huggingface.co/gpt2) architecture.
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to 50257):
-            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
-        n_positions (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        n_embd (`int`, *optional*, defaults to 768):
-            Dimensionality of the embeddings and hidden states.
-        n_layer (`int`, *optional*, defaults to 12):
-            Number of hidden layers in the Transformer encoder.
-        n_head (`int`, *optional*, defaults to 12):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        n_inner (`int`, *optional*, defaults to None):
-            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
-        activation_function (`str`, *optional*, defaults to `"gelu"`):
-            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
-        resid_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout ratio for the embeddings.
-        attn_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout ratio for the attention.
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Rotary Position Embedding).
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GeGELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in block-sparse attention equals `blocksparse_num_local_blocks * blocksparse_block_size`.
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of fully dense attention layers in the model.
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon to use in the layer normalization layers.
+            The epsilon value for layer normalization.
         initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
-            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
-        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
-            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
-            dot-product/softmax to float() when training with mixed precision.
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`float`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`float`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`float`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use a key/value cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
 
     Example:
 
@@ -86,7 +126,8 @@ class Phi3SmallConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```"""
+    ```
+    """
 
     model_type = "phi3small"
     keys_to_ignore_at_inference = ["past_key_values"]
@@ -113,7 +154,7 @@ class Phi3SmallConfig(PretrainedConfig):
         gegelu_pad_to_256: bool = True,
         ff_dim_multiplier: Optional[int] = None,
         ff_intermediate_size: Optional[int] = 14336,
-        # Block Sparse Attention
+        # Block Sparse Attention Parameters
        blocksparse_homo_head_pattern: bool = False,
         blocksparse_block_size: int = 64,
         blocksparse_num_local_blocks: int = 16,
@@ -161,7 +202,6 @@ class Phi3SmallConfig(PretrainedConfig):
         self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
         # Frequency of block sparsity
         self.dense_attention_every_n_layers = dense_attention_every_n_layers
-
         # Activation function
         self.hidden_act = hidden_act
         self.gegelu_limit = gegelu_limit
@@ -176,10 +216,8 @@ class Phi3SmallConfig(PretrainedConfig):
         self.embedding_dropout_prob = embedding_dropout_prob
         self.attention_dropout_prob = attention_dropout_prob
         self.ffn_dropout_prob = ffn_dropout_prob
-
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-
         # MuP parameters
         self.mup_use_scaling = mup_use_scaling
         self.mup_width_multiplier = mup_width_multiplier
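
As a companion to the rewritten docstring, here is a minimal sketch (not part of this commit) of the documented fields in use; it assumes `configuration_phi3_small.py` from this repo is on the Python path and only touches attributes that appear in the diff above:

```python
from configuration_phi3_small import Phi3SmallConfig

config = Phi3SmallConfig(
    num_attention_heads=32,        # query heads (docstring default)
    num_key_value_heads=8,         # grouped-query attention: 4 query heads share each KV head
    blocksparse_block_size=64,
    blocksparse_num_local_blocks=16,
    dense_attention_every_n_layers=2,
)

# Per the docstring, the block-sparse local attention window spans
# blocksparse_num_local_blocks * blocksparse_block_size tokens.
local_window = config.blocksparse_num_local_blocks * config.blocksparse_block_size
print(local_window)  # 1024
```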
modeling_phi3_small.py CHANGED
@@ -155,7 +155,7 @@ class Phi3SmallMLP(nn.Module):
     def __init__(self, config: Phi3SmallConfig):
         super().__init__()
         self.config = config
-        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the 4.7 series of models .."
+        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the Phi-3-small model .."
         self.hidden_size = config.hidden_size
         self.gegelu_limit = config.gegelu_limit
         self.intermediate_size = config.intermediate_size
@@ -415,7 +415,7 @@ class Phi3SmallSelfAttention(nn.Module):
 
         .. note::
             Right now, am assuming the expansion for the query key values is already done
-            outside. But ideally, since Flash attention handles the MQA correctly, we can
+            outside. But ideally, since Flash attention handles the GQA correctly, we can
            avoid doing that.
 
         """
@@ -496,11 +496,11 @@ class Phi3SmallSelfAttention(nn.Module):
            torch.Tensor: The expanded key-value tensor of shape (bsz, nqp, 2, seq_len, hdim).
                Where nqp = num_q_per_kv * nkp
 
-        .. note::
+        .. note(bapatra)::
            Right now, I am using a repeat_interleave to expand the kv to the size of q.
            This incurs a memory penalty, since the tensors are actually copied.
            TODO: If this does yield benefits, then potentially we can use the re-written
-            flash attention kernel that can handle the MQA.
+            flash attention kernel that can handle GQA.
         """
 
         repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
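
The note in the last hunk describes expanding the key/value heads with `repeat_interleave` so grouped-query attention can be fed to a kernel that expects one KV head per query head. A standalone toy sketch of that expansion (not the model's actual helper; shapes follow the docstring's `(bsz, nkp, 2, seq_len, hdim)` convention, and the integer `repeats` form is equivalent to the per-head tensor built in the code when every head is repeated equally):

```python
import torch

bsz, nkp, seq_len, hdim = 2, 8, 16, 128   # toy sizes; nkp = number of KV heads
num_q_per_kv = 4                          # e.g. 32 query heads / 8 KV heads

# kv packs keys and values along dim 2, matching the (bsz, nkp, 2, seq_len, hdim) shape
kv = torch.randn(bsz, nkp, 2, seq_len, hdim)

# Copy every KV head num_q_per_kv times along the head dimension. The copies are
# materialized, which is the memory penalty the docstring mentions.
expanded_kv = torch.repeat_interleave(kv, repeats=num_q_per_kv, dim=1)
print(expanded_kv.shape)  # torch.Size([2, 32, 2, 16, 128])
```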