Fixing the AutoTokenizer issue, changing docstrings
- config.json +1 -4
- configuration_phi3_small.py +78 -40
- modeling_phi3_small.py +4 -4
config.json
CHANGED
@@ -7,10 +7,7 @@
   "auto_map": {
     "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
     "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
-    "AutoTokenizer": [
-      "tokenization_phi3_small.Phi3SmallTokenizer",
-      "tokenization_phi3_small.Phi3SmallTokenizer"
-    ]
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
   },
   "blocksparse_block_size": 64,
   "blocksparse_homo_head_pattern": false,
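With the `auto_map` entry reduced to a single class path, `AutoTokenizer.from_pretrained(..., trust_remote_code=True)` can resolve the custom tokenizer directly. A minimal sketch of loading through the fixed mapping; the repository id below is a placeholder and not part of this commit:

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id; substitute the repository this commit was pushed to.
repo_id = "microsoft/Phi-3-small-8k-instruct"

# trust_remote_code=True makes transformers follow the "auto_map" entries in
# config.json and import Phi3SmallConfig, Phi3SmallTokenizer and
# Phi3SmallForCausalLM from the files shipped alongside it.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hello from Phi-3-small!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```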
configuration_phi3_small.py
CHANGED
@@ -29,49 +29,89 @@ def next_mult(x, y):

 class Phi3SmallConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a
-    instantiate a
-    configuration with the defaults will yield a similar configuration to that of the
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.

     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.


     Args:
-        vocab_size (`int`, *optional*, defaults to
-            Vocabulary size of the
-            `inputs_ids` passed when calling
-            The maximum sequence length that this model might
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon
+            The epsilon value for layer normalization.
         initializer_range (`float`, *optional*, defaults to 0.02):
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.

     Example:

@@ -86,7 +126,8 @@ class Phi3SmallConfig(PretrainedConfig):

     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```
+    ```
+    """

     model_type = "phi3small"
     keys_to_ignore_at_inference = ["past_key_values"]
@@ -113,7 +154,7 @@ class Phi3SmallConfig(PretrainedConfig):
         gegelu_pad_to_256: bool = True,
         ff_dim_multiplier: Optional[int] = None,
         ff_intermediate_size: Optional[int] = 14336,
-        # Block Sparse Attention
+        # Block Sparse Attention Parameters
         blocksparse_homo_head_pattern: bool = False,
         blocksparse_block_size: int = 64,
         blocksparse_num_local_blocks: int = 16,
@@ -161,7 +202,6 @@ class Phi3SmallConfig(PretrainedConfig):
         self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
         # Frequency of block sparsity
         self.dense_attention_every_n_layers = dense_attention_every_n_layers
-
         # Activation function
         self.hidden_act = hidden_act
         self.gegelu_limit = gegelu_limit
@@ -176,10 +216,8 @@ class Phi3SmallConfig(PretrainedConfig):
         self.embedding_dropout_prob = embedding_dropout_prob
         self.attention_dropout_prob = attention_dropout_prob
         self.ffn_dropout_prob = ffn_dropout_prob
-
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-
         # MuP parameters
         self.mup_use_scaling = mup_use_scaling
         self.mup_width_multiplier = mup_width_multiplier
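The rewritten docstring also spells out how the block-sparse defaults combine: the local attention window is `blocksparse_num_local_blocks * blocksparse_block_size` tokens, and `dense_attention_every_n_layers` sets how often a layer falls back to dense attention. A small sketch of that arithmetic, assuming the configuration file is importable locally and that the attributes keep the names used in the docstring:

```python
# Assumes configuration_phi3_small.py is on the Python path; the import is illustrative.
from configuration_phi3_small import Phi3SmallConfig

config = Phi3SmallConfig()  # defaults documented in the docstring above

# Per the docstring, block-sparse attention attends to a local window of
# blocksparse_num_local_blocks * blocksparse_block_size tokens.
local_window = config.blocksparse_num_local_blocks * config.blocksparse_block_size
print(local_window)  # 16 * 64 = 1024 tokens with the defaults

# dense_attention_every_n_layers = 2 means every second layer uses dense
# attention; the exact layer indexing lives in modeling_phi3_small.py.
print(config.num_hidden_layers, config.dense_attention_every_n_layers)  # 32, 2
```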
modeling_phi3_small.py
CHANGED
@@ -155,7 +155,7 @@ class Phi3SmallMLP(nn.Module):
     def __init__(self, config: Phi3SmallConfig):
         super().__init__()
         self.config = config
-        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the
+        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the Phi-3-small model .."
         self.hidden_size = config.hidden_size
         self.gegelu_limit = config.gegelu_limit
         self.intermediate_size = config.intermediate_size
@@ -415,7 +415,7 @@ class Phi3SmallSelfAttention(nn.Module):

         .. note::
             Right now, am assuming the expansion for the query key values is already done
-            outside. But ideally, since Flash attention handles the
+            outside. But ideally, since Flash attention handles the GQA correctly, we can
             avoid doing that.

         """
@@ -496,11 +496,11 @@ class Phi3SmallSelfAttention(nn.Module):
             torch.Tensor: The expanded key-value tensor of shape (bsz, nqp, 2, seq_len, hdim).
                 Where nqp = num_q_per_kv * nkp

-        .. note::
+        .. note(bapatra)::
             Right now, I am using a repeat_interleave to expand the kv to the size of q.
             This incurs a memory penalty, since the tensors are actually copied.
             TODO: If this does yield benefits, then potentially we can use the re-written
-            flash attention kernel that can handle
+            flash attention kernel that can handle GQA.
         """

         repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
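The key-value expansion described in the note can be reproduced in isolation. A standalone sketch using the shapes named in the docstring, `(bsz, nkp, 2, seq_len, hdim) -> (bsz, nqp, 2, seq_len, hdim)` with `nqp = num_q_per_kv * nkp`; the sizes are illustrative, and only the `repeat_interleave` call mirrors the line shown above:

```python
import torch

# Illustrative sizes; with the config defaults, 32 query heads over
# 8 key-value heads gives num_q_per_kv = 4.
bsz, nkp, seq_len, hdim = 2, 8, 128, 128
num_q_per_kv = 4

# Keys and values packed along dim=2: (bsz, nkp, 2, seq_len, hdim)
kv = torch.randn(bsz, nkp, 2, seq_len, hdim)

# Same expansion as the repeats / repeat_interleave lines above: each KV head
# is repeated num_q_per_kv times along the head dimension. The tensors are
# actually copied, which is the memory penalty the note calls out.
repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
expanded_kv = torch.repeat_interleave(kv, repeats, dim=1)

assert expanded_kv.shape == (bsz, num_q_per_kv * nkp, 2, seq_len, hdim)
```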