x54-729 committed on
Commit 497af06
1 Parent(s): f3a2a3f

Update configuration_internlm.py

Files changed (1)
  1. configuration_internlm.py +8 -33
configuration_internlm.py CHANGED
@@ -19,9 +19,8 @@
# limitations under the License.
""" InternLM model configuration"""

-from transformers.utils import logging
from transformers.configuration_utils import PretrainedConfig
-
+from transformers.utils import logging

logger = logging.get_logger(__name__)

@@ -30,14 +29,11 @@ INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}

class InternLMConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate an InternLM
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the InternLM-7B.
-
+    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
+    an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
-
-
    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
@@ -50,19 +46,6 @@ class InternLMConfig(PretrainedConfig):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        pretraining_tp (`int`, *optional*, defaults to `1`):
-            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
-            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
-            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
-            issue](https://github.com/pytorch/pytorch/issues/76232).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
@@ -78,30 +61,25 @@ class InternLMConfig(PretrainedConfig):
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
    Example:
-
    ```python
    >>> from transformers import InternLMModel, InternLMConfig
-
    >>> # Initializing a InternLM internlm-7b style configuration
    >>> configuration = InternLMConfig()
-
    >>> # Initializing a model from the internlm-7b style configuration
    >>> model = InternLMModel(configuration)
-
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "internlm"
    _auto_class = "AutoConfig"

-    def __init__(
+    def __init__(  # pylint: disable=W0102
        self,
        vocab_size=103168,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
-        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
@@ -112,6 +90,7 @@ class InternLMConfig(PretrainedConfig):
        eos_token_id=2,
        tie_word_embeddings=False,
        bias=True,
+        rotary={"base": 10000, "type": "dynamic"},  # pylint: disable=W0102
        **kwargs,
    ):
        self.vocab_size = vocab_size
@@ -120,20 +99,16 @@ class InternLMConfig(PretrainedConfig):
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
-
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.bias = bias
+        self.rotary = rotary
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
-        )
+        )
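
For context, a minimal usage sketch of the updated configuration (not part of the commit). It assumes `configuration_internlm.py` as changed above is importable from the working directory, and that the accompanying `modeling_internlm.py` consumes `config.rotary` to pick the rotary-embedding base and scaling type; `"dynamic"` is the default added here, and which other `type` strings are accepted is an assumption about the modeling code.

```python
# Hypothetical usage sketch for the updated InternLMConfig (not from the commit).
# Assumes configuration_internlm.py (as changed above) is in the working directory.
from configuration_internlm import InternLMConfig

# Default construction: rotary embeddings with base 10000 and "dynamic" scaling.
config = InternLMConfig()
print(config.rotary)  # {'base': 10000, 'type': 'dynamic'}

# num_key_value_heads is no longer set by __init__ after this change.
print(hasattr(config, "num_key_value_heads"))  # False

# Override the rotary settings at construction time, e.g. a larger base.
config = InternLMConfig(rotary={"base": 1000000, "type": "dynamic"})

# The extra attribute round-trips through PretrainedConfig serialization.
config.save_pretrained("./internlm-config")
reloaded = InternLMConfig.from_pretrained("./internlm-config")
assert reloaded.rotary == {"base": 1000000, "type": "dynamic"}
```

Note that the mutable dict default (the reason for the `# pylint: disable=W0102` comments in the diff) is shared by every instance constructed without an explicit `rotary` argument, which is harmless as long as callers never mutate `config.rotary` in place.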