LeroyDyer committed
Commit 1f1e352
1 Parent(s): 95e674e

Update configuration_mistral.py

Files changed (1)
  1. configuration_mistral.py +173 -1

configuration_mistral.py CHANGED
@@ -18,6 +18,13 @@ from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
 
+
+
+QUIET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "quietai/Quiet-7B-v0.1": "https://huggingface.co/quietai/Quiet-7B-v0.1/resolve/main/config.json",
+    "quietai/Quiet-7B-Instruct-v0.1": "https://huggingface.co/quietai/Quiet-7B-Instruct-v0.1/resolve/main/config.json",
+}
+
 logger = logging.get_logger(__name__)
 
 MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
@@ -178,4 +185,169 @@ class MistralConfig(PretrainedConfig):
         if rope_scaling_type == "yarn" or rope_scaling_type == "dynamic-yarn":
             original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None)
             if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
-                raise ValueError(f"`rope_scaling.original_max_position_embeddings` must be set to an int when using yarn, and dynamic-yarn")
+                raise ValueError(f"`rope_scaling.original_max_position_embeddings` must be set to an int when using yarn, and dynamic-yarn")
+
+
+# coding=utf-8
+# Copyright 2023 Quiet AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Quiet model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+
+class QuietConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`QuietModel`]. It is used to instantiate a
+    Quiet model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Quiet-7B-v0.1 or Quiet-7B-Instruct-v0.1.
+    [quietai/Quiet-7B-v0.1](https://huggingface.co/quietai/Quiet-7B-v0.1)
+    [quietai/Quiet-7B-Instruct-v0.1](https://huggingface.co/quietai/Quiet-7B-Instruct-v0.1)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Quiet model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`QuietModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+            The maximum sequence length that this model might ever be used with. Quiet's sliding window attention
+            allows sequences of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention window size. If not specified, will default to `4096`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    ```python
+    >>> from transformers import QuietModel, QuietConfig
+    >>> # Initializing a Quiet 7B style configuration
+    >>> configuration = QuietConfig()
+    >>> # Initializing a model from the Quiet 7B style configuration
+    >>> model = QuietModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "quiet"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        complexity_factor=0.5,
+        sliding_window=4096,
+        attention_dropout=0.0,
+        max_thoughts=16,
+        max_temperature=10,
+        merged_talk_heads=True,
+        merged_lm_and_talk_heads=False,
+        merged_lm_and_think_heads=True,
+        use_concat_talk_head=True,
+        use_shallow_think=True,
+        use_shallow_talk=False,
+        use_complex_think_head=False,
+        use_complex_talk_head=True,
+        use_weighted_talk_head=True,
+        hidden_dropout_prob=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.max_thoughts = max_thoughts
+        self.complexity_factor = complexity_factor
+        self.max_temperature = max_temperature
+        self.merged_talk_heads = merged_talk_heads
+        self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
+        self.merged_lm_and_think_heads = merged_lm_and_think_heads
+        self.use_concat_talk_head = use_concat_talk_head
+        self.use_shallow_think = use_shallow_think
+        self.use_shallow_talk = use_shallow_talk
+        self.use_complex_think_head = use_complex_think_head
+        self.use_complex_talk_head = use_complex_talk_head
+        self.use_weighted_talk_head = use_weighted_talk_head
+        self.hidden_dropout_prob = hidden_dropout_prob
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
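
For reference, the re-touched `rope_scaling` check at the top of the second hunk requires `original_max_position_embeddings` to be an integer whenever the scaling type is `yarn` or `dynamic-yarn`. Below is a minimal sketch of a configuration that satisfies it, assuming this file is importable locally as `configuration_mistral`; the `factor` key is an assumption, since only `type` and `original_max_position_embeddings` appear in the check itself.

```python
# Minimal sketch: a MistralConfig whose rope_scaling passes the
# yarn / dynamic-yarn validation shown in the diff above.
from configuration_mistral import MistralConfig  # assumes this file is on the Python path

config = MistralConfig(
    rope_scaling={
        "type": "yarn",                             # or "dynamic-yarn"
        "factor": 2.0,                              # assumed extra key, not shown in the check
        "original_max_position_embeddings": 8192,   # must be an int, per the check above
    },
)
```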
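The `num_key_value_heads` docstring above describes converting a multi-head checkpoint to GQA by mean-pooling the key/value heads within each group. The snippet below illustrates what that pooling amounts to for the default head counts; the tensor shapes are hypothetical and not taken from this repo.

```python
import torch

num_attention_heads = 32
num_key_value_heads = 8          # defaults from QuietConfig above
head_dim = 128
hidden_size = 4096
group_size = num_attention_heads // num_key_value_heads  # 4 query heads share one KV head

# Hypothetical per-head key projections from a multi-head checkpoint.
mha_k_heads = torch.randn(num_attention_heads, head_dim, hidden_size)

# Mean-pool each group of 4 heads into a single key head, as the docstring describes.
gqa_k_heads = mha_k_heads.view(num_key_value_heads, group_size, head_dim, hidden_size).mean(dim=1)
print(gqa_k_heads.shape)  # torch.Size([8, 128, 4096])
```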
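Beyond the standard Mistral fields, the new `QuietConfig` adds thought-related options (`max_thoughts`, `max_temperature`, `complexity_factor`, and the various `merged_*` / `use_*` head flags). Here is a hedged sketch of constructing it directly, again assuming the file is importable as `configuration_mistral`; the accompanying `QuietModel` referenced in the docstring is not part of this diff.

```python
# Minimal sketch, assuming configuration_mistral.py from this commit is on the Python path.
from configuration_mistral import QuietConfig

config = QuietConfig(
    max_thoughts=16,              # Quiet-STaR style settings; their semantics live in the
    max_temperature=10,           # modeling code, which is not part of this commit
    complexity_factor=0.5,
    merged_talk_heads=True,
    use_weighted_talk_head=True,
)
print(config.model_type)                                   # "quiet"
print(config.num_key_value_heads, config.sliding_window)   # 8 4096
```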