LeroyDyer committed
Commit 7009bea (1 parent: 8a153c2)

Update configuration_mistral.py

Files changed (1)
  1. configuration_mistral.py +40 -1
configuration_mistral.py CHANGED
@@ -131,6 +131,18 @@ class MistralConfig(PretrainedConfig):
         rope_scaling=None,
         rope_theta=10000.0,
         sliding_window=4096,
+        attention_dropout=0.0,
+        max_thoughts=16,
+        max_temperature=10,
+        merged_talk_heads=True,
+        merged_lm_and_talk_heads=False,
+        merged_lm_and_think_heads=True,
+        use_concat_talk_head=True,
+        use_shallow_think=True,
+        use_shallow_talk=False,
+        use_complex_think_head=False,
+        use_complex_talk_head=True,
+        use_weighted_talk_head=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -140,7 +152,19 @@ class MistralConfig(PretrainedConfig):
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.sliding_window = sliding_window
-
+        attention_dropout=0.0,
+        max_thoughts=16,
+        max_temperature=10,
+        complexity_factor = 0.5,
+        merged_talk_heads=True,
+        merged_lm_and_talk_heads=False,
+        merged_lm_and_think_heads=True,
+        use_concat_talk_head=True,
+        use_shallow_think=True,
+        use_shallow_talk=False,
+        use_complex_think_head=False,
+        use_complex_talk_head=True,
+        use_weighted_talk_head=True,
         # for backward compatibility
         if num_key_value_heads is None:
             num_key_value_heads = num_attention_heads
@@ -153,7 +177,22 @@ class MistralConfig(PretrainedConfig):
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()
         self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.max_thoughts = max_thoughts
+        self.complexity_factor = complexity_factor
+        self.max_temperature = max_temperature
+        self.merged_talk_heads = merged_talk_heads
+        self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
+        self.merged_lm_and_think_heads = merged_lm_and_think_heads
+        self.use_concat_talk_head = use_concat_talk_head
+        self.use_shallow_think = use_shallow_think
+        self.use_shallow_talk = use_shallow_talk
+        self.use_complex_think_head = use_complex_think_head
+        self.use_complex_talk_head = use_complex_talk_head
+        self.use_weighted_talk_head = use_weighted_talk_head
+        self.hidden_dropout_prob = hidden_dropout_prob
 
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
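
Note that, as committed, the three hunks do not line up: the signature hunk never declares complexity_factor or hidden_dropout_prob, and the keyword-style lines added after self.sliding_window = sliding_window end with trailing commas, so inside the body they re-bind each option to a one-element tuple (attention_dropout becomes (0.0,)) and discard whatever the caller passed, while self.hidden_dropout_prob = hidden_dropout_prob raises a NameError. The self-contained sketch below shows one way the arguments and assignments could be paired up; the class name is hypothetical, complexity_factor=0.5 is taken from the body hunk, and hidden_dropout_prob=0.0 is an assumed default that the commit never states.

# Minimal, self-contained sketch (not the full MistralConfig) of how the new
# "thought"/"talk" options would need to be declared and stored together.
# complexity_factor=0.5 mirrors the value in the body hunk; hidden_dropout_prob=0.0
# is an assumption, since the commit assigns it without ever defining it.
class ThoughtHeadOptions:
    def __init__(
        self,
        attention_dropout=0.0,
        max_thoughts=16,
        max_temperature=10,
        complexity_factor=0.5,
        hidden_dropout_prob=0.0,  # assumed default; not declared in the commit
        merged_talk_heads=True,
        merged_lm_and_talk_heads=False,
        merged_lm_and_think_heads=True,
        use_concat_talk_head=True,
        use_shallow_think=True,
        use_shallow_talk=False,
        use_complex_think_head=False,
        use_complex_talk_head=True,
        use_weighted_talk_head=True,
    ):
        # Store each argument as-is; no "name=value," statements in the body,
        # which would shadow the parameters with one-element tuples.
        self.attention_dropout = attention_dropout
        self.max_thoughts = max_thoughts
        self.max_temperature = max_temperature
        self.complexity_factor = complexity_factor
        self.hidden_dropout_prob = hidden_dropout_prob
        self.merged_talk_heads = merged_talk_heads
        self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
        self.merged_lm_and_think_heads = merged_lm_and_think_heads
        self.use_concat_talk_head = use_concat_talk_head
        self.use_shallow_think = use_shallow_think
        self.use_shallow_talk = use_shallow_talk
        self.use_complex_think_head = use_complex_think_head
        self.use_complex_talk_head = use_complex_talk_head
        self.use_weighted_talk_head = use_weighted_talk_head

# Usage: override only what differs from the defaults.
opts = ThoughtHeadOptions(max_thoughts=8, use_shallow_talk=True)
print(opts.max_thoughts, opts.complexity_factor)  # -> 8 0.5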