config.json CHANGED
@@ -2,6 +2,7 @@
2
  "architectures": [
3
  "Phi3ForCausalLM"
4
  ],
 
5
  "attention_dropout": 0.0,
6
  "auto_map": {
7
  "AutoConfig": "configuration_phi3.Phi3Config",
@@ -29,71 +30,64 @@
29
  "rms_norm_eps": 1e-05,
30
  "rope_scaling": {
31
  "long_factor": [
32
- 1.0299999713897705,
33
- 1.0499999523162842,
34
- 1.0499999523162842,
35
- 1.0799999237060547,
36
- 1.2299998998641968,
37
- 1.2299998998641968,
38
- 1.2999999523162842,
39
- 1.4499999284744263,
40
- 1.5999999046325684,
41
- 1.6499998569488525,
42
- 1.8999998569488525,
43
- 2.859999895095825,
44
- 3.68999981880188,
45
- 5.419999599456787,
46
- 5.489999771118164,
47
- 5.489999771118164,
48
- 9.09000015258789,
49
- 11.579999923706055,
50
- 15.65999984741211,
51
- 15.769999504089355,
52
- 15.789999961853027,
53
- 18.360000610351562,
54
- 21.989999771118164,
55
- 23.079999923706055,
56
- 30.009998321533203,
57
- 32.35000228881836,
58
- 32.590003967285156,
59
- 35.56000518798828,
60
- 39.95000457763672,
61
- 53.840003967285156,
62
- 56.20000457763672,
63
- 57.95000457763672,
64
- 59.29000473022461,
65
- 59.77000427246094,
66
- 59.920005798339844,
67
- 61.190006256103516,
68
- 61.96000671386719,
69
- 62.50000762939453,
70
- 63.3700065612793,
71
- 63.48000717163086,
72
- 63.48000717163086,
73
- 63.66000747680664,
74
- 63.850006103515625,
75
- 64.08000946044922,
76
- 64.760009765625,
77
- 64.80001068115234,
78
- 64.81001281738281,
79
- 64.81001281738281
80
  ],
81
  "short_factor": [
82
- 1.05,
83
- 1.05,
84
- 1.05,
85
  1.1,
86
  1.1,
87
- 1.1500000000000001,
88
- 1.2000000000000002,
89
- 1.2500000000000002,
90
  1.3000000000000003,
91
  1.3500000000000003,
92
- 1.5000000000000004,
93
- 2.000000000000001,
94
- 2.000000000000001,
95
- 2.000000000000001,
96
- 2.000000000000001,
97
  2.000000000000001,
98
  2.000000000000001,
99
  2.000000000000001,
@@ -114,27 +108,34 @@
114
  2.0500000000000007,
115
  2.0500000000000007,
116
  2.0500000000000007,
 
 
 
117
  2.1000000000000005,
118
  2.1000000000000005,
119
- 2.1000000000000005,
120
- 2.1500000000000004,
121
  2.1500000000000004,
122
- 2.3499999999999996,
123
- 2.549999999999999,
124
- 2.5999999999999988,
125
- 2.5999999999999988,
 
 
 
 
 
 
126
  2.7499999999999982,
127
- 2.849999999999998,
128
- 2.849999999999998,
129
- 2.9499999999999975
130
  ],
131
- "type": "su"
132
  },
133
  "rope_theta": 10000.0,
134
  "sliding_window": 262144,
135
  "tie_word_embeddings": false,
136
  "torch_dtype": "bfloat16",
137
- "transformers_version": "4.39.3",
138
  "use_cache": true,
139
  "vocab_size": 32064
140
  }
 
2
  "architectures": [
3
  "Phi3ForCausalLM"
4
  ],
5
+ "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
  "AutoConfig": "configuration_phi3.Phi3Config",
 
30
  "rms_norm_eps": 1e-05,
31
  "rope_scaling": {
32
  "long_factor": [
33
+ 1.0700000524520874,
34
+ 1.1200000047683716,
35
+ 1.149999976158142,
36
+ 1.4199999570846558,
37
+ 1.5699999332427979,
38
+ 1.7999999523162842,
39
+ 2.129999876022339,
40
+ 2.129999876022339,
41
+ 3.009999990463257,
42
+ 5.910000324249268,
43
+ 6.950000286102295,
44
+ 9.070000648498535,
45
+ 9.930000305175781,
46
+ 10.710000038146973,
47
+ 11.130000114440918,
48
+ 14.609999656677246,
49
+ 15.409998893737793,
50
+ 19.809999465942383,
51
+ 37.279998779296875,
52
+ 38.279998779296875,
53
+ 38.599998474121094,
54
+ 40.12000274658203,
55
+ 46.20000457763672,
56
+ 50.940006256103516,
57
+ 53.66000747680664,
58
+ 54.9373893737793,
59
+ 56.89738845825195,
60
+ 57.28738784790039,
61
+ 59.98738479614258,
62
+ 60.86738586425781,
63
+ 60.887386322021484,
64
+ 61.71739196777344,
65
+ 62.91739273071289,
66
+ 62.957393646240234,
67
+ 63.41739273071289,
68
+ 63.8173942565918,
69
+ 63.83739471435547,
70
+ 63.897396087646484,
71
+ 63.93739700317383,
72
+ 64.06739807128906,
73
+ 64.11434936523438,
74
+ 64.12435150146484,
75
+ 64.15435028076172,
76
+ 64.19435119628906,
77
+ 64.24435424804688,
78
+ 64.57435607910156,
79
+ 64.69000244140625,
80
+ 64.76000213623047
81
  ],
82
  "short_factor": [
 
 
 
83
  1.1,
84
  1.1,
85
+ 1.1,
 
 
86
  1.3000000000000003,
87
  1.3500000000000003,
88
+ 1.3500000000000003,
89
+ 1.4000000000000004,
90
+ 1.5500000000000005,
 
 
91
  2.000000000000001,
92
  2.000000000000001,
93
  2.000000000000001,
 
108
  2.0500000000000007,
109
  2.0500000000000007,
110
  2.0500000000000007,
111
+ 2.0500000000000007,
112
+ 2.0500000000000007,
113
+ 2.0500000000000007,
114
  2.1000000000000005,
115
  2.1000000000000005,
 
 
116
  2.1500000000000004,
117
+ 2.25,
118
+ 2.25,
119
+ 2.25,
120
+ 2.25,
121
+ 2.25,
122
+ 2.3999999999999995,
123
+ 2.4499999999999993,
124
+ 2.499999999999999,
125
+ 2.6999999999999984,
126
+ 2.6999999999999984,
127
  2.7499999999999982,
128
+ 2.799999999999998,
129
+ 2.8999999999999977,
130
+ 3.049999999999997
131
  ],
132
+ "type": "longrope"
133
  },
134
  "rope_theta": 10000.0,
135
  "sliding_window": 262144,
136
  "tie_word_embeddings": false,
137
  "torch_dtype": "bfloat16",
138
+ "transformers_version": "4.40.2",
139
  "use_cache": true,
140
  "vocab_size": 32064
141
  }
configuration_phi3.py CHANGED
@@ -1,200 +1,227 @@
1
- # coding=utf-8
2
- # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- """ Phi-3 model configuration"""
17
-
18
-
19
- from transformers.configuration_utils import PretrainedConfig
20
- from transformers.utils import logging
21
-
22
-
23
- logger = logging.get_logger(__name__)
24
-
25
- PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26
- "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
27
- "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
28
- }
29
-
30
-
31
- class Phi3Config(PretrainedConfig):
32
- r"""
33
- This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
34
- model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
35
- defaults will yield a similar configuration to that of the
36
- [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
37
-
38
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
- documentation from [`PretrainedConfig`] for more information.
40
-
41
- Args:
42
- vocab_size (`int`, *optional*, defaults to 32064):
43
- Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
44
- `inputs_ids` passed when calling [`Phi3Model`].
45
- hidden_size (`int`, *optional*, defaults to 3072):
46
- Dimension of the hidden representations.
47
- intermediate_size (`int`, *optional*, defaults to 8192):
48
- Dimension of the MLP representations.
49
- num_hidden_layers (`int`, *optional*, defaults to 32):
50
- Number of hidden layers in the Transformer decoder.
51
- num_attention_heads (`int`, *optional*, defaults to 32):
52
- Number of attention heads for each attention layer in the Transformer decoder.
53
- num_key_value_heads (`int`, *optional*):
54
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
55
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
56
- `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
57
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
58
- by meanpooling all the original heads within that group. For more details checkout [this
59
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
60
- `num_attention_heads`.
61
- resid_pdrop (`float`, *optional*, defaults to 0.0):
62
- Dropout probability for mlp outputs.
63
- embd_pdrop (`int`, *optional*, defaults to 0.0):
64
- The dropout ratio for the embeddings.
65
- attention_dropout (`float`, *optional*, defaults to 0.0):
66
- The dropout ratio after computing the attention scores.
67
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
68
- The non-linear activation function (function or string) in the decoder.
69
- max_position_embeddings (`int`, *optional*, defaults to 4096):
70
- The maximum sequence length that this model might ever be used with.
71
- original_max_position_embeddings (`int`, *optional*, defaults to 4096):
72
- The maximum sequence length that this model was trained with. This is used to determine the size of the
73
- original RoPE embeddings when using long scaling.
74
- initializer_range (`float`, *optional*, defaults to 0.02):
75
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
76
- rms_norm_eps (`float`, *optional*, defaults to 1e-05):
77
- The epsilon value used for the RMSNorm.
78
- use_cache (`bool`, *optional*, defaults to `True`):
79
- Whether or not the model should return the last key/values attentions (not used by all models). Only
80
- relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
81
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
82
- Whether to tie weight embeddings
83
- rope_theta (`float`, *optional*, defaults to 10000.0):
84
- The base period of the RoPE embeddings.
85
- rope_scaling (`dict`, *optional*):
86
- The scaling factor for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
87
- contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
88
- the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
89
- divided by the number of attention heads divided by 2.
90
- eos_token_id (`int`, *optional*, defaults to 32000):
91
- The id of the "end-of-sequence" token.
92
- pad_token_id (`int`, *optional*, defaults to 32000):
93
- The id of the padding token.
94
- sliding_window (`int`, *optional*):
95
- Sliding window attention window size. If `None`, no sliding window is applied.
96
-
97
- Example:
98
-
99
- ```python
100
- >>> from transformers import Phi3Model, Phi3Config
101
-
102
- >>> # Initializing a Phi-3 style configuration
103
- >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
104
-
105
- >>> # Initializing a model from the configuration
106
- >>> model = Phi3Model(configuration)
107
-
108
- >>> # Accessing the model configuration
109
- >>> configuration = model.config
110
- ```"""
111
-
112
- model_type = "phi3"
113
- keys_to_ignore_at_inference = ["past_key_values"]
114
-
115
- def __init__(
116
- self,
117
- vocab_size=32064,
118
- hidden_size=3072,
119
- intermediate_size=8192,
120
- num_hidden_layers=32,
121
- num_attention_heads=32,
122
- num_key_value_heads=None,
123
- resid_pdrop=0.0,
124
- embd_pdrop=0.0,
125
- attention_dropout=0.0,
126
- hidden_act="silu",
127
- max_position_embeddings=4096,
128
- original_max_position_embeddings=4096,
129
- initializer_range=0.02,
130
- rms_norm_eps=1e-5,
131
- use_cache=True,
132
- tie_word_embeddings=False,
133
- rope_theta=10000.0,
134
- rope_scaling=None,
135
- eos_token_id=32000,
136
- pad_token_id=32000,
137
- sliding_window=None,
138
- **kwargs,
139
- ):
140
- self.vocab_size = vocab_size
141
- self.hidden_size = hidden_size
142
- self.intermediate_size = intermediate_size
143
- self.num_hidden_layers = num_hidden_layers
144
- self.num_attention_heads = num_attention_heads
145
-
146
- if num_key_value_heads is None:
147
- num_key_value_heads = num_attention_heads
148
-
149
- self.num_key_value_heads = num_key_value_heads
150
- self.resid_pdrop = resid_pdrop
151
- self.embd_pdrop = embd_pdrop
152
- self.attention_dropout = attention_dropout
153
- self.hidden_act = hidden_act
154
- self.max_position_embeddings = max_position_embeddings
155
- self.original_max_position_embeddings = original_max_position_embeddings
156
- self.initializer_range = initializer_range
157
- self.rms_norm_eps = rms_norm_eps
158
- self.use_cache = use_cache
159
- self.rope_theta = rope_theta
160
- self.rope_scaling = rope_scaling
161
- self.sliding_window = sliding_window
162
-
163
- super().__init__(
164
- eos_token_id=eos_token_id,
165
- pad_token_id=pad_token_id,
166
- tie_word_embeddings=tie_word_embeddings,
167
- **kwargs,
168
- )
169
-
170
- def _rope_scaling_validation(self):
171
- if self.rope_scaling is None:
172
- return
173
-
174
- assert (
175
- (isinstance(self.rope_scaling, dict))
176
- and ("type" in self.rope_scaling)
177
- and ("short_factor" in self.rope_scaling)
178
- and ("long_factor" in self.rope_scaling)
179
- ), (
180
- "`rope_scaling` must be a dictionary with three keys: `type`, `short_factor` and `long_factor`, "
181
- f"got {self.rope_scaling}."
182
- )
183
-
184
- assert self.rope_scaling["type"].lower() == "longrope", "RoPE scaling type must be `longrope`."
185
-
186
- short_factor = self.rope_scaling["short_factor"]
187
- assert isinstance(short_factor, list) and all(
188
- isinstance(x, (int, float)) for x in short_factor
189
- ), f"RoPE scaling factor must be a list of numbers, got {short_factor}."
190
- assert (
191
- len(short_factor) == self.hidden_size // self.num_attention_heads // 2
192
- ), f"Length of RoPE scaling factor must be half of the attention head, got {short_factor}."
193
-
194
- long_factor = self.rope_scaling["long_factor"]
195
- assert isinstance(long_factor, list) and all(
196
- isinstance(x, (int, float)) for x in long_factor
197
- ), f"RoPE scaling factor must be a list of numbers, got {long_factor}."
198
- assert (
199
- len(long_factor) == self.hidden_size // self.num_attention_heads // 2
200
- ), f"Length of RoPE scaling factor must be half of the attention head, got {long_factor}."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Phi-3 model configuration"""
17
+
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26
+ "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
27
+ "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
28
+ }
29
+
30
+
31
+ class Phi3Config(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
34
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
35
+ defaults will yield a similar configuration to that of the
36
+ [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
37
+
38
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
+ documentation from [`PretrainedConfig`] for more information.
40
+
41
+ Args:
42
+ vocab_size (`int`, *optional*, defaults to 32064):
43
+ Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
44
+ `inputs_ids` passed when calling [`Phi3Model`].
45
+ hidden_size (`int`, *optional*, defaults to 3072):
46
+ Dimension of the hidden representations.
47
+ intermediate_size (`int`, *optional*, defaults to 8192):
48
+ Dimension of the MLP representations.
49
+ num_hidden_layers (`int`, *optional*, defaults to 32):
50
+ Number of hidden layers in the Transformer decoder.
51
+ num_attention_heads (`int`, *optional*, defaults to 32):
52
+ Number of attention heads for each attention layer in the Transformer decoder.
53
+ num_key_value_heads (`int`, *optional*):
54
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
55
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
56
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
57
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
58
+ by meanpooling all the original heads within that group. For more details checkout [this
59
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
60
+ `num_attention_heads`.
61
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
62
+ Dropout probability for mlp outputs.
63
+ embd_pdrop (`float`, *optional*, defaults to 0.0):
64
+ The dropout ratio for the embeddings.
65
+ attention_dropout (`float`, *optional*, defaults to 0.0):
66
+ The dropout ratio after computing the attention scores.
67
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
68
+ The non-linear activation function (function or string) in the decoder.
69
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
70
+ The maximum sequence length that this model might ever be used with.
71
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
72
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
73
+ original RoPE embeddings when using long scaling.
74
+ initializer_range (`float`, *optional*, defaults to 0.02):
75
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
76
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
77
+ The epsilon value used for the RMSNorm.
78
+ use_cache (`bool`, *optional*, defaults to `True`):
79
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
80
+ relevant if `config.is_decoder=True`.
81
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
82
+ Whether to tie weight embeddings
83
+ rope_theta (`float`, *optional*, defaults to 10000.0):
84
+ The base period of the RoPE embeddings.
85
+ rope_scaling (`dict`, *optional*):
86
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
87
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
88
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
89
+ divided by the number of attention heads divided by 2.
90
+ bos_token_id (`int`, *optional*, defaults to 1):
91
+ The id of the "beginning-of-sequence" token.
92
+ eos_token_id (`int`, *optional*, defaults to 32000):
93
+ The id of the "end-of-sequence" token.
94
+ pad_token_id (`int`, *optional*, defaults to 32000):
95
+ The id of the padding token.
96
+ sliding_window (`int`, *optional*):
97
+ Sliding window attention window size. If `None`, no sliding window is applied.
98
+
99
+ Example:
100
+
101
+ ```python
102
+ >>> from transformers import Phi3Model, Phi3Config
103
+
104
+ >>> # Initializing a Phi-3 style configuration
105
+ >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
106
+
107
+ >>> # Initializing a model from the configuration
108
+ >>> model = Phi3Model(configuration)
109
+
110
+ >>> # Accessing the model configuration
111
+ >>> configuration = model.config
112
+ ```"""
113
+
114
+ model_type = "phi3"
115
+ keys_to_ignore_at_inference = ["past_key_values"]
116
+
117
+ def __init__(
118
+ self,
119
+ vocab_size=32064,
120
+ hidden_size=3072,
121
+ intermediate_size=8192,
122
+ num_hidden_layers=32,
123
+ num_attention_heads=32,
124
+ num_key_value_heads=None,
125
+ resid_pdrop=0.0,
126
+ embd_pdrop=0.0,
127
+ attention_dropout=0.0,
128
+ hidden_act="silu",
129
+ max_position_embeddings=4096,
130
+ original_max_position_embeddings=4096,
131
+ initializer_range=0.02,
132
+ rms_norm_eps=1e-5,
133
+ use_cache=True,
134
+ tie_word_embeddings=False,
135
+ rope_theta=10000.0,
136
+ rope_scaling=None,
137
+ bos_token_id=1,
138
+ eos_token_id=32000,
139
+ pad_token_id=32000,
140
+ sliding_window=None,
141
+ **kwargs,
142
+ ):
143
+ self.vocab_size = vocab_size
144
+ self.hidden_size = hidden_size
145
+ self.intermediate_size = intermediate_size
146
+ self.num_hidden_layers = num_hidden_layers
147
+ self.num_attention_heads = num_attention_heads
148
+
149
+ if num_key_value_heads is None:
150
+ num_key_value_heads = num_attention_heads
151
+
152
+ self.num_key_value_heads = num_key_value_heads
153
+ self.resid_pdrop = resid_pdrop
154
+ self.embd_pdrop = embd_pdrop
155
+ self.attention_dropout = attention_dropout
156
+ self.hidden_act = hidden_act
157
+ self.max_position_embeddings = max_position_embeddings
158
+ self.original_max_position_embeddings = original_max_position_embeddings
159
+ self.initializer_range = initializer_range
160
+ self.rms_norm_eps = rms_norm_eps
161
+ self.use_cache = use_cache
162
+ self.rope_theta = rope_theta
163
+ self.rope_scaling = rope_scaling
164
+ self._rope_scaling_adjustment()
165
+ self._rope_scaling_validation()
166
+ self.sliding_window = sliding_window
167
+
168
+ super().__init__(
169
+ bos_token_id=bos_token_id,
170
+ eos_token_id=eos_token_id,
171
+ pad_token_id=pad_token_id,
172
+ tie_word_embeddings=tie_word_embeddings,
173
+ **kwargs,
174
+ )
175
+
176
+ def _rope_scaling_adjustment(self):
177
+ """
178
+ Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
179
+ """
180
+ if self.rope_scaling is None:
181
+ return
182
+
183
+ rope_scaling_type = self.rope_scaling.get("type", None)
184
+
185
+ # For backward compatibility if previous version used "su" or "yarn"
186
+ if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]:
187
+ self.rope_scaling["type"] = "longrope"
188
+
189
+ def _rope_scaling_validation(self):
190
+ """
191
+ Validate the `rope_scaling` configuration.
192
+ """
193
+ if self.rope_scaling is None:
194
+ return
195
+
196
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
197
+ raise ValueError(
198
+ "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
199
+ f"got {self.rope_scaling}"
200
+ )
201
+ rope_scaling_type = self.rope_scaling.get("type", None)
202
+ rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
203
+ rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
204
+ if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
205
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
206
+ if not (
207
+ isinstance(rope_scaling_short_factor, list)
208
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
209
+ ):
210
+ raise ValueError(
211
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
212
+ )
213
+ if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
214
+ raise ValueError(
215
+ f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
216
+ )
217
+ if not (
218
+ isinstance(rope_scaling_long_factor, list)
219
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
220
+ ):
221
+ raise ValueError(
222
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
223
+ )
224
+ if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
225
+ raise ValueError(
226
+ f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
227
+ )
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bd9d991d1fced77f76bc0a97e3a23620edd623cb25e8a9dcde0b5dc0b852c8c
3
- size 2149696133
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8e1e200c443b2ff91cd47a887119f6448274989b300a5113c2d8e6e87b92223
3
+ size 2149696167
modeling_phi3.py CHANGED
The diff for this file is too large to render. See raw diff
 
sample_finetune.py CHANGED
@@ -1,131 +1,214 @@
1
- import torch
2
- from datasets import load_dataset
3
- from trl import SFTTrainer
4
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
5
-
6
- """
7
- A simple example on using SFTTrainer and Accelerate to finetune Phi-3 models. For
8
- a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py
9
-
10
- 1. Install accelerate:
11
- conda install -c conda-forge accelerate
12
- 2. Setup accelerate config:
13
- accelerate config
14
- to simply use all the GPUs available:
15
- python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='bf16')"
16
- check accelerate config:
17
- accelerate env
18
- 3. Run the code:
19
- accelerate launch sample_finetune.py
20
- """
21
-
22
- ###################
23
- # Hyper-parameters
24
- ###################
25
- args = {
26
- "bf16": True,
27
- "do_eval": False,
28
- "eval_strategy": "no",
29
- "learning_rate": 5.0e-06,
30
- "log_level": "info",
31
- "logging_steps": 20,
32
- "logging_strategy": "steps",
33
- "lr_scheduler_type": "cosine",
34
- "num_train_epochs": 1,
35
- "max_steps": -1,
36
- "output_dir": "./checkpoint_dir",
37
- "overwrite_output_dir": True,
38
- "per_device_eval_batch_size": 4,
39
- "per_device_train_batch_size": 8,
40
- "remove_unused_columns": True,
41
- "save_steps": 100,
42
- "save_total_limit": 1,
43
- "seed": 0,
44
- "gradient_checkpointing": True,
45
- "gradient_checkpointing_kwargs":{"use_reentrant": False},
46
- "gradient_accumulation_steps": 1,
47
- "warmup_ratio": 0.2,
48
- }
49
-
50
- training_args = TrainingArguments(**args)
51
-
52
-
53
- ################
54
- # Modle Loading
55
- ################
56
- checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
57
- # checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
58
- model_kwargs = dict(
59
- use_cache=False,
60
- trust_remote_code=True,
61
- attn_implementation="flash_attention_2", # loading the model with flash-attenstion support
62
- torch_dtype=torch.bfloat16,
63
- device_map="cuda",
64
- )
65
- model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
66
- tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
67
- tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation
68
- tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
69
- tokenizer.padding_side = 'right'
70
-
71
- ##################
72
- # Data Processing
73
- ##################
74
- def apply_chat_template(
75
- example,
76
- tokenizer,
77
- ):
78
- messages = example["messages"]
79
- # Add an empty system message if there is none
80
- if messages[0]["role"] != "system":
81
- messages.insert(0, {"role": "system", "content": ""})
82
- example["text"] = tokenizer.apply_chat_template(
83
- messages, tokenize=False, add_generation_prompt=False)
84
- return example
85
-
86
- raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
87
- column_names = list(raw_dataset["train_sft"].features)
88
-
89
- processed_dataset = raw_dataset.map(
90
- apply_chat_template,
91
- fn_kwargs={"tokenizer": tokenizer},
92
- num_proc=12,
93
- remove_columns=column_names,
94
- desc="Applying chat template",
95
- )
96
- train_dataset = processed_dataset["train_sft"]
97
- eval_dataset = processed_dataset["test_sft"]
98
-
99
-
100
- ###########
101
- # Training
102
- ###########
103
- trainer = SFTTrainer(
104
- model=model,
105
- args=training_args,
106
- train_dataset=train_dataset,
107
- eval_dataset=eval_dataset,
108
- max_seq_length=2048,
109
- dataset_text_field="text",
110
- tokenizer=tokenizer,
111
- packing=True
112
- )
113
- train_result = trainer.train()
114
- metrics = train_result.metrics
115
- trainer.log_metrics("train", metrics)
116
- trainer.save_metrics("train", metrics)
117
- trainer.save_state()
118
-
119
- #############
120
- # Evaluation
121
- #############
122
- tokenizer.padding_side = 'left'
123
- metrics = trainer.evaluate()
124
- metrics["eval_samples"] = len(eval_dataset)
125
- trainer.log_metrics("eval", metrics)
126
- trainer.save_metrics("eval", metrics)
127
-
128
- ############
129
- # Save model
130
- ############
131
- trainer.save_model(training_args.output_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import logging
3
+
4
+ import datasets
5
+ from datasets import load_dataset
6
+ from peft import LoraConfig
7
+ import torch
8
+ import transformers
9
+ from trl import SFTTrainer
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
11
+
12
+ """
13
+ A simple example of using SFTTrainer and Accelerate to finetune Phi-3 models. For
14
+ a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py.
15
+ This example uses DeepSpeed ZeRO3 offload to reduce the memory usage. The
16
+ script can be run on V100 or later generation GPUs. Here are some suggestions on
17
+ further reducing memory consumption:
18
+ - reduce batch size
19
+ - decrease lora dimension
20
+ - restrict lora target modules
21
+ Please follow these steps to run the script:
22
+ 1. Install dependencies:
23
+ conda install -c conda-forge accelerate
24
+ pip3 install -i https://pypi.org/simple/ bitsandbytes
25
+ pip3 install peft transformers trl datasets
26
+ pip3 install deepspeed
27
+ 2. Setup accelerate and deepspeed config based on the machine used:
28
+ accelerate config
29
+ Here is a sample config for deepspeed zero3:
30
+ compute_environment: LOCAL_MACHINE
31
+ debug: false
32
+ deepspeed_config:
33
+ gradient_accumulation_steps: 1
34
+ offload_optimizer_device: none
35
+ offload_param_device: none
36
+ zero3_init_flag: true
37
+ zero3_save_16bit_model: true
38
+ zero_stage: 3
39
+ distributed_type: DEEPSPEED
40
+ downcast_bf16: 'no'
41
+ enable_cpu_affinity: false
42
+ machine_rank: 0
43
+ main_training_function: main
44
+ mixed_precision: bf16
45
+ num_machines: 1
46
+ num_processes: 4
47
+ rdzv_backend: static
48
+ same_network: true
49
+ tpu_env: []
50
+ tpu_use_cluster: false
51
+ tpu_use_sudo: false
52
+ use_cpu: false
53
+ 3. Check accelerate config:
54
+ accelerate env
55
+ 4. Run the code:
56
+ accelerate launch sample_finetune.py
57
+ """
58
+
59
+ logger = logging.getLogger(__name__)
60
+
61
+
62
+ ###################
63
+ # Hyper-parameters
64
+ ###################
65
+ training_config = {
66
+ "bf16": True,
67
+ "do_eval": False,
68
+ "learning_rate": 5.0e-06,
69
+ "log_level": "info",
70
+ "logging_steps": 20,
71
+ "logging_strategy": "steps",
72
+ "lr_scheduler_type": "cosine",
73
+ "num_train_epochs": 1,
74
+ "max_steps": -1,
75
+ "output_dir": "./checkpoint_dir",
76
+ "overwrite_output_dir": True,
77
+ "per_device_eval_batch_size": 4,
78
+ "per_device_train_batch_size": 4,
79
+ "remove_unused_columns": True,
80
+ "save_steps": 100,
81
+ "save_total_limit": 1,
82
+ "seed": 0,
83
+ "gradient_checkpointing": True,
84
+ "gradient_checkpointing_kwargs":{"use_reentrant": False},
85
+ "gradient_accumulation_steps": 1,
86
+ "warmup_ratio": 0.2,
87
+ }
88
+
89
+ peft_config = {
90
+ "r": 16,
91
+ "lora_alpha": 32,
92
+ "lora_dropout": 0.05,
93
+ "bias": "none",
94
+ "task_type": "CAUSAL_LM",
95
+ "target_modules": "all-linear",
96
+ "modules_to_save": None,
97
+ }
98
+ train_conf = TrainingArguments(**training_config)
99
+ peft_conf = LoraConfig(**peft_config)
100
+
101
+
102
+ ###############
103
+ # Setup logging
104
+ ###############
105
+ logging.basicConfig(
106
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
107
+ datefmt="%Y-%m-%d %H:%M:%S",
108
+ handlers=[logging.StreamHandler(sys.stdout)],
109
+ )
110
+ log_level = train_conf.get_process_log_level()
111
+ logger.setLevel(log_level)
112
+ datasets.utils.logging.set_verbosity(log_level)
113
+ transformers.utils.logging.set_verbosity(log_level)
114
+ transformers.utils.logging.enable_default_handler()
115
+ transformers.utils.logging.enable_explicit_format()
116
+
117
+ # Log on each process a small summary
118
+ logger.warning(
119
+ f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
120
+ + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
121
+ )
122
+ logger.info(f"Training/evaluation parameters {train_conf}")
123
+ logger.info(f"PEFT parameters {peft_conf}")
124
+
125
+
126
+ ################
127
+ # Model Loading
128
+ ################
129
+ # checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
130
+ checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
131
+ model_kwargs = dict(
132
+ use_cache=False,
133
+ trust_remote_code=True,
134
+ attn_implementation="flash_attention_2", # loading the model with flash-attention support
135
+ torch_dtype=torch.bfloat16,
136
+ device_map=None
137
+ )
138
+ model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
139
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
140
+ tokenizer.model_max_length = 2048
141
+ tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation
142
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
143
+ tokenizer.padding_side = 'right'
144
+
145
+
146
+ ##################
147
+ # Data Processing
148
+ ##################
149
+ def apply_chat_template(
150
+ example,
151
+ tokenizer,
152
+ ):
153
+ messages = example["messages"]
154
+ example["text"] = tokenizer.apply_chat_template(
155
+ messages, tokenize=False, add_generation_prompt=False)
156
+ return example
157
+
158
+ raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
159
+ train_dataset = raw_dataset["train_sft"]
160
+ test_dataset = raw_dataset["test_sft"]
161
+ column_names = list(train_dataset.features)
162
+
163
+ processed_train_dataset = train_dataset.map(
164
+ apply_chat_template,
165
+ fn_kwargs={"tokenizer": tokenizer},
166
+ num_proc=10,
167
+ remove_columns=column_names,
168
+ desc="Applying chat template to train_sft",
169
+ )
170
+
171
+ processed_test_dataset = test_dataset.map(
172
+ apply_chat_template,
173
+ fn_kwargs={"tokenizer": tokenizer},
174
+ num_proc=10,
175
+ remove_columns=column_names,
176
+ desc="Applying chat template to test_sft",
177
+ )
178
+
179
+
180
+ ###########
181
+ # Training
182
+ ###########
183
+ trainer = SFTTrainer(
184
+ model=model,
185
+ args=train_conf,
186
+ peft_config=peft_conf,
187
+ train_dataset=processed_train_dataset,
188
+ eval_dataset=processed_test_dataset,
189
+ max_seq_length=2048,
190
+ dataset_text_field="text",
191
+ tokenizer=tokenizer,
192
+ packing=True
193
+ )
194
+ train_result = trainer.train()
195
+ metrics = train_result.metrics
196
+ trainer.log_metrics("train", metrics)
197
+ trainer.save_metrics("train", metrics)
198
+ trainer.save_state()
199
+
200
+
201
+ #############
202
+ # Evaluation
203
+ #############
204
+ tokenizer.padding_side = 'left'
205
+ metrics = trainer.evaluate()
206
+ metrics["eval_samples"] = len(processed_test_dataset)
207
+ trainer.log_metrics("eval", metrics)
208
+ trainer.save_metrics("eval", metrics)
209
+
210
+
211
+ # ############
212
+ # # Save model
213
+ # ############
214
+ trainer.save_model(train_conf.output_dir)
special_tokens_map.json CHANGED
@@ -1,7 +1,4 @@
1
  {
2
- "additional_special_tokens": [
3
- "<|/inst|>"
4
- ],
5
  "bos_token": {
6
  "content": "<s>",
7
  "lstrip": false,
@@ -17,7 +14,7 @@
17
  "single_word": false
18
  },
19
  "pad_token": {
20
- "content": "<|end|>",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
 
1
  {
 
 
 
2
  "bos_token": {
3
  "content": "<s>",
4
  "lstrip": false,
 
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "<|endoftext|>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
tokenizer.json CHANGED
@@ -50,7 +50,7 @@
50
  },
51
  {
52
  "id": 32002,
53
- "content": "<|step|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": true,
@@ -59,7 +59,7 @@
59
  },
60
  {
61
  "id": 32003,
62
- "content": "<|function_output|>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": true,
@@ -68,7 +68,7 @@
68
  },
69
  {
70
  "id": 32004,
71
- "content": "<|tag|>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": true,
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 32005,
80
- "content": "<|function_call|>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": true,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 32008,
107
- "content": "<|raw|>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": true,
@@ -113,7 +113,7 @@
113
  },
114
  {
115
  "id": 32009,
116
- "content": "<|continue|>",
117
  "single_word": false,
118
  "lstrip": false,
119
  "rstrip": true,
@@ -128,249 +128,6 @@
128
  "rstrip": true,
129
  "normalized": false,
130
  "special": true
131
- },
132
- {
133
- "id": 32011,
134
- "content": "<|function_list|>",
135
- "single_word": false,
136
- "lstrip": false,
137
- "rstrip": true,
138
- "normalized": false,
139
- "special": true
140
- },
141
- {
142
- "id": 32012,
143
- "content": "<|calc|>",
144
- "single_word": false,
145
- "lstrip": false,
146
- "rstrip": true,
147
- "normalized": false,
148
- "special": true
149
- },
150
- {
151
- "id": 32013,
152
- "content": "<|code|>",
153
- "single_word": false,
154
- "lstrip": false,
155
- "rstrip": true,
156
- "normalized": false,
157
- "special": true
158
- },
159
- {
160
- "id": 32014,
161
- "content": "<|/code|>",
162
- "single_word": false,
163
- "lstrip": false,
164
- "rstrip": true,
165
- "normalized": false,
166
- "special": true
167
- },
168
- {
169
- "id": 32015,
170
- "content": "<|summary|>",
171
- "single_word": false,
172
- "lstrip": false,
173
- "rstrip": true,
174
- "normalized": false,
175
- "special": true
176
- },
177
- {
178
- "id": 32016,
179
- "content": "<|resource|>",
180
- "single_word": false,
181
- "lstrip": false,
182
- "rstrip": true,
183
- "normalized": false,
184
- "special": true
185
- },
186
- {
187
- "id": 32017,
188
- "content": "<|assistant_mask|>",
189
- "single_word": false,
190
- "lstrip": false,
191
- "rstrip": true,
192
- "normalized": false,
193
- "special": true
194
- },
195
- {
196
- "id": 32018,
197
- "content": "<|start|>",
198
- "single_word": false,
199
- "lstrip": false,
200
- "rstrip": true,
201
- "normalized": false,
202
- "special": true
203
- },
204
- {
205
- "id": 32019,
206
- "content": "<|message|>",
207
- "single_word": false,
208
- "lstrip": false,
209
- "rstrip": true,
210
- "normalized": false,
211
- "special": true
212
- },
213
- {
214
- "id": 32020,
215
- "content": "<|fim_prefix|>",
216
- "single_word": false,
217
- "lstrip": false,
218
- "rstrip": true,
219
- "normalized": false,
220
- "special": true
221
- },
222
- {
223
- "id": 32021,
224
- "content": "<|fim_middle|>",
225
- "single_word": false,
226
- "lstrip": false,
227
- "rstrip": true,
228
- "normalized": false,
229
- "special": true
230
- },
231
- {
232
- "id": 32022,
233
- "content": "<|fim_suffix|>",
234
- "single_word": false,
235
- "lstrip": false,
236
- "rstrip": true,
237
- "normalized": false,
238
- "special": true
239
- },
240
- {
241
- "id": 32023,
242
- "content": "<|meta_start|>",
243
- "single_word": false,
244
- "lstrip": false,
245
- "rstrip": true,
246
- "normalized": false,
247
- "special": true
248
- },
249
- {
250
- "id": 32024,
251
- "content": "<|ipynb_marker|>",
252
- "single_word": false,
253
- "lstrip": false,
254
- "rstrip": true,
255
- "normalized": false,
256
- "special": true
257
- },
258
- {
259
- "id": 32025,
260
- "content": "<|diff_marker|>",
261
- "single_word": false,
262
- "lstrip": false,
263
- "rstrip": true,
264
- "normalized": false,
265
- "special": true
266
- },
267
- {
268
- "id": 32026,
269
- "content": "<|ghissue|>",
270
- "single_word": false,
271
- "lstrip": false,
272
- "rstrip": true,
273
- "normalized": false,
274
- "special": true
275
- },
276
- {
277
- "id": 32027,
278
- "content": "<|ghreview|>",
279
- "single_word": false,
280
- "lstrip": false,
281
- "rstrip": true,
282
- "normalized": false,
283
- "special": true
284
- },
285
- {
286
- "id": 32028,
287
- "content": "<|disc_start|>",
288
- "single_word": false,
289
- "lstrip": false,
290
- "rstrip": true,
291
- "normalized": false,
292
- "special": true
293
- },
294
- {
295
- "id": 32029,
296
- "content": "<|disc_sep|>",
297
- "single_word": false,
298
- "lstrip": false,
299
- "rstrip": true,
300
- "normalized": false,
301
- "special": true
302
- },
303
- {
304
- "id": 32030,
305
- "content": "<|disc_thread|><|query|>",
306
- "single_word": false,
307
- "lstrip": false,
308
- "rstrip": true,
309
- "normalized": false,
310
- "special": true
311
- },
312
- {
313
- "id": 32031,
314
- "content": "<|/query|>",
315
- "single_word": false,
316
- "lstrip": false,
317
- "rstrip": true,
318
- "normalized": false,
319
- "special": true
320
- },
321
- {
322
- "id": 32032,
323
- "content": "<|data|>",
324
- "single_word": false,
325
- "lstrip": false,
326
- "rstrip": true,
327
- "normalized": false,
328
- "special": true
329
- },
330
- {
331
- "id": 32033,
332
- "content": "<|/data|>",
333
- "single_word": false,
334
- "lstrip": false,
335
- "rstrip": true,
336
- "normalized": false,
337
- "special": true
338
- },
339
- {
340
- "id": 32034,
341
- "content": "<|sys|>",
342
- "single_word": false,
343
- "lstrip": false,
344
- "rstrip": true,
345
- "normalized": false,
346
- "special": true
347
- },
348
- {
349
- "id": 32035,
350
- "content": "<|/sys|>",
351
- "single_word": false,
352
- "lstrip": false,
353
- "rstrip": true,
354
- "normalized": false,
355
- "special": true
356
- },
357
- {
358
- "id": 32036,
359
- "content": "<|inst|>",
360
- "single_word": false,
361
- "lstrip": false,
362
- "rstrip": true,
363
- "normalized": false,
364
- "special": true
365
- },
366
- {
367
- "id": 32037,
368
- "content": "<|/inst|>",
369
- "single_word": false,
370
- "lstrip": false,
371
- "rstrip": true,
372
- "normalized": false,
373
- "special": true
374
  }
375
  ],
376
  "normalizer": {
@@ -393,12 +150,6 @@
393
  "post_processor": {
394
  "type": "TemplateProcessing",
395
  "single": [
396
- {
397
- "SpecialToken": {
398
- "id": "<s>",
399
- "type_id": 0
400
- }
401
- },
402
  {
403
  "Sequence": {
404
  "id": "A",
@@ -407,24 +158,12 @@
407
  }
408
  ],
409
  "pair": [
410
- {
411
- "SpecialToken": {
412
- "id": "<s>",
413
- "type_id": 0
414
- }
415
- },
416
  {
417
  "Sequence": {
418
  "id": "A",
419
  "type_id": 0
420
  }
421
  },
422
- {
423
- "SpecialToken": {
424
- "id": "<s>",
425
- "type_id": 1
426
- }
427
- },
428
  {
429
  "Sequence": {
430
  "id": "B",
@@ -432,17 +171,7 @@
432
  }
433
  }
434
  ],
435
- "special_tokens": {
436
- "<s>": {
437
- "id": "<s>",
438
- "ids": [
439
- 1
440
- ],
441
- "tokens": [
442
- "<s>"
443
- ]
444
- }
445
- }
446
  },
447
  "decoder": {
448
  "type": "Sequence",
 
50
  },
51
  {
52
  "id": 32002,
53
+ "content": "<|placeholder1|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": true,
 
59
  },
60
  {
61
  "id": 32003,
62
+ "content": "<|placeholder2|>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": true,
 
68
  },
69
  {
70
  "id": 32004,
71
+ "content": "<|placeholder3|>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": true,
 
77
  },
78
  {
79
  "id": 32005,
80
+ "content": "<|placeholder4|>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": true,
 
104
  },
105
  {
106
  "id": 32008,
107
+ "content": "<|placeholder5|>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": true,
 
113
  },
114
  {
115
  "id": 32009,
116
+ "content": "<|placeholder6|>",
117
  "single_word": false,
118
  "lstrip": false,
119
  "rstrip": true,
 
128
  "rstrip": true,
129
  "normalized": false,
130
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  ],
133
  "normalizer": {
 
150
  "post_processor": {
151
  "type": "TemplateProcessing",
152
  "single": [
 
 
 
 
 
 
153
  {
154
  "Sequence": {
155
  "id": "A",
 
158
  }
159
  ],
160
  "pair": [
 
 
 
 
 
 
161
  {
162
  "Sequence": {
163
  "id": "A",
164
  "type_id": 0
165
  }
166
  },
 
 
 
 
 
 
167
  {
168
  "Sequence": {
169
  "id": "B",
 
171
  }
172
  }
173
  ],
174
+ "special_tokens": {}
 
 
 
 
 
 
 
 
 
 
175
  },
176
  "decoder": {
177
  "type": "Sequence",
tokenizer_config.json CHANGED
@@ -1,6 +1,7 @@
1
  {
2
- "add_bos_token": true,
3
  "add_eos_token": false,
 
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
@@ -43,7 +44,7 @@
43
  "special": true
44
  },
45
  "32002": {
46
- "content": "<|step|>",
47
  "lstrip": false,
48
  "normalized": false,
49
  "rstrip": true,
@@ -51,7 +52,7 @@
51
  "special": true
52
  },
53
  "32003": {
54
- "content": "<|function_output|>",
55
  "lstrip": false,
56
  "normalized": false,
57
  "rstrip": true,
@@ -59,7 +60,7 @@
59
  "special": true
60
  },
61
  "32004": {
62
- "content": "<|tag|>",
63
  "lstrip": false,
64
  "normalized": false,
65
  "rstrip": true,
@@ -67,7 +68,7 @@
67
  "special": true
68
  },
69
  "32005": {
70
- "content": "<|function_call|>",
71
  "lstrip": false,
72
  "normalized": false,
73
  "rstrip": true,
@@ -91,7 +92,7 @@
91
  "special": true
92
  },
93
  "32008": {
94
- "content": "<|raw|>",
95
  "lstrip": false,
96
  "normalized": false,
97
  "rstrip": true,
@@ -99,7 +100,7 @@
99
  "special": true
100
  },
101
  "32009": {
102
- "content": "<|continue|>",
103
  "lstrip": false,
104
  "normalized": false,
105
  "rstrip": true,
@@ -113,234 +114,15 @@
113
  "rstrip": true,
114
  "single_word": false,
115
  "special": true
116
- },
117
- "32011": {
118
- "content": "<|function_list|>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": true,
122
- "single_word": false,
123
- "special": true
124
- },
125
- "32012": {
126
- "content": "<|calc|>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": true,
130
- "single_word": false,
131
- "special": true
132
- },
133
- "32013": {
134
- "content": "<|code|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": true,
138
- "single_word": false,
139
- "special": true
140
- },
141
- "32014": {
142
- "content": "<|/code|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": true,
146
- "single_word": false,
147
- "special": true
148
- },
149
- "32015": {
150
- "content": "<|summary|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": true,
154
- "single_word": false,
155
- "special": true
156
- },
157
- "32016": {
158
- "content": "<|resource|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": true,
162
- "single_word": false,
163
- "special": true
164
- },
165
- "32017": {
166
- "content": "<|assistant_mask|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": true,
170
- "single_word": false,
171
- "special": true
172
- },
173
- "32018": {
174
- "content": "<|start|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": true,
178
- "single_word": false,
179
- "special": true
180
- },
181
- "32019": {
182
- "content": "<|message|>",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": true,
186
- "single_word": false,
187
- "special": true
188
- },
189
- "32020": {
190
- "content": "<|fim_prefix|>",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": true,
194
- "single_word": false,
195
- "special": true
196
- },
197
- "32021": {
198
- "content": "<|fim_middle|>",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": true,
202
- "single_word": false,
203
- "special": true
204
- },
205
- "32022": {
206
- "content": "<|fim_suffix|>",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": true,
210
- "single_word": false,
211
- "special": true
212
- },
213
- "32023": {
214
- "content": "<|meta_start|>",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": true,
218
- "single_word": false,
219
- "special": true
220
- },
221
- "32024": {
222
- "content": "<|ipynb_marker|>",
223
- "lstrip": false,
224
- "normalized": false,
225
- "rstrip": true,
226
- "single_word": false,
227
- "special": true
228
- },
229
- "32025": {
230
- "content": "<|diff_marker|>",
231
- "lstrip": false,
232
- "normalized": false,
233
- "rstrip": true,
234
- "single_word": false,
235
- "special": true
236
- },
237
- "32026": {
238
- "content": "<|ghissue|>",
239
- "lstrip": false,
240
- "normalized": false,
241
- "rstrip": true,
242
- "single_word": false,
243
- "special": true
244
- },
245
- "32027": {
246
- "content": "<|ghreview|>",
247
- "lstrip": false,
248
- "normalized": false,
249
- "rstrip": true,
250
- "single_word": false,
251
- "special": true
252
- },
253
- "32028": {
254
- "content": "<|disc_start|>",
255
- "lstrip": false,
256
- "normalized": false,
257
- "rstrip": true,
258
- "single_word": false,
259
- "special": true
260
- },
261
- "32029": {
262
- "content": "<|disc_sep|>",
263
- "lstrip": false,
264
- "normalized": false,
265
- "rstrip": true,
266
- "single_word": false,
267
- "special": true
268
- },
269
- "32030": {
270
- "content": "<|disc_thread|><|query|>",
271
- "lstrip": false,
272
- "normalized": false,
273
- "rstrip": true,
274
- "single_word": false,
275
- "special": true
276
- },
277
- "32031": {
278
- "content": "<|/query|>",
279
- "lstrip": false,
280
- "normalized": false,
281
- "rstrip": true,
282
- "single_word": false,
283
- "special": true
284
- },
285
- "32032": {
286
- "content": "<|data|>",
287
- "lstrip": false,
288
- "normalized": false,
289
- "rstrip": true,
290
- "single_word": false,
291
- "special": true
292
- },
293
- "32033": {
294
- "content": "<|/data|>",
295
- "lstrip": false,
296
- "normalized": false,
297
- "rstrip": true,
298
- "single_word": false,
299
- "special": true
300
- },
301
- "32034": {
302
- "content": "<|sys|>",
303
- "lstrip": false,
304
- "normalized": false,
305
- "rstrip": true,
306
- "single_word": false,
307
- "special": true
308
- },
309
- "32035": {
310
- "content": "<|/sys|>",
311
- "lstrip": false,
312
- "normalized": false,
313
- "rstrip": true,
314
- "single_word": false,
315
- "special": true
316
- },
317
- "32036": {
318
- "content": "<|inst|>",
319
- "lstrip": false,
320
- "normalized": false,
321
- "rstrip": true,
322
- "single_word": false,
323
- "special": true
324
- },
325
- "32037": {
326
- "content": "<|/inst|>",
327
- "lstrip": false,
328
- "normalized": false,
329
- "rstrip": true,
330
- "single_word": false,
331
- "special": true
332
  }
333
  },
334
- "additional_special_tokens": [
335
- "<|/inst|>"
336
- ],
337
  "bos_token": "<s>",
338
- "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
339
  "clean_up_tokenization_spaces": false,
340
  "eos_token": "<|endoftext|>",
341
  "legacy": false,
342
  "model_max_length": 131072,
343
- "pad_token": "<|end|>",
344
  "padding_side": "left",
345
  "sp_model_kwargs": {},
346
  "tokenizer_class": "LlamaTokenizer",
 
1
  {
2
+ "add_bos_token": false,
3
  "add_eos_token": false,
4
+ "add_prefix_space": null,
5
  "added_tokens_decoder": {
6
  "0": {
7
  "content": "<unk>",
 
44
  "special": true
45
  },
46
  "32002": {
47
+ "content": "<|placeholder1|>",
48
  "lstrip": false,
49
  "normalized": false,
50
  "rstrip": true,
 
52
  "special": true
53
  },
54
  "32003": {
55
+ "content": "<|placeholder2|>",
56
  "lstrip": false,
57
  "normalized": false,
58
  "rstrip": true,
 
60
  "special": true
61
  },
62
  "32004": {
63
+ "content": "<|placeholder3|>",
64
  "lstrip": false,
65
  "normalized": false,
66
  "rstrip": true,
 
68
  "special": true
69
  },
70
  "32005": {
71
+ "content": "<|placeholder4|>",
72
  "lstrip": false,
73
  "normalized": false,
74
  "rstrip": true,
 
92
  "special": true
93
  },
94
  "32008": {
95
+ "content": "<|placeholder5|>",
96
  "lstrip": false,
97
  "normalized": false,
98
  "rstrip": true,
 
100
  "special": true
101
  },
102
  "32009": {
103
+ "content": "<|placeholder6|>",
104
  "lstrip": false,
105
  "normalized": false,
106
  "rstrip": true,
 
114
  "rstrip": true,
115
  "single_word": false,
116
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
  },
 
 
 
119
  "bos_token": "<s>",
120
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
  "legacy": false,
124
  "model_max_length": 131072,
125
+ "pad_token": "<|endoftext|>",
126
  "padding_side": "left",
127
  "sp_model_kwargs": {},
128
  "tokenizer_class": "LlamaTokenizer",