arxyzan committed on
Commit
309a24b
1 Parent(s): bd7e88b

Hezar: Upload tokenizer and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +5 -10
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,6 +1,5 @@
1
  name: wordpiece_tokenizer
2
  config_type: preprocessor
3
- pretrained_path: hezarai/bert-fa-pos-lscp-500k
4
  max_length: 512
5
  truncation_strategy: longest_first
6
  truncation_direction: right
@@ -8,18 +7,14 @@ stride: 0
8
  padding_strategy: longest
9
  padding_direction: right
10
  pad_to_multiple_of: 0
11
- pad_token_id: 0
12
- pad_token: '[PAD]'
13
  pad_token_type_id: 0
14
  unk_token: '[UNK]'
15
- special_tokens:
16
- - '[UNK]'
17
- - '[SEP]'
18
- - '[CLS]'
19
- - '[PAD]'
20
- - '[MASK]'
21
  wordpieces_prefix: '##'
22
- vocab_size: 30000
23
  min_frequency: 2
24
  limit_alphabet: 1000
25
  initial_alphabet: []
 
1
  name: wordpiece_tokenizer
2
  config_type: preprocessor
 
3
  max_length: 512
4
  truncation_strategy: longest_first
5
  truncation_direction: right
 
7
  padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
 
 
10
  pad_token_type_id: 0
11
  unk_token: '[UNK]'
12
+ sep_token: '[SEP]'
13
+ pad_token: '[PAD]'
14
+ cls_token: '[CLS]'
15
+ mask_token: '[MASK]'
 
 
16
  wordpieces_prefix: '##'
17
+ vocab_size: 42000
18
  min_frequency: 2
19
  limit_alphabet: 1000
20
  initial_alphabet: []