# bert-fa-pos-lscp-500k / preprocessor / tokenizer_config.yaml
name: wordpiece_tokenizer
config_type: preprocessor
pretrained_path: hezarai/bert-fa-pos-lscp-500k
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0
padding_strategy: longest
padding_direction: right
pad_to_multiple_of: 0
pad_token_id: 0
pad_token: '[PAD]'
pad_token_type_id: 0
unk_token: '[UNK]'
special_tokens:
- '[UNK]'
- '[SEP]'
- '[CLS]'
- '[PAD]'
- '[MASK]'
wordpieces_prefix: '##'
vocab_size: 30000
min_frequency: 2
limit_alphabet: 1000
initial_alphabet: []
show_progress: true
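
For reference, a minimal sketch of loading this preprocessor from the Hub with the Hezar library. It follows Hezar's documented `Preprocessor.load` pattern; the callable interface of the returned tokenizer and the exact shape of its output are assumptions, not something specified by this config file. At runtime, the fields above govern behavior: `max_length: 512` with `longest_first` truncation bounds each sequence, and `padding_strategy: longest` pads a batch to its longest item using `[PAD]` (id 0).

```python
# Minimal sketch, assuming the hezar package is installed and that
# Preprocessor.load and the tokenizer's __call__ behave as in Hezar's docs.
from hezar.preprocessors import Preprocessor

# Downloads the preprocessor files (including this tokenizer_config.yaml)
# from the hezarai/bert-fa-pos-lscp-500k repo and builds the WordPiece tokenizer.
tokenizer = Preprocessor.load("hezarai/bert-fa-pos-lscp-500k")

# Encoding a small Persian batch; padding and truncation follow the settings
# above (max_length=512, longest_first truncation, pad-to-longest with [PAD]).
encoded = tokenizer(["این یک جملهٔ آزمایشی است.", "متن دوم"])
print(encoded)
```

The remaining fields (`vocab_size`, `min_frequency`, `limit_alphabet`, `initial_alphabet`, `special_tokens`, `wordpieces_prefix`, `show_progress`) record how the WordPiece vocabulary was trained rather than anything applied at inference time, so they do not affect the call above.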