File size: 1,779 Bytes
72c619d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# Identity of this config and the pretrained path it points at.
name: 'whisper_bpe_tokenizer'
config_type: 'preprocessor'
pretrained_path: 'hezarai/whisper-small'

# Truncation settings.
max_length: 512
truncation_strategy: 'longest_first'
truncation_direction: 'right'
stride: 0

# Padding settings.
# NOTE(review): '<pad>' is not among the entries of `special_tokens` in this
# file — confirm the consumer registers it and that ID 0 is the right ID.
padding_strategy: 'longest'
padding_direction: 'right'
pad_to_multiple_of: 0
pad_token_id: 0
pad_token: '<pad>'
pad_token_type_id: 0

# Unknown-token string (same text as `eos_token` further down in this file).
unk_token: '<|endoftext|>'
# Special-token strings. Every entry is single-quoted so it is unambiguously a
# string; the order of the entries is kept exactly as it was.
special_tokens:
  # NOTE(review): '<|endoftext|>' is listed twice — confirm whether the
  # duplicate is intentional before removing it.
  - '<|endoftext|>'
  - '<|endoftext|>'
  - '<|startoftranscript|>'
  # Language-code tokens.
  - '<|en|>'
  - '<|zh|>'
  - '<|de|>'
  - '<|es|>'
  - '<|ru|>'
  - '<|ko|>'
  - '<|fr|>'
  - '<|ja|>'
  - '<|pt|>'
  - '<|tr|>'
  - '<|pl|>'
  - '<|ca|>'
  - '<|nl|>'
  - '<|ar|>'
  - '<|sv|>'
  - '<|it|>'
  - '<|id|>'
  - '<|hi|>'
  - '<|fi|>'
  - '<|vi|>'
  - '<|he|>'
  - '<|uk|>'
  - '<|el|>'
  - '<|ms|>'
  - '<|cs|>'
  - '<|ro|>'
  - '<|da|>'
  - '<|hu|>'
  - '<|ta|>'
  - '<|no|>'
  - '<|th|>'
  - '<|ur|>'
  - '<|hr|>'
  - '<|bg|>'
  - '<|lt|>'
  - '<|la|>'
  - '<|mi|>'
  - '<|ml|>'
  - '<|cy|>'
  - '<|sk|>'
  - '<|te|>'
  - '<|fa|>'
  - '<|lv|>'
  - '<|bn|>'
  - '<|sr|>'
  - '<|az|>'
  - '<|sl|>'
  - '<|kn|>'
  - '<|et|>'
  - '<|mk|>'
  - '<|br|>'
  - '<|eu|>'
  - '<|is|>'
  - '<|hy|>'
  - '<|ne|>'
  - '<|mn|>'
  - '<|bs|>'
  - '<|kk|>'
  - '<|sq|>'
  - '<|sw|>'
  - '<|gl|>'
  - '<|mr|>'
  - '<|pa|>'
  - '<|si|>'
  - '<|km|>'
  - '<|sn|>'
  - '<|yo|>'
  - '<|so|>'
  - '<|af|>'
  - '<|oc|>'
  - '<|ka|>'
  - '<|be|>'
  - '<|tg|>'
  - '<|sd|>'
  - '<|gu|>'
  - '<|am|>'
  - '<|yi|>'
  - '<|lo|>'
  - '<|uz|>'
  - '<|fo|>'
  - '<|ht|>'
  - '<|ps|>'
  - '<|tk|>'
  - '<|nn|>'
  - '<|mt|>'
  - '<|sa|>'
  - '<|lb|>'
  - '<|my|>'
  - '<|bo|>'
  - '<|tl|>'
  - '<|mg|>'
  - '<|as|>'
  - '<|tt|>'
  - '<|haw|>'
  - '<|ln|>'
  - '<|ha|>'
  - '<|ba|>'
  - '<|jw|>'
  - '<|su|>'
  # Remaining control tokens.
  - '<|translate|>'
  - '<|transcribe|>'
  - '<|startoflm|>'
  - '<|startofprev|>'
  - '<|nocaptions|>'
  - '<|notimestamps|>'
# Sub-word tokenization options; both affixes are empty strings.
continuing_subword_prefix: ''
end_of_word_suffix: ''
fuse_unk: false
add_prefix_space: false

# Vocabulary and alphabet settings.
vocab_size: 50364
min_frequency: 2
limit_alphabet: 1000
initial_alphabet: []
show_progress: true

# Special-token strings and their IDs.
# NOTE(review): bos_token and eos_token are different strings, yet
# unk_token_id, bos_token_id and eos_token_id are all 50257 — confirm that
# the shared ID is intended.
bos_token: '<|startoftranscript|>'
bos_token_id: 50257
eos_token: '<|endoftext|>'
eos_token_id: 50257
unk_token_id: 50257
add_bos_token: false

# Remaining limits and flags.
model_max_length: 1024
predict_timestamps: false
|