File size: 11,757 Bytes
4c65bff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 |
import os
from typing import List, Union
import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs
from .tokenization_bert import BertTokenizer
class TFBertTokenizer(tf.keras.layers.Layer):
    """
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.

    Args:
        vocab_list (`list`):
            List containing the vocabulary.
        do_lower_case (`bool`):
            Whether or not to lowercase the input when tokenizing.
        cls_token_id (`int`, *optional*):
            The id of the classifier token which is used when doing sequence classification (classification of the
            whole sequence instead of per-token classification). It is the first token of the sequence when built
            with special tokens. If `None`, the index of `"[CLS]"` in `vocab_list` is used.
        sep_token_id (`int`, *optional*):
            The id of the separator token, which is used when building a sequence from multiple sequences, e.g. two
            sequences for sequence classification or for a text and a question for question answering. It is also
            used as the last token of a sequence built with special tokens. If `None`, the index of `"[SEP]"` in
            `vocab_list` is used.
        pad_token_id (`int`, *optional*):
            The id of the token used for padding, for example when batching sequences of different lengths. If
            `None`, the index of `"[PAD]"` in `vocab_list` is used.
        padding (`str`, defaults to `"longest"`):
            The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
            or `"max_length"`, to pad all inputs to the maximum length supported by the tokenizer.
        truncation (`bool`, *optional*, defaults to `True`):
            Whether to truncate the sequence to the maximum length.
        max_length (`int`, *optional*, defaults to `512`):
            The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
            `truncation` is `True`).
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set, the sequence will be padded to a multiple of this value.
        return_token_type_ids (`bool`, *optional*, defaults to `True`):
            Whether to return token_type_ids.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention_mask.
        use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
            If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer
            class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to
            TFLite.
        tokenizer_kwargs:
            Extra keyword arguments forwarded unchanged to the underlying Tensorflow Text tokenizer constructor.
    """

    def __init__(
        self,
        vocab_list: List,
        do_lower_case: bool,
        cls_token_id: int = None,
        sep_token_id: int = None,
        pad_token_id: int = None,
        padding: str = "longest",
        truncation: bool = True,
        max_length: int = 512,
        pad_to_multiple_of: int = None,
        return_token_type_ids: bool = True,
        return_attention_mask: bool = True,
        use_fast_bert_tokenizer: bool = True,
        **tokenizer_kwargs,
    ):
        super().__init__()
        if use_fast_bert_tokenizer:
            self.tf_tokenizer = FastBertTokenizer(
                vocab_list, token_out_type=tf.int64, lower_case_nfd_strip_accents=do_lower_case, **tokenizer_kwargs
            )
        else:
            # The non-fast tokenizer needs a lookup table mapping each vocabulary string to its position.
            lookup_table = tf.lookup.StaticVocabularyTable(
                tf.lookup.KeyValueTensorInitializer(
                    keys=vocab_list,
                    key_dtype=tf.string,
                    values=tf.range(tf.size(vocab_list, out_type=tf.int64), dtype=tf.int64),
                    value_dtype=tf.int64,
                ),
                num_oov_buckets=1,
            )
            self.tf_tokenizer = BertTokenizerLayer(
                lookup_table, token_out_type=tf.int64, lower_case=do_lower_case, **tokenizer_kwargs
            )

        self.vocab_list = vocab_list
        self.do_lower_case = do_lower_case
        # Compare against None explicitly: 0 is a valid token id (commonly the pad id) and must not trigger the
        # vocabulary lookup, which would raise ValueError when the literal token is absent from the vocabulary.
        self.cls_token_id = vocab_list.index("[CLS]") if cls_token_id is None else cls_token_id
        self.sep_token_id = vocab_list.index("[SEP]") if sep_token_id is None else sep_token_id
        self.pad_token_id = vocab_list.index("[PAD]") if pad_token_id is None else pad_token_id
        self.paired_trimmer = ShrinkLongestTrimmer(max_length - 3, axis=1)  # Allow room for special tokens
        self.max_length = max_length
        self.padding = padding
        self.truncation = truncation
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_token_type_ids = return_token_type_ids
        self.return_attention_mask = return_attention_mask
        # Remembered only so that get_config() can report which backend was selected.
        self.use_fast_bert_tokenizer = use_fast_bert_tokenizer

    @classmethod
    def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs):  # noqa: F821
        """
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.
            kwargs:
                Forwarded to the constructor. `do_lower_case`, `cls_token_id`, `sep_token_id` and `pad_token_id`
                override the values read from `tokenizer` when they are supplied and not `None`.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        """
        do_lower_case = kwargs.pop("do_lower_case", None)
        do_lower_case = tokenizer.do_lower_case if do_lower_case is None else do_lower_case
        cls_token_id = kwargs.pop("cls_token_id", None)
        cls_token_id = tokenizer.cls_token_id if cls_token_id is None else cls_token_id
        sep_token_id = kwargs.pop("sep_token_id", None)
        sep_token_id = tokenizer.sep_token_id if sep_token_id is None else sep_token_id
        pad_token_id = kwargs.pop("pad_token_id", None)
        pad_token_id = tokenizer.pad_token_id if pad_token_id is None else pad_token_id

        # Order the tokens by their id so that each token's list position is its id.
        vocab = tokenizer.get_vocab()
        vocab = sorted(vocab.items(), key=lambda x: x[1])
        vocab_list = [entry[0] for entry in vocab]
        return cls(
            vocab_list=vocab_list,
            do_lower_case=do_lower_case,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        """
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```python
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased")
        ```
        """
        try:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        except Exception:
            # Fall back to the fast tokenizer on any loading error, while still letting KeyboardInterrupt and
            # SystemExit propagate.
            from .tokenization_bert_fast import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        return cls.from_tokenizer(tokenizer, **kwargs)

    def unpaired_tokenize(self, texts):
        """
        Tokenize a batch of strings into token ids, without adding special tokens.

        The input is case-folded first when `do_lower_case` is set. All dimensions after the first are merged into
        one, so each input string yields a single sequence of ids.
        """
        if self.do_lower_case:
            texts = case_fold_utf8(texts)
        tokens = self.tf_tokenizer.tokenize(texts)
        return tokens.merge_dims(1, -1)

    def call(
        self,
        text,
        text_pair=None,
        padding=None,
        truncation=None,
        max_length=None,
        pad_to_multiple_of=None,
        return_token_type_ids=None,
        return_attention_mask=None,
    ):
        """
        Tokenize `text` (and optionally `text_pair`), add special tokens, then truncate and pad.

        Every argument other than `text` and `text_pair` falls back to the value given at construction time when it
        is `None`. Paired input can be supplied either as separate `text` / `text_pair` arguments (each at most rank
        1) or as a single rank-2 `text` whose columns 0 and 1 hold the two segments.

        Returns:
            `dict`: Always contains `"input_ids"`. Contains `"attention_mask"` when `return_attention_mask` is true
            and `"token_type_ids"` when `return_token_type_ids` is true.

        Raises:
            ValueError: If `padding` is not `"longest"` or `"max_length"`, if `max_length` is overridden together
                with an explicit `text_pair`, or if `text` / `text_pair` are multidimensional when `text_pair` is
                supplied.
        """
        if padding is None:
            padding = self.padding
        if padding not in ("longest", "max_length"):
            raise ValueError("Padding must be either 'longest' or 'max_length'!")
        if max_length is not None and text_pair is not None:
            # Because we have to instantiate a Trimmer to do it properly
            # NOTE(review): this guard does not cover pairs packed into a rank-2 `text`; in that case the trimmer
            # still uses the construction-time max_length - confirm whether that is intended.
            raise ValueError("max_length cannot be overridden at call time when truncating paired texts!")
        if max_length is None:
            max_length = self.max_length
        if truncation is None:
            truncation = self.truncation
        if pad_to_multiple_of is None:
            pad_to_multiple_of = self.pad_to_multiple_of
        if return_token_type_ids is None:
            return_token_type_ids = self.return_token_type_ids
        if return_attention_mask is None:
            return_attention_mask = self.return_attention_mask

        if not isinstance(text, tf.Tensor):
            text = tf.convert_to_tensor(text)
        if text_pair is not None and not isinstance(text_pair, tf.Tensor):
            text_pair = tf.convert_to_tensor(text_pair)
        if text_pair is not None:
            if text.shape.rank > 1:
                raise ValueError("text argument should not be multidimensional when a text pair is supplied!")
            if text_pair.shape.rank > 1:
                raise ValueError("text_pair should not be multidimensional!")
        if text.shape.rank == 2:
            # A rank-2 input packs both segments of each pair into columns 0 and 1.
            text, text_pair = text[:, 0], text[:, 1]

        text = self.unpaired_tokenize(text)
        if text_pair is None:  # Unpaired text
            if truncation:
                text = text[:, : max_length - 2]  # Allow room for special tokens
            input_ids, token_type_ids = combine_segments(
                (text,), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id
            )
        else:  # Paired text
            text_pair = self.unpaired_tokenize(text_pair)
            if truncation:
                text, text_pair = self.paired_trimmer.trim([text, text_pair])
            input_ids, token_type_ids = combine_segments(
                (text, text_pair), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id
            )

        if padding == "longest":
            pad_length = input_ids.bounding_shape(axis=1)
            if pad_to_multiple_of is not None:
                # No ceiling division in tensorflow, so we negate floordiv instead
                pad_length = pad_to_multiple_of * (-tf.math.floordiv(-pad_length, pad_to_multiple_of))
        else:
            pad_length = max_length

        input_ids, attention_mask = pad_model_inputs(input_ids, max_seq_length=pad_length, pad_value=self.pad_token_id)
        output = {"input_ids": input_ids}
        if return_attention_mask:
            output["attention_mask"] = attention_mask
        if return_token_type_ids:
            # NOTE(review): token_type_ids are padded with pad_token_id; this only equals a zero segment id when
            # pad_token_id is 0 - confirm that is the intended padding value.
            token_type_ids, _ = pad_model_inputs(
                token_type_ids, max_seq_length=pad_length, pad_value=self.pad_token_id
            )
            output["token_type_ids"] = token_type_ids
        return output

    def get_config(self):
        """
        Return the constructor arguments needed to rebuild this layer.

        Every named constructor argument is included so that rebuilding from the config keeps the padding,
        truncation, length and output settings rather than resetting them to their defaults. Extra
        `tokenizer_kwargs` passed at construction time are not stored and therefore not serialized.
        """
        return {
            "vocab_list": self.vocab_list,
            "do_lower_case": self.do_lower_case,
            "cls_token_id": self.cls_token_id,
            "sep_token_id": self.sep_token_id,
            "pad_token_id": self.pad_token_id,
            "padding": self.padding,
            "truncation": self.truncation,
            "max_length": self.max_length,
            "pad_to_multiple_of": self.pad_to_multiple_of,
            "return_token_type_ids": self.return_token_type_ids,
            "return_attention_mask": self.return_attention_mask,
            "use_fast_bert_tokenizer": self.use_fast_bert_tokenizer,
        }
|