jinaai
/

jina-bert-flash-implementation

@@ -1,48 +1,69 @@
-import torch
-import numpy as np
-from transformers import RobertaTokenizer, BatchEncoding
 import warnings
 class JinaTokenizer(RobertaTokenizer):
-    def __init__(self, *args, task_type_vocab_size=6, **kwargs):
         super().__init__(*args, **kwargs)
         self.task_type_vocab_size = task_type_vocab_size
     def __call__(self, *args, task_type=None, **kwargs):
-        batch_encoding = super().__call__(*args, **kwargs)
-        batch_encoding = BatchEncoding(
-            {
-                'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                **batch_encoding,
-            },
-            tensor_type=kwargs.get('return_tensors'),
         )
-        return batch_encoding
-    def _batch_encode_plus(self, *args, task_type=None, **kwargs):
-        batch_encoding = super()._batch_encode_plus(*args, **kwargs)
-        if task_type is not None:
-            batch_encoding = BatchEncoding(
-                {
-                    'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                    **batch_encoding,
-                },
-                tensor_type=kwargs.get('return_tensors'),
             )
-        return batch_encoding
-    def _encode_plus(self, *args, task_type=None, **kwargs):
-        batch_encoding = super()._encode_plus(*args, **kwargs)
         if task_type is not None:
-            batch_encoding = BatchEncoding(
-                {
-                    'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                    **batch_encoding,
-                },
-                tensor_type=kwargs.get('return_tensors'),
-            )
-        return batch_encoding
     @staticmethod
     def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):

 import warnings
+import numpy as np
+import torch
+from transformers import BatchEncoding, RobertaTokenizer
 class JinaTokenizer(RobertaTokenizer):
+    def __init__(
+        self, *args, task_type_vocab_size=6, cls_token_interval=None, **kwargs
+    ):
+        """Create the tokenizer and record its two extra settings.
+
+        Args:
+            *args: Positional arguments forwarded unchanged to the parent
+                ``__init__``.
+            task_type_vocab_size: Stored as ``self.task_type_vocab_size``
+                (default 6). Presumably the number of distinct task-type ids;
+                confirm against the model configuration.
+            cls_token_interval: Stored as ``self.cls_token_interval``. ``None``
+                (the default) disables the extra-CLS insertion performed in
+                ``_process_encoding``.
+            **kwargs: Keyword arguments forwarded unchanged to the parent
+                ``__init__``.
+        """
         super().__init__(*args, **kwargs)
         self.task_type_vocab_size = task_type_vocab_size
+        self.cls_token_interval = cls_token_interval
     def __call__(self, *args, task_type=None, **kwargs):
+        """Tokenize through the parent ``__call__``, passing ``task_type`` along.
+
+        ``task_type`` is copied into ``kwargs`` (also when it is ``None``)
+        before delegating, so the parent call receives it as an ordinary
+        keyword argument. ``_process_encoding`` reads it back from the kwargs
+        it is handed.
+
+        Args:
+            *args: Positional arguments forwarded to the parent ``__call__``.
+            task_type: Optional task type; ``None`` means no task-type ids are
+                added by ``_process_encoding``.
+            **kwargs: Keyword arguments forwarded to the parent ``__call__``.
+
+        Returns:
+            Whatever the parent ``__call__`` returns.
+        """
+        kwargs['task_type'] = task_type
+        return super().__call__(*args, **kwargs)
+    def _encode_plus(self, *args, **kwargs):
+        """Encode via the parent ``_encode_plus``, then post-process the result.
+
+        The same ``kwargs`` are given to the parent call and to
+        ``_process_encoding``.
+        """
+        encoded = super()._encode_plus(*args, **kwargs)
+        return self._process_encoding(encoded, **kwargs)
+    def _batch_encode_plus(self, *args, **kwargs):
+        """Batch-encode via the parent method, then post-process the result.
+
+        The same ``kwargs`` are given to the parent ``_batch_encode_plus`` and
+        to ``_process_encoding``.
+        """
+        return self._process_encoding(
+            super()._batch_encode_plus(*args, **kwargs), **kwargs
         )
+    def _process_encoding(self, batch_encoding, **kwargs):
+        """Post-process an encoding: optional CLS insertion, optional task ids.
+
+        Args:
+            batch_encoding: Mapping produced by the parent encode method. It is
+                modified in place before being re-wrapped.
+            **kwargs: Keyword arguments of the originating encode call. Only
+                ``'task_type'`` and ``'return_tensors'`` are read here.
+
+        Returns:
+            A ``BatchEncoding`` built from the updated mapping, with
+            ``tensor_type`` taken from ``kwargs['return_tensors']`` (``None``
+            when absent).
+        """
+        task_type = kwargs.get('task_type')
+        # CLS insertion is opt-in: it only runs when an interval was configured.
+        if self.cls_token_interval is not None:
+            modified_input_ids, modified_attention_mask = self._insert_cls_tokens(
+                batch_encoding
             )
+            batch_encoding['input_ids'] = modified_input_ids
+            # The rebuilt mask is stored only if the encoding already had one.
+            # NOTE(review): no other per-token field is resized here; if the
+            # encoding carries any (e.g. token_type_ids) it keeps its original
+            # length - confirm none is requested together with CLS insertion.
+            if 'attention_mask' in batch_encoding:
+                batch_encoding['attention_mask'] = modified_attention_mask
         if task_type is not None:
+            # Runs after the CLS insertion, so the helper sees the final ids.
+            task_type_ids = self._get_task_type_ids(batch_encoding, task_type)
+            batch_encoding['task_type_ids'] = task_type_ids
+        return BatchEncoding(batch_encoding, tensor_type=kwargs.get('return_tensors'))
+    def _insert_cls_tokens(self, batch_encoding):
+        """Insert an extra CLS token after every ``cls_token_interval`` tokens.
+
+        The first token of each sequence is kept as is. The remainder is cut
+        into chunks of ``cls_token_interval`` tokens and ``cls_token_id`` is
+        placed after every chunk except the last one.
+
+        Args:
+            batch_encoding: Mapping with ``'input_ids'`` and, optionally,
+                ``'attention_mask'``. Each may be a tensor/array (anything with
+                ``tolist()``) or plain lists, and either a batch of sequences
+                or one unbatched sequence.
+
+        Returns:
+            ``(input_ids, attention_mask)`` with the same nesting as the input.
+            ``torch.long`` tensors when ``input_ids`` had ``tolist()``, plain
+            lists otherwise (a ragged list batch cannot be stacked). Without an
+            incoming mask the returned mask is all ones.
+
+        Raises:
+            ValueError: If ``cls_token_interval`` is ``None`` or below 1.
+        """
+        interval = self.cls_token_interval
+        # range() rejects a zero step and yields nothing for a negative one,
+        # which would silently drop every token but the first.
+        if interval is None or interval < 1:
+            raise ValueError(
+                f'cls_token_interval must be a positive integer, got {interval!r}'
+            )
+
+        def to_lists(value):
+            # Tensors and arrays expose tolist(); plain lists are copied.
+            return value.tolist() if hasattr(value, 'tolist') else list(value)
+
+        raw_ids = batch_encoding['input_ids']
+        as_tensors = hasattr(raw_ids, 'tolist')
+        sequences = to_lists(raw_ids)
+        masks = None
+        if 'attention_mask' in batch_encoding:
+            masks = to_lists(batch_encoding['attention_mask'])
+
+        # A flat list of ids is one unbatched sequence: wrap it, unwrap later.
+        unbatched = bool(sequences) and not isinstance(sequences[0], (list, tuple))
+        if unbatched:
+            sequences = [sequences]
+            if masks is not None:
+                masks = [masks]
+
+        new_input_ids = []
+        new_attention_masks = []
+        for row, sequence in enumerate(sequences):
+            sequence = list(sequence)
+            if masks is None:
+                mask = [1] * len(sequence)
+            else:
+                mask = list(masks[row])
+            if not sequence:
+                new_input_ids.append([])
+                new_attention_masks.append([])
+                continue
+            modified_sequence = [sequence[0]]
+            modified_mask = [mask[0]]
+            for start in range(1, len(sequence), interval):
+                stop = start + interval
+                modified_sequence.extend(sequence[start:stop])
+                modified_mask.extend(mask[start:stop])
+                if stop < len(sequence):
+                    modified_sequence.append(self.cls_token_id)
+                    # The inserted CLS opens the next chunk, so it takes that
+                    # chunk's first mask value; a CLS inside padding stays
+                    # masked out instead of being attended to.
+                    modified_mask.append(mask[stop])
+            new_input_ids.append(modified_sequence)
+            new_attention_masks.append(modified_mask)
+
+        if unbatched:
+            new_input_ids = new_input_ids[0]
+            new_attention_masks = new_attention_masks[0]
+        if as_tensors:
+            new_input_ids = torch.tensor(new_input_ids, dtype=torch.long)
+            new_attention_masks = torch.tensor(new_attention_masks, dtype=torch.long)
+        return new_input_ids, new_attention_masks
     @staticmethod
     def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):