import tiktoken
# Defaults used by Tokenizer when the caller does not pass an explicit value.
pre_encodings = 'p50k_base'  # default tiktoken encoding name
pre_model = 'text-davinci-003'  # default model name used to look up an encoding
class Tokenizer:
    """Thin wrapper around a tiktoken encoding object.

    The underlying encoding is selected as follows:

    * only ``encoding`` given  -> ``tiktoken.get_encoding(encoding)``
    * ``model`` given (with or without ``encoding``), or neither given
      -> ``tiktoken.encoding_for_model(model or pre_model)``

    Previously the encoding built from ``encoding`` was always overwritten
    by the model lookup, so an explicit ``encoding`` had no effect.

    Attributes:
        encodings: The encoding name passed in, or ``pre_encodings``.
        model: The model name passed in, or ``pre_model``.
        tokenizer: The tiktoken encoding object used by all methods.
    """

    def __init__(self, encoding=None, model=None):
        """Create the tokenizer.

        Args:
            encoding: Optional tiktoken encoding name. Used to build the
                tokenizer only when ``model`` is not supplied.
            model: Optional model name. When supplied it takes precedence
                and the encoding is resolved from the model.

        Raises:
            Whatever ``tiktoken.get_encoding`` / ``tiktoken.encoding_for_model``
            raise for unknown names (ValueError / KeyError per the tiktoken
            documentation).
        """
        self.encodings = encoding if encoding is not None else pre_encodings
        self.model = model if model is not None else pre_model

        if model is None and encoding is not None:
            # The caller asked for a specific encoding and no model:
            # honour it instead of silently using the default model's encoding.
            self.tokenizer = tiktoken.get_encoding(self.encodings)
        else:
            # An explicit model wins; with no arguments at all the default
            # model is used, exactly as before.
            self.tokenizer = tiktoken.encoding_for_model(self.model)

    def encode(self, data):
        """Return the result of encoding ``data`` with the underlying tokenizer."""
        return self.tokenizer.encode(data)

    def decode(self, tokens):
        """Return the result of decoding ``tokens`` with the underlying tokenizer."""
        return self.tokenizer.decode(tokens)

    def get_vocab(self):
        """Return the tokenizer's ``n_vocab`` value.

        Note: this is the ``n_vocab`` attribute of the underlying encoding
        (a size), not the vocabulary mapping itself.
        """
        return self.tokenizer.n_vocab