avia-4x500m / base /tokenizer.py
shivendrra's picture
added train and model files
7f4e854 verified
raw
history blame contribute delete
No virus
593 Bytes
import tiktoken
# Fallback tiktoken encoding name, used when Tokenizer is given no `encoding`.
pre_encodings = "p50k_base"

# Fallback model name, used when Tokenizer is given no `model`.
pre_model = "text-davinci-003"
class Tokenizer:
    """Thin wrapper around a single ``tiktoken`` encoding.

    The encoding is chosen once at construction time and every other
    method simply delegates to it.

    Attributes set by ``__init__``:
        encodings: the encoding name passed in, or ``pre_encodings``.
        model: the model name passed in, or ``pre_model``.
        tokenizer: the loaded tiktoken encoding object.
    """

    def __init__(self, encoding=None, model=None):
        """Pick and load the underlying tiktoken encoding.

        Args:
            encoding: Encoding name handed to ``tiktoken.get_encoding``.
                Defaults to the module-level ``pre_encodings``.
            model: Model name handed to ``tiktoken.encoding_for_model``.
                Defaults to the module-level ``pre_model``.

        Resolution order:
            1. ``model`` given  -> encoding looked up from the model
               (same result as before; ``encoding`` is not loaded).
            2. only ``encoding`` given -> that encoding is used.
            3. neither given -> encoding looked up from ``pre_model``.
        """
        self.encodings = encoding if encoding is not None else pre_encodings
        self.model = model if model is not None else pre_model

        # Previously both lookups ran unconditionally and the model-based one
        # overwrote the other, so a caller-supplied `encoding` was silently
        # ignored. Load exactly one encoding instead.
        if model is None and encoding is not None:
            self.tokenizer = tiktoken.get_encoding(self.encodings)
        else:
            self.tokenizer = tiktoken.encoding_for_model(self.model)

    def encode(self, data):
        """Return the result of encoding ``data`` with the loaded tokenizer."""
        return self.tokenizer.encode(data)

    def decode(self, tokens):
        """Return the result of decoding ``tokens`` with the loaded tokenizer."""
        return self.tokenizer.decode(tokens)

    def get_vocab(self):
        """Return the tokenizer's ``n_vocab`` value.

        Note: this is the vocabulary size reported by the encoding, not the
        vocabulary itself.
        """
        return self.tokenizer.n_vocab