File size: 593 Bytes
7f4e854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import tiktoken

# Fallback model / encoding names that Tokenizer uses when the caller
# passes None for the corresponding constructor argument.
pre_model = "text-davinci-003"
pre_encodings = "p50k_base"
class Tokenizer:
  """Thin wrapper around a tiktoken encoding object.

  The underlying encoding is chosen either by encoding name or by model
  name and stored on ``self.tokenizer``; the methods below simply delegate
  to it.

  Attributes:
    encodings: the encoding name passed in, or ``pre_encodings`` if None.
    model: the model name passed in, or ``pre_model`` if None.
    tokenizer: the object returned by tiktoken that does the actual work.
  """

  def __init__(self, encoding=None, model=None):
    """Select and load the tiktoken encoding.

    Args:
      encoding: name of a tiktoken encoding. Honored only when ``model``
        is not given.
      model: model name to resolve through ``tiktoken.encoding_for_model``.
        When given it takes precedence over ``encoding`` (this matches
        what the previous implementation always did).

    With neither argument, the module-level default model is used.
    Errors raised by tiktoken for unknown names propagate unchanged.
    """
    self.encodings = encoding if encoding is not None else pre_encodings
    self.model = model if model is not None else pre_model

    if encoding is not None and model is None:
      # Previously the encoding loaded here was immediately overwritten by
      # the model lookup, so an explicit ``encoding`` had no effect.
      self.tokenizer = tiktoken.get_encoding(self.encodings)
    else:
      self.tokenizer = tiktoken.encoding_for_model(self.model)

  def encode(self, data):
    """Return the result of encoding ``data`` with the loaded tokenizer."""
    return self.tokenizer.encode(data)

  def decode(self, tokens):
    """Return the result of decoding ``tokens`` with the loaded tokenizer."""
    return self.tokenizer.decode(tokens)

  def get_vocab(self):
    """Return the loaded tokenizer's ``n_vocab`` attribute."""
    return self.tokenizer.n_vocab