import os
import pandas as pd
import torch
from settings import parquet_file
import tiktoken  # tokenizer library for use with OpenAI LLMs
from llama_index.legacy.text_splitter import SentenceSplitter
from sentence_transformers import SentenceTransformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# create tensors on GPU if available
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
def chunk_vectorize(doc_content: dict = None,
                    chunk_size: int = 256,   # token limit for 'all-mpnet-base-v2'
                    chunk_overlap: int = 20,  # some overlap to link the chunks
                    encoder: str = 'gpt-3.5-turbo-0613',
                    model_name: str = 'sentence-transformers/all-mpnet-base-v2'):  # can try all-MiniLM-L6-v2
    """Split each document into sentence chunks, embed every chunk, and append
    the (file, content, content_embedding) rows to the parquet file.

    See tests in chunking_indexing.ipynb for more details.
    """
    encoding = tiktoken.encoding_for_model(encoder)
    splitter = SentenceSplitter(chunk_size=chunk_size,
                                tokenizer=encoding.encode,
                                chunk_overlap=chunk_overlap)
    # create the splits for every document
    contents_splits = {}
    for fname, content in doc_content.items():
        splits = [splitter.split_text(page) for page in content]
        contents_splits[fname] = [split for sublist in splits for split in sublist]
    # embed every chunk with the sentence-transformer model
    model = SentenceTransformer(model_name)
    content_emb = {}
    for fname, splits in contents_splits.items():
        content_emb[fname] = [(split, model.encode(split)) for split in splits]

    # save fname since it carries information, and could be used as a property in Weaviate
    text_vector_tuples = [
        (fname, split, emb.tolist())
        for fname, splits_emb in content_emb.items()
        for split, emb in splits_emb
    ]
    new_df = pd.DataFrame(
        text_vector_tuples,
        columns=['file', 'content', 'content_embedding']
    )

    # load the existing parquet file if it exists and update it
    if os.path.exists(parquet_file):
        new_df = pd.concat([pd.read_parquet(parquet_file), new_df])

    # no optimization here (zipping etc) since the data is small
    new_df.to_parquet(parquet_file, index=False)
    return
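
# Example usage (a minimal sketch): chunk_vectorize expects a dict mapping a file
# name to a list of page/section strings. The file name and contents below are
# hypothetical placeholders, not real data.
if __name__ == '__main__':
    docs = {
        'example_transcript.txt': [
            "First page of transcript text...",
            "Second page of transcript text...",
        ],
    }
    # appends (file, content, content_embedding) rows to parquet_file from settings
    chunk_vectorize(docs)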
# TODO
# import unittest
# from unitesting_utils import load_impact_theory_data
#
# class TestSplitContents(unittest.TestCase):
#     '''
#     Unit test to ensure proper functionality of split_contents function
#     '''
#     def test_split_contents(self):
#         import tiktoken
#         from llama_index.text_splitter import SentenceSplitter
#
#         data = load_impact_theory_data()
#         subset = data[:3]
#         chunk_size = 256
#         chunk_overlap = 0
#         encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
#         gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=chunk_overlap)
#         results = split_contents(subset, gpt35_txt_splitter)
#         self.assertEqual(len(results), 3)
#         self.assertEqual(len(results[0]), 83)
#         self.assertEqual(len(results[1]), 178)
#         self.assertEqual(len(results[2]), 144)
#         self.assertTrue(isinstance(results, list))
#         self.assertTrue(isinstance(results[0], list))
#         self.assertTrue(isinstance(results[0][0], str))
#
# unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestSplitContents))