import os

import pandas as pd
import tiktoken  # tokenizer library for use with OpenAI LLMs
import torch
from llama_index.legacy.text_splitter import SentenceSplitter
from sentence_transformers import SentenceTransformer

from settings import parquet_file

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create tensors on GPU if available
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')


def chunk_vectorize(doc_content: dict,
                    chunk_size: int = 256,   # token limit for 'all-mpnet-base-v2'
                    chunk_overlap: int = 20,  # some overlap to link the chunks
                    encoder: str = 'gpt-3.5-turbo-0613',
                    model_name: str = 'sentence-transformers/all-mpnet-base-v2'):  # can try all-MiniLM-L6-v2
    """Split each document into overlapping chunks, embed every chunk,
    and append the (file, content, embedding) rows to the parquet file.

    See the tests in chunking_indexing.ipynb for more details.
    """
    encoding = tiktoken.encoding_for_model(encoder)
    splitter = SentenceSplitter(chunk_size=chunk_size,
                                tokenizer=encoding.encode,
                                chunk_overlap=chunk_overlap)

    # create the splits for every document
    contents_splits = {}
    for fname, content in doc_content.items():
        splits = [splitter.split_text(page) for page in content]
        contents_splits[fname] = [split for sublist in splits for split in sublist]

    # embed every chunk with the sentence-transformer model
    model = SentenceTransformer(model_name, device=device)
    content_emb = {}
    for fname, splits in contents_splits.items():
        content_emb[fname] = [(split, model.encode(split)) for split in splits]

    # keep fname since it carries information and could be used as a property in Weaviate
    text_vector_tuples = [(fname, split, emb.tolist())
                          for fname, splits_emb in content_emb.items()
                          for split, emb in splits_emb]
    new_df = pd.DataFrame(
        text_vector_tuples,
        columns=['file', 'content', 'content_embedding']
    )

    # load the existing parquet file if it exists and update it
    if os.path.exists(parquet_file):
        new_df = pd.concat([pd.read_parquet(parquet_file), new_df])

    # no optimization here (zipping etc.) since the data is small
    new_df.to_parquet(parquet_file, index=False)
    return
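

# Minimal usage sketch: the file name and page strings below are hypothetical;
# doc_content maps each document's file name to its list of page texts, which
# is the shape chunk_vectorize expects. Running this downloads the embedding
# model and appends rows to the parquet file configured in settings.
if __name__ == '__main__':
    sample_docs = {
        'impact_theory_ep1.txt': ['First page of text...', 'Second page of text...'],
    }
    chunk_vectorize(sample_docs, chunk_size=256, chunk_overlap=20)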

# TODO: adapt this draft test to the splitting step above
# import unittest
# from unitesting_utils import load_impact_theory_data
#
# class TestSplitContents(unittest.TestCase):
#     '''
#     Unit test to ensure proper functionality of split_contents function
#     '''
#     def test_split_contents(self):
#         import tiktoken
#         from llama_index.legacy.text_splitter import SentenceSplitter
#
#         data = load_impact_theory_data()
#         subset = data[:3]
#         chunk_size = 256
#         chunk_overlap = 0
#         encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
#         gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=chunk_overlap)
#         results = split_contents(subset, gpt35_txt_splitter)
#         self.assertEqual(len(results), 3)
#         self.assertEqual(len(results[0]), 83)
#         self.assertEqual(len(results[1]), 178)
#         self.assertEqual(len(results[2]), 144)
#         self.assertTrue(isinstance(results, list))
#         self.assertTrue(isinstance(results[0], list))
#         self.assertTrue(isinstance(results[0][0], str))
#
# unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestSplitContents))
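
# Sketch (assumes pandas + numpy are available) of how the stored vectors can
# be loaded back for indexing, e.g. before pushing them to Weaviate; the column
# names match those written by chunk_vectorize above:
#
# import numpy as np
# df = pd.read_parquet(parquet_file)
# vectors = np.vstack(df['content_embedding'].to_numpy())
# print(vectors.shape)  # (num_chunks, 768) for 'all-mpnet-base-v2'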