|
|
|
|
|
import os |
|
import pandas as pd |
|
import torch |
|
|
|
from app.settings import parquet_file |
|
|
|
import tiktoken |
|
from llama_index.legacy.text_splitter import SentenceSplitter |
|
from sentence_transformers import SentenceTransformer |
|
|
|
# Pick the compute device once at import time: prefer CUDA when present.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# On CUDA hosts, make newly created float tensors live on the GPU by default.
# NOTE(review): torch.set_default_tensor_type is deprecated in recent torch
# releases — confirm the pinned torch version before migrating.
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
|
|
|
|
|
def chunk_vectorize(doc_content: dict = None,
                    chunk_size: int = 256,
                    chunk_overlap: int = 20,
                    encoder: str = 'gpt-3.5-turbo-0613',
                    model_name: str = 'sentence-transformers/all-mpnet-base-v2') -> None:
    """Split documents into chunks, embed each chunk, and append to the parquet store.

    Args:
        doc_content: mapping of filename -> iterable of page strings.
            ``None`` or empty means nothing to do (the original crashed on
            the default with ``AttributeError``).
        chunk_size: maximum chunk size in tokens, as counted by ``encoder``.
        chunk_overlap: token overlap between consecutive chunks.
        encoder: tiktoken model name used only for token counting.
        model_name: sentence-transformers model used to embed each chunk.

    Side effects:
        Writes a parquet file at ``parquet_file`` with columns
        ``file``/``content``/``content_embedding``, appending to any
        existing rows.
    """
    # Guard: the default of None (and an empty dict) used to crash on
    # .items(); treat both as "nothing to vectorize".
    if not doc_content:
        return

    # Token-aware sentence splitter; tiktoken supplies the tokenizer so
    # chunk_size is measured in model tokens, not characters.
    encoding = tiktoken.encoding_for_model(encoder)
    splitter = SentenceSplitter(chunk_size=chunk_size,
                                tokenizer=encoding.encode,
                                chunk_overlap=chunk_overlap)

    # Split every page of every document, flattening pages into one
    # chunk list per file.
    contents_splits = {}
    for fname, content in doc_content.items():
        per_page = [splitter.split_text(page) for page in content]
        contents_splits[fname] = [chunk for page in per_page for chunk in page]

    model = SentenceTransformer(model_name)

    # Batch-encode all chunks of a file in one call instead of one model
    # forward pass per chunk; encode() returns one embedding row per input.
    content_emb = {}
    for fname, splits in contents_splits.items():
        embeddings = model.encode(splits) if splits else []
        content_emb[fname] = list(zip(splits, embeddings))

    # Flatten to (file, chunk_text, embedding-as-list) rows for pandas.
    text_vector_tuples = [
        (fname, split, emb.tolist())
        for fname, splits_emb in content_emb.items()
        for split, emb in splits_emb
    ]

    new_df = pd.DataFrame(
        text_vector_tuples,
        columns=['file', 'content', 'content_embedding']
    )

    # Append to an existing store rather than overwriting it.
    if os.path.exists(parquet_file):
        new_df = pd.concat([pd.read_parquet(parquet_file), new_df])

    new_df.to_parquet(parquet_file, index=False)
    return
|
|