File size: 1,579 Bytes
10d6a86 0a6f805 10d6a86 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import os, pickle
from typing import List
from engine.loaders.file import pdf_extractor
from engine.chunk_embed import chunk_vectorize
from settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
# I allow relative imports inside the engine package
# I could have created a module but things are still changing
# Module-level singleton vector store shared by all functions below;
# embeddings come from the all-mpnet-base-v2 sentence-transformer model.
finrag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
def empty_collection():
    """Delete the Finrag collection from the vector store, if it exists.

    Returns:
        The status value reported by ``VectorStore.empty_collection``.
    """
    return finrag_vectorstore.empty_collection()
def index_data():
    """Index the pending parquet file into the Finrag vector store.

    Returns:
        'no data to index' when no parquet file is present, otherwise
        "Index creation successful" after indexing and removing the file.
    """
    if not os.path.exists(parquet_file):
        # Fixed grammar ("does not exists" -> "does not exist") and switched
        # to lazy %-style logging args instead of an f-string.
        logger.info("Parquet file %s does not exist", parquet_file)
        return 'no data to index'
    # Load the parquet file into the vectorstore.
    finrag_vectorstore.index_data()
    # Delete the file so several files can be loaded and indexed on demand
    # without having to track which ones have been indexed already.
    # Simple solution for now; could be improved.
    os.remove(parquet_file)
    return "Index creation successful"
def process_pdf(filepath: str) -> dict:
    """Extract the text of a PDF and vectorize it into the store.

    Args:
        filepath: Path to the PDF file to process.

    Returns:
        The extracted content from ``pdf_extractor(...).extract_text()``.
        NOTE(review): annotated ``dict`` but extract_text suggests a string —
        confirm against the pdf_extractor implementation.
    """
    new_content = pdf_extractor('PyPDFLoader', filepath).extract_text()
    # Removed pointless f-prefixes: these strings have no interpolation.
    logger.info("Successfully extracted text from PDF")
    chunk_vectorize(new_content)
    logger.info("Successfully vectorized PDF content")
    return new_content
def vector_search(question: str) -> List[str]:
    """Run a hybrid (dense + sparse) search for *question*.

    Args:
        question: The user query to search for.

    Returns:
        The top-3 matches from ``VectorStore.hybrid_search`` (alpha=0.8).
    """
    return finrag_vectorstore.hybrid_search(query=question, limit=3, alpha=0.8)
|