|
import os, pickle |
|
from typing import List |
|
from engine.loaders.file import pdf_extractor |
|
from engine.chunk_embed import chunk_vectorize |
|
from settings import parquet_file |
|
from .logger import logger |
|
from .vectorstore import VectorStore |
|
|
|
|
|
|
|
# Single module-level store instance, created at import time and used by the
# collection, indexing and search helpers in this module.
finrag_vectorstore = VectorStore(
    model_path='sentence-transformers/all-mpnet-base-v2',
)
|
|
|
|
|
def empty_collection():
    """Empty the Finrag collection through the module-level vector store.

    Returns:
        The status value handed back by
        ``finrag_vectorstore.empty_collection()``, passed through untouched.
    """
    return finrag_vectorstore.empty_collection()
|
|
|
|
|
def index_data():
    """Trigger indexing in the vector store and delete the parquet file.

    The store's ``index_data()`` is called without arguments; presumably it
    reads ``parquet_file`` itself -- confirm against ``VectorStore``.

    Returns:
        ``"Index creation successful"`` after the store has indexed and the
        parquet file has been removed, or ``'no data to index'`` when the
        parquet file is not present on disk.
    """
    if os.path.exists(parquet_file):
        finrag_vectorstore.index_data()
        # Removal happens only after indexing returns, so an exception raised
        # by the store leaves the file in place.
        os.remove(parquet_file)
        return "Index creation successful"

    logger.info(f"Parquet file {parquet_file} does not exists")
    return 'no data to index'
|
|
|
|
|
def process_pdf(filepath: str) -> dict:
    """Extract the text of a PDF and hand it to the chunk/vectorize step.

    Args:
        filepath: Location of the PDF, forwarded to ``pdf_extractor``.

    Returns:
        The object produced by the extractor's ``extract_text()``, i.e. the
        same object that was passed to ``chunk_vectorize``.
        NOTE(review): annotated as ``dict``; confirm the real type against
        ``pdf_extractor``.
    """
    extractor = pdf_extractor('PyPDFLoader', filepath)
    new_content = extractor.extract_text()
    logger.info("Successfully extracted text from PDF")

    chunk_vectorize(new_content)
    logger.info("Successfully vectorized PDF content")

    return new_content
|
|
|
def vector_search(question: str, *, limit: int = 3, alpha: float = 0.8) -> List[str]:
    """Run a hybrid search against the module-level vector store.

    Args:
        question: Query text, forwarded to the store as ``query``.
        limit: Forwarded as the store's ``limit`` argument. Defaults to 3,
            the value that used to be hard-coded here (presumably the maximum
            number of hits -- confirm against ``VectorStore.hybrid_search``).
        alpha: Forwarded as the store's ``alpha`` argument. Defaults to 0.8,
            the value that used to be hard-coded here.

    Returns:
        Whatever ``finrag_vectorstore.hybrid_search`` returns (annotated as a
        list of strings).
    """
    # Keyword-only so existing positional callers (question only) are
    # unaffected and the tuning knobs are always named at the call site.
    return finrag_vectorstore.hybrid_search(
        query=question,
        limit=limit,
        alpha=alpha,
    )
|
|