File size: 1,579 Bytes
10d6a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a6f805
10d6a86
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os, pickle
from typing import List
from engine.loaders.file import pdf_extractor
from engine.chunk_embed import chunk_vectorize
from settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
# I allow relative imports inside the engine package
# I could have created a module but things are still changing

# Module-level singleton: one shared VectorStore instance (all-mpnet-base-v2
# sentence-transformer) used by every helper in this module.
finrag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
    

def empty_collection():
    """Delete the Finrag collection if it exists; return the store's status."""
    return finrag_vectorstore.empty_collection()


def index_data():
    """Index the staged parquet file into the vector store.

    Returns:
        A short status string: 'no data to index' when no parquet file is
        staged, otherwise "Index creation successful".
    """
    if not os.path.exists(parquet_file):
        # Nothing staged — not an error, just report and bail out early.
        # (fix: "does not exists" -> "does not exist"; lazy %-args instead
        # of an f-string so formatting only happens if the record is emitted)
        logger.info("Parquet file %s does not exist", parquet_file)
        return 'no data to index'

    # Load the parquet file's contents into the vector store.
    finrag_vectorstore.index_data()
    # Delete the staged file so several files can be accumulated and indexed
    # on demand, without tracking which ones were already indexed.
    # Simple solution for now; could be improved.
    os.remove(parquet_file)

    return "Index creation successful"
    

def process_pdf(filepath: str) -> dict:
    """Extract text from a PDF and vectorize the content for later indexing.

    Args:
        filepath: Path to the PDF file to process.

    Returns:
        The extracted content, as produced by the PyPDFLoader extractor.
    """
    new_content = pdf_extractor('PyPDFLoader', filepath).extract_text()
    # fix: plain literals instead of placeholder-less f-strings (ruff F541)
    logger.info("Successfully extracted text from PDF")

    chunk_vectorize(new_content)
    logger.info("Successfully vectorized PDF content")
    return new_content

def vector_search(question: str) -> List[str]:
    """Hybrid-search the vector store for the question (top 3, alpha=0.8)."""
    results = finrag_vectorstore.hybrid_search(query=question, limit=3, alpha=0.8)
    return results