import os

from typing import List, Tuple

from .loaders.file import extractor
from .chunk_embed import chunk_vectorize
from ..settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
from .post_process import IsPii, IsToxic, IsRelevant
from .summary import summarize_it

multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
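# Note: all-mpnet-base-v2 is a general-purpose sentence-transformers model that
# produces 768-dimensional embeddings. Assuming VectorStore accepts any
# sentence-transformers checkpoint, swapping models only means changing model_path.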


def empty_collection():
    """Delete the MultiRAG collection if it exists."""
    status = multirag_vectorstore.empty_collection()
    return status


def index_data():
    """Index the staged parquet file into the vector store, then remove it."""
    if not os.path.exists(parquet_file):
        logger(f"Parquet file {parquet_file} does not exist")
        return 'no data to index'

    multirag_vectorstore.index_data()
    # The parquet file is only a staging artifact; drop it once indexed.
    os.remove(parquet_file)

    return "Index creation successful"


def process_pdf(filepath: str) -> dict:
    """Extract text from a PDF and chunk/vectorize it."""
    new_content = extractor('PyPDFLoader', filepath).extract_text()
    logger(f"Successfully extracted text from PDF {filepath}")

    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")
    return new_content


def process_txt(filepath: str) -> dict:
    """Extract text from a TXT file and chunk/vectorize it."""
    new_content = extractor('txt', filepath).extract_text()
    logger(f"Successfully extracted text from TXT {filepath}")

    chunk_vectorize(new_content)
    logger(f"Successfully vectorized TXT content of {filepath}")
    return new_content
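
# Note: chunk_vectorize() appears to stage chunks and embeddings in parquet_file
# (see settings), which index_data() then loads into the vector store; this is
# inferred from how index_data() consumes parquet_file, not a documented contract.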


def vector_search_raw(question: str) -> List[Tuple[str, str, float]]:
    """Plain hybrid search: returns (filename, text, score) tuples, no post-processing."""
    # alpha presumably balances dense-vector similarity against keyword
    # matching in the hybrid score; 0.8 leans toward the vector side.
    ans = multirag_vectorstore.hybrid_search(query=question,
                                             limit=10,
                                             alpha=0.8)
    return ans


def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """Hybrid search wrapped with toxicity, PII, and relevance checks plus summaries."""
    # Refuse toxic queries before touching the vector store.
    if IsToxic(question):
        return [f'"{question}" is toxic, try again']

    results = multirag_vectorstore.hybrid_search(query=question,
                                                 limit=10,
                                                 alpha=0.8)
    # Guard against an empty result set before calling max().
    if not results:
        return [f"no results found for: {question}"]

    max_score = max(score for _, _, score in results)

    # If even the best hit falls below the threshold, the whole query is off-topic.
    if max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]

    for i, (fname, text, score) in enumerate(results, 1):
        # Skip individual hits below the relevance threshold.
        if score < relevance_thr:
            continue

        # Flag, but still return, passages containing PII.
        if IsPii(text):
            text = "PII detected - " + text

        relevant = 'RELEVANT' if IsRelevant(text, question) else 'IRRELEVANT'

        summary = summarize_it(question, [text])
        text = f"{text}\n SUMMARY: {summary}"

        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {text}")

    return answers
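

if __name__ == '__main__':
    # Minimal end-to-end sketch: ingest a document, build the index, then query.
    # The file path and question below are hypothetical examples, not part of
    # the module; adjust them before running.
    process_pdf('docs/sample.pdf')
    print(index_data())
    for line in vector_search('What does the sample document cover?'):
        print(line)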