# app/engine/processing.py
# Endpoint-only processing helpers (no UI).
import os, pickle
from typing import List
from .loaders.file import extractor
from .chunk_embed import chunk_vectorize
from ..settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
from .post_process import IsPii, IsToxic, IsRelevant
from .summary import summarize_it
# Module-wide vector store shared by every function below; embeddings come
# from the all-mpnet-base-v2 sentence-transformers model.
multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
def empty_collection():
    """Delete the MultiRAG collection from the vector store, if it exists.

    Returns:
        Whatever status value the underlying vector store reports.
    """
    return multirag_vectorstore.empty_collection()
def index_data():
    """Index the pending parquet file into the vector store, then remove it.

    Returns:
        A short status string: either 'no data to index' when no parquet
        file is present, or a success message after indexing.
    """
    if not os.path.exists(parquet_file):
        # fixed typo: "does not exists" -> "does not exist"
        logger(f"Parquet file {parquet_file} does not exist")
        return 'no data to index'
    # load the parquet file into the vectorstore
    multirag_vectorstore.index_data()
    # delete the file so we can load several files and index them when we want
    # without having to keep track of those that have been indexed already;
    # this is a simple solution for now, but we can do better
    os.remove(parquet_file)
    return "Index creation successful"
def process_pdf(filepath: str) -> dict:
    """Extract text from a PDF and chunk/vectorize it into the parquet store.

    Args:
        filepath: Path to the PDF file to ingest.

    Returns:
        The extracted content as produced by the extractor.
    """
    new_content = extractor('PyPDFLoader', filepath).extract_text()
    # plain string: the original f-string had no placeholders (F541)
    logger("Successfully extracted text from PDF")
    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")
    return new_content
def process_txt(filepath: str) -> dict:
    """Extract text from a plain-text file and chunk/vectorize it.

    Args:
        filepath: Path to the text file to ingest.

    Returns:
        The extracted content as produced by the extractor.
    """
    new_content = extractor('txt', filepath).extract_text()
    # plain strings: the original f-strings had no placeholders (F541)
    logger("Successfully extracted text from TXT")
    chunk_vectorize(new_content)
    logger("Successfully vectorized TXT content")
    return new_content
def vector_search_raw(question: str) -> List[str]:
    """Run a plain hybrid search with no pre- or post-processing applied."""
    return multirag_vectorstore.hybrid_search(query=question, limit=10, alpha=0.8)
def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """Hybrid search with toxicity pre-filtering and PII/relevance/summary post-processing.

    Args:
        question: The user's query.
        relevance_thr: Minimum hybrid-search score for a hit to be considered
            relevant (also gates individual answers in the result list).

    Returns:
        A list of display strings: either a single rejection message
        (toxic / irrelevant question) or a header line followed by one
        annotated line per relevant hit.
    """
    ## PRE PROCESSING
    if IsToxic(question):
        return [f"\"{question}\" is toxic, try again"]

    results = multirag_vectorstore.hybrid_search(query=question,
                                                 limit=10,
                                                 alpha=0.8)
    # guard: max() on an empty sequence raises ValueError; treat "no hits"
    # the same as an irrelevant question instead of crashing
    if not results:
        return [f"{question} is IRRELEVANT with max score: 0.00, try again"]

    # generator avoids building a throwaway list just to take the max
    max_score = max(score for _, _, score in results)
    # if no answer has a score high enough, we consider the question irrelevant
    # we could do better with reranking but here the question is trivial, y/n
    # it's not like reranking 100 answers to pick the best 5 for RAGing
    if max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]
    # renamed loop variable: the original rebound `ans` (the results list)
    # inside the loop that iterated it, which was confusing and fragile
    for i, (fname, answer, score) in enumerate(results, 1):
        if score < relevance_thr:
            continue
        if IsPii(answer):
            answer = " Pii detected -" + answer
        # removed, not accurate
        if IsRelevant(answer, question):
            relevant = 'RELEVANT'
        else:
            # irrelevant answer
            relevant = 'IRRELEVANT'
        summary = summarize_it(question, [answer])
        answer = f"{answer}\n SUMMARY: {summary}"
        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {answer}")
    return answers