"""Indexing and search entry points for the MultiRAG vectorstore."""

import os
import pickle

from typing import List

from .loaders.file import extractor
from .chunk_embed import chunk_vectorize
from ..settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
from .post_process import IsPii, IsToxic, IsRelevant
from .summary import summarize_it

multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')


def empty_collection():
    """ Deletes the MultiRAG collection if it exists """
    status = multirag_vectorstore.empty_collection()
    return status


def index_data():
    if not os.path.exists(parquet_file):
        logger(f"Parquet file {parquet_file} does not exist")
        return 'no data to index'
    # load the parquet file into the vectorstore
    multirag_vectorstore.index_data()
    # delete the file so we can load several files and index them whenever we want,
    # without having to keep track of those that have been indexed already.
    # This is a simple solution for now, but we can do better.
    os.remove(parquet_file)
    return "Index creation successful"


def process_pdf(filepath: str) -> dict:
    new_content = extractor('PyPDFLoader', filepath).extract_text()
    logger(f"Successfully extracted text from PDF {filepath}")
    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")
    return new_content


def process_txt(filepath: str) -> dict:
    new_content = extractor('txt', filepath).extract_text()
    logger(f"Successfully extracted text from TXT {filepath}")
    chunk_vectorize(new_content)
    logger(f"Successfully vectorized TXT content of {filepath}")
    return new_content


def vector_search_raw(question: str) -> List[str]:
    """ Plain vector search, no pre/post-processing """
    ans = multirag_vectorstore.hybrid_search(query=question, limit=10, alpha=0.8)
    return ans


def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """ Search with pre- and post-processing """
    ## PRE-PROCESSING: reject toxic questions outright
    if IsToxic(question):
        return [f"\"{question}\" is toxic, try again"]

    results = multirag_vectorstore.hybrid_search(query=question, limit=10, alpha=0.8)
    max_score = max(score for _, _, score in results)

    # if no answer has a score high enough, we consider the question irrelevant;
    # we could do better with reranking, but here the decision is a trivial yes/no,
    # it's not like reranking 100 answers to pick the best 5 for RAG
    if max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]

    ## POST-PROCESSING: flag PII and relevance, then summarize each answer above the threshold
    for i, (fname, ans, score) in enumerate(results, 1):
        if score < relevance_thr:
            continue
        if IsPii(ans):
            ans = " Pii detected -" + ans  # note: PII detection is not very accurate
        if IsRelevant(ans, question):
            relevant = 'RELEVANT'
        else:
            # irrelevant answer
            relevant = 'IRRELEVANT'
        summary = summarize_it(question, [ans])
        ans = f"{ans}\n SUMMARY: {summary}"
        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {ans}")

    return answers
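

# Illustrative usage sketch, not part of the original module: it only strings together the
# functions defined above. The PDF path and the question are placeholders, and because this
# module uses relative imports it would have to be run as a package module
# (e.g. `python -m <package>.<module>`) rather than as a standalone script.
if __name__ == '__main__':
    # ingest a document, index the resulting parquet file, then query the collection
    process_pdf('assets/data/sample.pdf')   # placeholder path
    print(index_data())
    for line in vector_search("What does the sample document cover?"):
        print(line)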