File size: 4,838 Bytes
10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import os, pickle
from typing import List
from .loaders.file import extractor
from .chunk_embed import chunk_vectorize
from ..settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
from .post_process import IsPii, IsToxic, IsRelevant
from .summary import summarize_it
# Module-level singleton vector store shared by every function below.
# NOTE(review): model_path presumably selects the sentence-transformers
# embedding model used for indexing and search — confirm against VectorStore.
multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
def empty_collection():
    """Delete the MultiRAG collection if it exists; return the store's status."""
    return multirag_vectorstore.empty_collection()
def index_data():
    """Index the pending parquet file into the vector store.

    Returns a status string. On success the parquet file is deleted so
    that several files can be loaded and indexed on demand without
    tracking which ones were already indexed (simple solution for now).
    """
    if not os.path.exists(parquet_file):
        # Fixed grammar in the log message ("does not exists" -> "does not exist").
        logger(f"Parquet file {parquet_file} does not exist")
        return 'no data to index'
    # Load the parquet file contents into the vectorstore.
    multirag_vectorstore.index_data()
    # Delete the file so repeated uploads start fresh; we could do better
    # than this, but it avoids tracking already-indexed files.
    os.remove(parquet_file)
    return "Index creation successful"
def process_pdf(filepath: str) -> dict:
    """Extract text from a PDF file, then chunk and vectorize it.

    Returns the extracted content as produced by the PyPDFLoader extractor.
    """
    new_content = extractor('PyPDFLoader', filepath).extract_text()
    # No placeholder in this message, so a plain string (not an f-string) is correct.
    logger("Successfully extracted text from PDF")
    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")
    return new_content
def process_txt(filepath: str) -> dict:
    """Extract text from a plain-text file, then chunk and vectorize it.

    Returns the extracted content as produced by the txt extractor.
    """
    new_content = extractor('txt', filepath).extract_text()
    # These messages have no placeholders, so plain strings (not f-strings) are correct.
    logger("Successfully extracted text from TXT")
    chunk_vectorize(new_content)
    logger("Successfully vectorized TXT content")
    return new_content
def vector_search_raw(question: str) -> List[str]:
    """Plain hybrid search — no pre- or post-processing applied."""
    return multirag_vectorstore.hybrid_search(query=question,
                                              limit=10,
                                              alpha=0.8)
def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """Hybrid search with pre- and post-processing.

    Pre-processing: toxic questions are rejected outright.
    Post-processing: hits scoring below ``relevance_thr`` are dropped,
    PII is flagged, each hit is tagged RELEVANT/IRRELEVANT, and a
    per-hit summary is appended.

    Returns a list of formatted strings; the first entry reports the
    overall relevance verdict for the question.
    """
    ## PRE PROCESSING
    if IsToxic(question):
        return [f"\"{question}\" is toxic, try again"]

    # Keep the hit list and the per-hit text in separate names: the
    # original code reused `ans` for both, which only worked because
    # enumerate() captures the iterator before the rebinding.
    hits = multirag_vectorstore.hybrid_search(query=question,
                                              limit=10,
                                              alpha=0.8)
    # default=0.0 guards an empty result set, where max() would raise
    # ValueError; an empty set then falls through to the IRRELEVANT path.
    max_score = max((score for _, _, score in hits), default=0.0)
    # if no answer has a score high enough, we consider the question irrelevant
    # we could do better with reranking but here the question is trivial, y/n
    # it's not like reranking 100 answers to pick the best 5 for RAGing
    if max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]
    for i, (fname, ans, score) in enumerate(hits, 1):
        if score < relevance_thr:
            continue
        if IsPii(ans):
            ans = " Pii detected -" + ans
        # NOTE(review): per-answer relevance labeling was previously noted
        # as "not accurate"; it is kept for tagging only.
        relevant = 'RELEVANT' if IsRelevant(ans, question) else 'IRRELEVANT'
        summary = summarize_it(question, [ans])
        ans = f"{ans}\n SUMMARY: {summary}"
        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {ans}")
    return answers