import os

from typing import List, Tuple

from .loaders.file import extractor
from .chunk_embed import chunk_vectorize
from ..settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
from .post_process import IsPii, IsToxic, IsRelevant
from .summary import summarize_it

multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
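# Note: all-mpnet-base-v2 is a general-purpose sentence-transformers model that
# produces 768-dimensional embeddings. Assuming VectorStore accepts any
# sentence-transformers checkpoint, swapping models only means changing model_path.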


def empty_collection():
    """Delete the MultiRAG collection if it exists."""
    status = multirag_vectorstore.empty_collection()
    return status


def index_data():
    """Index the staged parquet file into the vector store, then remove it."""
    if not os.path.exists(parquet_file):
        logger(f"Parquet file {parquet_file} does not exist")
        return 'no data to index'

    multirag_vectorstore.index_data()
    # The parquet file is only a staging artifact; drop it once indexed.
    os.remove(parquet_file)

    return "Index creation successful"


def process_pdf(filepath: str) -> dict:
    """Extract text from a PDF and chunk/vectorize it."""
    new_content = extractor('PyPDFLoader', filepath).extract_text()
    logger(f"Successfully extracted text from PDF {filepath}")

    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")
    return new_content


def process_txt(filepath: str) -> dict:
    """Extract text from a TXT file and chunk/vectorize it."""
    new_content = extractor('txt', filepath).extract_text()
    logger(f"Successfully extracted text from TXT {filepath}")

    chunk_vectorize(new_content)
    logger(f"Successfully vectorized TXT content of {filepath}")
    return new_content
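
# Note: chunk_vectorize() appears to stage chunks and embeddings in parquet_file
# (see settings), which index_data() then loads into the vector store; this is
# inferred from how index_data() consumes parquet_file, not a documented contract.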


def vector_search_raw(question: str) -> List[Tuple[str, str, float]]:
    """Plain hybrid search: returns (filename, text, score) tuples, no post-processing."""
    # alpha presumably balances dense-vector similarity against keyword
    # matching in the hybrid score; 0.8 leans toward the vector side.
    ans = multirag_vectorstore.hybrid_search(query=question,
                                             limit=10,
                                             alpha=0.8)
    return ans


def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """Hybrid search wrapped with toxicity, PII, and relevance checks plus summaries."""
    # Refuse toxic queries before touching the vector store.
    if IsToxic(question):
        return [f'"{question}" is toxic, try again']

    results = multirag_vectorstore.hybrid_search(query=question,
                                                 limit=10,
                                                 alpha=0.8)
    # Guard against an empty result set before calling max().
    if not results:
        return [f"no results found for: {question}"]

    max_score = max(score for _, _, score in results)

    # If even the best hit falls below the threshold, the whole query is off-topic.
    if max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]

    for i, (fname, text, score) in enumerate(results, 1):
        # Skip individual hits below the relevance threshold.
        if score < relevance_thr:
            continue

        # Flag, but still return, passages containing PII.
        if IsPii(text):
            text = "PII detected - " + text

        relevant = 'RELEVANT' if IsRelevant(text, question) else 'IRRELEVANT'

        summary = summarize_it(question, [text])
        text = f"{text}\n SUMMARY: {summary}"

        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {text}")

    return answers
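

if __name__ == '__main__':
    # Minimal end-to-end sketch: ingest a document, build the index, then query.
    # The file path and question below are hypothetical examples, not part of
    # the module; adjust them before running.
    process_pdf('docs/sample.pdf')
    print(index_data())
    for line in vector_search('What does the sample document cover?'):
        print(line)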