"""Indexing and search entry points for the MultiRAG vectorstore."""

import os
import pickle

from typing import List

from .loaders.file import extractor
from .chunk_embed import chunk_vectorize
from ..settings import parquet_file
from .logger import logger
from .vectorstore import VectorStore
from .post_process import IsPii, IsToxic, IsRelevant
from .summary import summarize_it

multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')


def empty_collection():
    """ Deletes the MultiRAG collection if it exists """
    status = multirag_vectorstore.empty_collection()
    return status


def index_data():
    if not os.path.exists(parquet_file):
        logger(f"Parquet file {parquet_file} does not exist")
        return 'no data to index'
    # load the parquet file into the vectorstore
    multirag_vectorstore.index_data()
    # delete the file so we can load several files and index them whenever we want,
    # without having to keep track of those that have been indexed already.
    # This is a simple solution for now, but we can do better.
    os.remove(parquet_file)
    return "Index creation successful"


def process_pdf(filepath: str) -> dict:
    new_content = extractor('PyPDFLoader', filepath).extract_text()
    logger(f"Successfully extracted text from PDF {filepath}")
    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")
    return new_content


def process_txt(filepath: str) -> dict:
    new_content = extractor('txt', filepath).extract_text()
    logger(f"Successfully extracted text from TXT {filepath}")
    chunk_vectorize(new_content)
    logger(f"Successfully vectorized TXT content of {filepath}")
    return new_content


def vector_search_raw(question: str) -> List[str]:
    """ Plain vector search, no pre/post-processing """
    ans = multirag_vectorstore.hybrid_search(query=question, limit=10, alpha=0.8)
    return ans


def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """ Search with pre- and post-processing """
    ## PRE-PROCESSING: reject toxic questions outright
    if IsToxic(question):
        return [f"\"{question}\" is toxic, try again"]

    results = multirag_vectorstore.hybrid_search(query=question, limit=10, alpha=0.8)
    max_score = max(score for _, _, score in results)

    # if no answer has a score high enough, we consider the question irrelevant;
    # we could do better with reranking, but here the decision is a trivial yes/no,
    # it's not like reranking 100 answers to pick the best 5 for RAG
    if max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]

    ## POST-PROCESSING: flag PII and relevance, then summarize each answer above the threshold
    for i, (fname, ans, score) in enumerate(results, 1):
        if score < relevance_thr:
            continue
        if IsPii(ans):
            ans = " Pii detected -" + ans  # note: PII detection is not very accurate
        if IsRelevant(ans, question):
            relevant = 'RELEVANT'
        else:
            # irrelevant answer
            relevant = 'IRRELEVANT'
        summary = summarize_it(question, [ans])
        ans = f"{ans}\n SUMMARY: {summary}"
        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {ans}")

    return answers
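

# Illustrative usage sketch, not part of the original module: it only strings together the
# functions defined above. The PDF path and the question are placeholders, and because this
# module uses relative imports it would have to be run as a package module
# (e.g. `python -m <package>.<module>`) rather than as a standalone script.
if __name__ == '__main__':
    # ingest a document, index the resulting parquet file, then query the collection
    process_pdf('assets/data/sample.pdf')   # placeholder path
    print(index_data())
    for line in vector_search("What does the sample document cover?"):
        print(line)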