JPBianchi committed
Commit ae92cb7
1 Parent(s): 94263d8

endpoint only, no UI
Dockerfile CHANGED
@@ -24,4 +24,4 @@ ENV TRANSFORMERS_CACHE=/usr/local/lib/python3.10/site-packages/llama_index/legac
  # ^ not elegant but it works
  # HF warning says that TRANSFORMERS_CACHE will be deprecated in transformers v5, and advise to use HF_HOME

- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["uvicorn", "main_reflex:app", "--host", "0.0.0.0", "--port", "7860"]
app/engine/chunk_embed.py CHANGED
@@ -4,7 +4,7 @@ import os
  import pandas as pd
  import torch

- from settings import parquet_file
+ from app.settings import parquet_file

  import tiktoken # tokenizer library for use with OpenAI LLMs
  from llama_index.legacy.text_splitter import SentenceSplitter
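Note: chunk_vectorize is the consumer of this re-pointed import; a minimal sketch of the expected input shape, based on what the extractors in processing.py pass in (file name, text, and the parquet side effect are assumptions, not part of the commit):

    from app.engine.chunk_embed import chunk_vectorize

    # extractors return {filename: [list of text blocks, one per page]}
    new_content = {'report.pdf': ['text of page one', 'text of page two']}
    chunk_vectorize(new_content)  # presumably chunks/embeds the text and stores the vectors in parquet_file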
app/engine/loaders/file.py CHANGED
@@ -3,13 +3,15 @@ import os
  # from langchain.document_loaders import PyPDFLoader # deprecated
  from langchain_community.document_loaders import PyPDFLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders.csv_loader import CSVLoader
+ # ^ if we want to add CSV support, it will transform every row into a k:v pair
  from llama_parse import LlamaParse

  from typing import Union, List, Dict

  from abc import ABC, abstractmethod

- class PDFExtractor(ABC):
+ class Extractor(ABC):

      def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
          """ We can provide a list of files or a single file """
@@ -40,7 +42,7 @@ class PDFExtractor(ABC):
          """
          pass

- class _PyPDFLoader(PDFExtractor):
+ class _PyPDFLoader(Extractor):

      def extract_text(self):
          output_dict = {}
@@ -58,7 +60,7 @@ class _PyPDFLoader(PDFExtractor):
          return


- class _LlamaParse(PDFExtractor):
+ class _LlamaParse(Extractor):

      def extract_text(self):
          # https://github.com/run-llama/llama_parse
@@ -88,18 +90,59 @@ class _LlamaParse(PDFExtractor):
          raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
          return

- def pdf_extractor(extractor_type: str, *args, **kwargs) -> PDFExtractor:
-     """ Factory function to return the appropriate PDF extractor instance, properly initialized """
+ class _TXTLoader(Extractor):
+
+     def extract_text(self):
+         output_dict = {}
+         for fpath in self.filelist:
+             fname = fpath.split('/')[-1]
+             output_dict[fname] = [open(fpath, 'r').read()]
+             # with pdfs, we use a list of strings, one for each page
+             # so we must return a list here, even if it's just one string with everything
+         return output_dict
+
+     def extract_images(self):
+         raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")
+         return
+
+     def extract_tables(self):
+         raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")
+         return
+
+ class _CSVLoader(Extractor):
+     # mock code for now, as a reminder of what we could do if time allows TODO
+     def extract_text(self):
+         output_dict = {}
+         for fpath in self.filelist:
+             fname = fpath.split('/')[-1]
+             output_dict[fname] = [CSVLoader(fpath).load()]  # << untested!
+
+         return output_dict
+
+     def extract_images(self):
+         raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")
+         return
+
+     def extract_tables(self):
+         raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")
+         return
+
+ def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
+     """ Factory function to return the appropriate extractor instance, properly initialized """

      if extractor_type == 'PyPDFLoader':
          return _PyPDFLoader(*args, **kwargs)

      elif extractor_type == 'LlamaParse':
          return _LlamaParse(*args, **kwargs)
+
+     elif extractor_type == 'txt':
+         return _TXTLoader(*args, **kwargs)
+
      else:
          raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")



-
+ #/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug
+ #/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev
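Note: a minimal sketch of the renamed factory in use, mirroring the calls made in processing.py (the file paths are illustrative, not from the commit):

    from app.engine.loaders.file import extractor

    pdf_content = extractor('PyPDFLoader', 'data/report.pdf').extract_text()
    txt_content = extractor('txt', 'data/notes.txt').extract_text()
    # both return {filename: [text blocks]}, ready to be passed to chunk_vectorize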
app/engine/logger.py CHANGED
@@ -1,10 +1,16 @@
  import os, logging
+ import reflex as rx
+ logger = logging.getLogger("uvicorn").info

- environment = os.getenv("ENVIRONMENT", "dev")
- if environment == "dev":
-     logger = logging.getLogger("uvicorn")
- else:
-     logger = lambda x: _
-     # we should log also in production TODO
-     # check how it works on HuggingFace, if possible
-     # because we don't have access to the container's file system
+
+ # logger = lambda x: rx.console_log(x)
+ # let's use reflex's logger, but doesn't show in the console??
+
+ # environment = os.getenv("ENVIRONMENT", "dev")
+ # if environment == "dev":
+ #     logger = logging.getLogger("uvicorn").info
+ # else:
+ #     logger = lambda x: print(x)
+ # # we should log also in production TODO
+ # # check how it works on HuggingFace, if possible
+ # # because we don't have access to the container's file system unless in pro mode
app/engine/post_process.py ADDED
@@ -0,0 +1,74 @@
+ import reflex as rx
+ import json
+ import requests
+ from typing import Optional, List
+ from pydantic import BaseModel, Field
+ # from rerank import ReRanker
+
+ # https://hub.guardrailsai.com/validator/guardrails/toxic_language
+ from guardrails.hub import ToxicLanguage
+ from guardrails import Guard
+
+ # guardrails hub install hub://guardrails/detect_pii
+ from guardrails.hub import DetectPII
+
+ # https://hub.guardrailsai.com/validator/guardrails/qa_relevance_llm_eval
+ from guardrails.hub import QARelevanceLLMEval
+
+ import logging
+ logger = logging.getLogger("uvicorn").info
+
+ from .summary import summarize_it
+
+
+ def IsPii(answer: str) -> bool:
+     guard = Guard().use(DetectPII,
+                         ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+                         "exception",
+                         )
+     try:
+         guard.validate(answer)
+         return False  # validation passed, so no PII was detected
+
+     except Exception as e:
+         print(e)
+         return True   # the validator raised, so PII was detected
+
+ def IsToxic(query: str, threshold=0.5) -> bool:
+
+     # https://hub.guardrailsai.com/validator/guardrails/toxic_language
+     # Use the Guard with the validator
+     guard = Guard().use(
+         ToxicLanguage,
+         threshold=threshold,  # high for highly toxic only
+         validation_method="sentence",
+         on_fail="exception"
+     )
+
+     try:
+         guard.validate(query)
+         return False
+
+     except Exception as e:
+         print(e)  # will output the toxic question
+         return True
+
+ def IsRelevant(answer: str, query: str, model: str="gpt-3.5-turbo") -> bool:
+
+     guard = Guard().use(
+         QARelevanceLLMEval,
+         llm_callable=model,
+         on_fail="exception",
+     )
+
+     try:
+         guard.validate(
+             answer,
+             metadata={"original_prompt": query},
+         )
+         return True
+     except Exception as e:
+         print(e)
+         return False
+
+
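Note: a minimal sketch of how these guards are meant to be called from the search pipeline (the input strings are illustrative); with the return values above, IsToxic and IsPii are True when the validator trips:

    from app.engine.post_process import IsToxic, IsPii, IsRelevant

    if IsToxic("some user question"):
        print("question rejected")
    if IsPii("you can reach me at 555-0100"):
        print("PII detected in the answer")
    if IsRelevant("some retrieved passage", "some user question"):
        print("passage judged relevant by the LLM validator")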
app/engine/processing.py CHANGED
@@ -1,48 +1,144 @@
  import os, pickle
  from typing import List
- from engine.loaders.file import pdf_extractor
- from engine.chunk_embed import chunk_vectorize
- from settings import parquet_file
+ from .loaders.file import extractor
+ from .chunk_embed import chunk_vectorize
+ from ..settings import parquet_file
  from .logger import logger
  from .vectorstore import VectorStore
- # I allow relative imports inside the engine package
- # I could have created a module but things are still changing
+ from .post_process import IsPii, IsToxic, IsRelevant
+ from .summary import summarize_it
+
+ multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')

- finrag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
-

  def empty_collection():
-     """ Deletes the Finrag collection if it exists """
-     status = finrag_vectorstore.empty_collection()
+     """ Deletes the MultiRAG collection if it exists """
+     status = multirag_vectorstore.empty_collection()
      return status


  def index_data():

      if not os.path.exists(parquet_file):
-         logger.info(f"Parquet file {parquet_file} does not exists")
+         logger(f"Parquet file {parquet_file} does not exist")
          return 'no data to index'

      # load the parquet file into the vectorstore
-     finrag_vectorstore.index_data()
+     multirag_vectorstore.index_data()
      os.remove(parquet_file)
      # delete the files so we can load several files and index them when we want
      # without having to keep track of those that have been indexed already
      # this is a simple solution for now, but we can do better

      return "Index creation successful"


- def process_pdf(filepath:str) -> dict:
+ def process_pdf(filepath: str) -> dict:

-     new_content = pdf_extractor('PyPDFLoader', filepath).extract_text()
-     logger.info(f"Successfully extracted text from PDF")
+     new_content = extractor('PyPDFLoader', filepath).extract_text()
+     logger(f"Successfully extracted text from PDF")

      chunk_vectorize(new_content)
-     logger.info(f"Successfully vectorized PDF content")
+     logger(f"Successfully vectorized PDF content of {filepath}")
      return new_content

- def vector_search(question:str) -> List[str]:
-
-     ans = finrag_vectorstore.hybrid_search(query=question, limit=3, alpha=0.8)
-     return ans
+
+ def process_txt(filepath: str) -> dict:
+
+     new_content = extractor('txt', filepath).extract_text()
+     logger(f"Successfully extracted text from TXT")
+
+     chunk_vectorize(new_content)
+     logger(f"Successfully vectorized TXT content")
+     return new_content
+
+
+ def vector_search_raw(question: str) -> List[str]:
+     """ Just vector search """
+
+     ans = multirag_vectorstore.hybrid_search(query=question,
+                                              limit=10,
+                                              alpha=0.8)
+     return ans
+
+ def vector_search(question: str, relevance_thr=0.3) -> List[str]:
+     """ Search + pre/post processing """
+
+     ## PRE PROCESSING
+     if IsToxic(question):
+         ans = [f"\"{question}\" is toxic, try again"]
+         return ans
+
+     ans = multirag_vectorstore.hybrid_search(query=question,
+                                              limit=10,
+                                              alpha=0.8)
+
+     max_score = max([score for _, _, score in ans])
+     # if no answer has a score high enough, we consider the question irrelevant
+     # we could do better with reranking but here the question is trivial, y/n
+     # it's not like reranking 100 answers to pick the best 5 for RAGing
+     if max_score < relevance_thr:
+         return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]
+     else:
+         answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]
+
+     # let's first quickly print the answers, without summary
+     for i, (fname, ans, score) in enumerate(ans, 1):
+
+         if score < relevance_thr:
+             continue
+
+         if IsPii(ans):
+             ans = " Pii detected -" + ans
+
+         # removed, not accurate
+         if IsRelevant(ans, question):
+             relevant = 'RELEVANT'
+         else:
+             # irrelevant answer
+             relevant = 'IRRELEVANT'
+
+         summary = summarize_it(question, [ans])
+         ans = f"{ans}\n SUMMARY: {summary}"
+
+         answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {ans}")
+
+     # msg = f"Answers to '{self.question}' with summaries"
+     # self.chats[self.current_chat] = [qa1]
+
+     # for i, (fname, ans, score) in enumerate(self.answer['answer'], 1):
+
+     #     if score < relevance_thr:
+     #         continue
+
+     #     msg = ""
+     #     summary = summarize_it(self.question, [ans])
+
+     #     # if IsPii(ans):
+     #     #     qa.answer += " Pii detected -"
+
+     #     # removed, not accurate
+     #     # if IsRelevant(ans, self.question):
+     #     #     relevant = 'RELEVANT'
+     #     # else:
+     #     #     # irrelevant answer
+     #     #     relevant = 'IRRELEVANT'
+     #     # qa.answer += f" {relevant} ANSWER - {ans} \n SUMMARY: {summary}"
+
+     #     qa = QA(question=msg,
+     #             answer=f"{i}: from {fname} - score:{score:.2f} - {ans} - SUMMARY: {summary}"
+     #             )
+
+     #     # paths are from /assets, so data is assets/data
+     #     search = ans[:30].replace(" ", "%20")  # let's search only first 30 chars
+     #     qa.link = f'data/{fname}#:~:text={search}'
+     #     qa.msg = " - Verify in the document"
+     #     logger(f"Summary: {summary}")
+
+     #     # it's slower now because of the summaries
+     #     self.chats[self.current_chat].append(qa)
+     #     yield
+
+     # msg = ""
+
+     return answers
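Note: a minimal sketch of the two entry points (the question is illustrative). One caveat: summarize_it is an async coroutine, so the synchronous call inside vector_search above would need to be awaited (or run via asyncio) before the summary is an actual string:

    from app.engine.processing import vector_search, vector_search_raw

    hits = vector_search_raw("what is Amazon loss")   # raw (file, content, score) tuples from hybrid search
    report = vector_search("what is Amazon loss")     # toxicity/PII/relevance-checked, annotated strings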
app/engine/summary.py ADDED
@@ -0,0 +1,53 @@
+
+ from typing import List
+
+ from app.rag.llm import LLM
+ # the LLM class uses the OPENAI_API_KEY env var as the default api_key
+
+
+ async def summarize_it(question: str,
+                        search_results: List[str],
+                        model: str = 'gpt-3.5-turbo-0125',
+                        ) -> str:
+
+     # TODO turn this into a class if time allows
+     llm = LLM(model)
+
+     system_message = """
+     You are able to quickly understand a few paragraphs, or quips even, generated by a vector search system
+     and generate a one-line summary.
+     """
+
+     searches = "\n".join([f"Search result {i}: {v}" for i, v in enumerate(search_results, 1)])
+
+     user_prompt = f"""
+     Use the below context enclosed in triple back ticks to answer the question. \n
+     The context is given by a vector search into a vector database made from the company's documents,
+     so you can assume the context is accurate. \n
+     ```
+     Context:
+     ```
+     {searches}
+     ```
+     Question:\n
+     {question}\n
+     ------------------------
+     1. If the context is not relevant to the question, simply say 'Irrelevant content' and nothing else.
+     Pay great attention to making sure your answer is relevant to the question and the context.
+     (for instance, never answer a question about a topic that is not explicitly mentioned in the question)
+     2. Using any external knowledge or resources to answer the question is forbidden.
+     3. Generate a ONE-LINE summary within the limits of the context and the question.
+     4. Avoid mentioning 'search results' in the answer.
+     Instead, incorporate the information from the search results into the answer.
+     5. Create a clean answer, without backticks, or starting with a new line for instance.
+     ------------------------
+     Answer:\n
+     """
+
+     response = await llm.chat_completion(system_message=system_message,
+                                          user_message=user_prompt,
+                                          temperature=0.01,  # let's not allow the model to be creative
+                                          stream=False,
+                                          raw_response=False)
+     return response
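Note: since summarize_it is declared async, a synchronous caller has to run it in an event loop; a minimal sketch (the question and search result are placeholders):

    import asyncio
    from app.engine.summary import summarize_it

    summary = asyncio.run(summarize_it("what is Amazon loss",
                                       ["<text of a retrieved chunk>"]))
    print(summary)  # expected to be a one-line summary, per the prompt above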
app/engine/vectorstore.py CHANGED
@@ -1,19 +1,109 @@
  import os, logging
+ from app.engine.logger import logger
+
  from typing import List, Any
  import pandas as pd
  from weaviate.classes.config import Property, DataType

  from .weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
- from .logger import logger

- from settings import parquet_file
+ from ..settings import parquet_file
+ from weaviate.classes.query import Filter
+ from torch import cuda
+
+ if os.path.exists('.we_are_local'):
+     COLLECTION = 'MultiRAG_local_mr'
+ else:
+     COLLECTION = 'MultiRAG'
+
+ class dummyWeaviate:
+     """ Created to pass on HF since I had again the client creation issue
+         Temporary solution
+     """
+     def __init__(self,
+                  endpoint: str=None,
+                  api_key: str=None,
+                  model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2',
+                  embedded: bool=False,
+                  openai_api_key: str=None,
+                  skip_init_checks: bool=False,
+                  **kwargs
+                  ):
+         return
+
+     def _connect(self) -> None:
+         return
+
+     def _client(self):
+         return
+
+     def create_collection(self,
+                           collection_name: str,
+                           properties: list[Property],
+                           description: str=None,
+                           **kwargs
+                           ) -> None:
+         return
+
+     def show_all_collections(self,
+                              detailed: bool=False,
+                              max_details: bool=False
+                              ) -> list[str] | dict:
+         return ['abc', 'def']
+
+     def show_collection_config(self, collection_name: str):
+         return
+
+     def show_collection_properties(self, collection_name: str):
+         return
+
+     def delete_collection(self, collection_name: str):
+         return
+
+     def get_doc_count(self, collection_name: str):
+         return
+
+     def keyword_search(self,
+                        request: str,
+                        collection_name: str,
+                        query_properties: list[str]=['content'],
+                        limit: int=10,
+                        filter: Filter=None,
+                        return_properties: list[str]=None,
+                        return_raw: bool=False
+                        ):
+         return
+
+     def vector_search(self,
+                       request: str,
+                       collection_name: str,
+                       limit: int=10,
+                       return_properties: list[str]=None,
+                       filter: Filter=None,
+                       return_raw: bool=False,
+                       device: str='cuda:0' if cuda.is_available() else 'cpu'
+                       ):
+         return
+
+     def hybrid_search(self,
+                       request: str,
+                       collection_name: str,
+                       query_properties: list[str]=['content'],
+                       alpha: float=0.5,
+                       limit: int=10,
+                       filter: Filter=None,
+                       return_properties: list[str]=None,
+                       return_raw: bool=False,
+                       device: str='cuda:0' if cuda.is_available() else 'cpu'
+                       ):
+         return

  class VectorStore:
-     def __init__(self, model_path:str = 'sentence-transformers/all-mpnet-base-v2'):
+     def __init__(self, model_path: str = 'sentence-transformers/all-mpnet-base-v2'):
          # we can create several instances to test various models, especially if we finetune one

-         self.finrag_properties = [
-             Property(name='filename',
+         self.MultiRAG_properties = [
+             Property(name='file',
                       data_type=DataType.TEXT,
                       description='Name of the file',
                       index_filterable=True,
@@ -30,45 +120,54 @@ class VectorStore:
                       index_searchable=True),
          ]

-         self.class_name = "FinRag_all-mpnet-base-v2"
+         self.class_name = "MultiRAG_all-mpnet-base-v2"

          self.class_config = {'classes': [

              {"class": self.class_name,

-              "description": "Financial reports",
+              "description": "multiple types of docs",

               "vectorIndexType": "hnsw",

               # Vector index specific settings for HNSW
               "vectorIndexConfig": {

                   "ef": 64,               # higher is better quality vs slower search
                   "efConstruction": 128,  # higher = better index but slower build
                   "maxConnections": 32,   # max conn per layer - higher = more memory
               },

               "vectorizer": "none",

-              "properties": self.finrag_properties }
+              "properties": self.MultiRAG_properties}
              ]
          }

          self.model_path = model_path

          try:
              self.api_key = os.environ.get('FINRAG_WEAVIATE_API_KEY')
-             self.url = os.environ.get('FINRAG_WEAVIATE_ENDPOINT')
-             self.client = WeaviateWCS(endpoint=self.url,
-                                       api_key=self.api_key,
-                                       model_name_or_path=self.model_path)
+             logger(f"API key: {self.api_key[:5]}")
+             self.url = os.environ.get('FINRAG_WEAVIATE_ENDPOINT')
+             logger(f"URL: {self.url[8:15]}")
+             self.client = WeaviateWCS(
+                 endpoint=self.url,
+                 api_key=self.api_key,
+                 model_name_or_path=self.model_path,
+             )
+             assert self.client._client.is_live(), "Weaviate is not live"
+             assert self.client._client.is_ready(), "Weaviate is not ready"
+             logger(f"Weaviate client created")
          except Exception as e:
              # raise Exception(f"Could not create Weaviate client: {e}")
-             print(f"Could not create Weaviate client: {e}")
-
-         assert self.client._client.is_live(), "Weaviate is not live"
-         assert self.client._client.is_ready(), "Weaviate is not ready"
+             self.client = dummyWeaviate()  # used when issue with HF client creation, to continue on HF
+             logger(f"Could not create Weaviate client: {e}")
+
+         # if we fail these tests, 'VectorStore' object has no attribute 'client'
+         # it's prob not the env var but the model missing
+         # assert self.client._client.is_live(), "Weaviate is not live"
+         # assert self.client._client.is_ready(), "Weaviate is not ready"
          # careful with accessing '_client' since the weaviate helper usually closes the connection every time

          self.indexer = None
@@ -80,19 +179,21 @@ class VectorStore:

          return self.client.show_all_collections()

-     def create_collection(self, collection_name: str='Finrag', description: str='Financial reports'):
+     def create_collection(self,
+                           collection_name: str=COLLECTION,
+                           description: str='Documents'):

          self.collection_name = collection_name
          if collection_name not in self.collections:
              self.client.create_collection(collection_name=collection_name,
-                                           properties=self.finrag_properties,
+                                           properties=self.MultiRAG_properties,
                                            description=description)
-             self.collection_name = collection_name
+             # self.collection_name = collection_name
          else:
-             logging.warning(f"Collection {collection_name} already exists")
+             logger(f"Collection {collection_name} already exists")


-     def empty_collection(self, collection_name: str='Finrag') -> bool:
+     def empty_collection(self, collection_name: str=COLLECTION) -> bool:

          # not in the library yet, so I simply delete and recreate it
          if collection_name in self.collections:
@@ -100,11 +201,11 @@ class VectorStore:
              self.create_collection()
              return True
          else:
-             logging.warning(f"Collection {collection_name} doesn't exist")
+             logger(f"Collection {collection_name} doesn't exist")
              return False


-     def index_data(self, data: List[dict]= None, collection_name: str='Finrag'):
+     def index_data(self, data: List[dict]= None, collection_name: str=COLLECTION):

          if self.indexer is None:
              self.indexer = WeaviateIndexer(self.client)
@@ -127,25 +228,25 @@ class VectorStore:
      def keyword_search(self,
                         query: str,
                         limit: int=5,
-                        return_properties: List[str]=['filename', 'content'],
+                        return_properties: List[str]=['file', 'content'],
                         alpha=None  # dummy parameter to match the hybrid_search signature
                         ) -> List[str]:
          response = self.client.keyword_search(
              request=query,
              collection_name=self.collection_name,
-             query_properties=['content'],
+             query_properties=['file', 'content'],
              limit=limit,
              filter=None,
              return_properties=return_properties,
              return_raw=False)

-         return [res['content'] for res in response]
+         return [(res['file'], res['content'], res['score']) for res in response]


      def vector_search(self,
                        query: str,
                        limit: int=5,
-                       return_properties: List[str]=['filename', 'content'],
+                       return_properties: List[str]=['file', 'content'],
                        alpha=None  # dummy parameter to match the hybrid_search signature
                        ) -> List[str]:

@@ -157,24 +258,24 @@ class VectorStore:
              return_properties=return_properties,
              return_raw=False)

-         return [res['content'] for res in response]
+         return [(res['file'], res['content'], res['score']) for res in response]


      def hybrid_search(self,
                        query: str,
-                       limit: int=5,
+                       limit: int=10,
                        alpha=0.5,  # higher = more vector search
-                       return_properties: List[str]=['filename', 'content']
+                       return_properties: List[str]=['file', 'content']
                        ) -> List[str]:

          response = self.client.hybrid_search(
              request=query,
              collection_name=self.collection_name,
-             query_properties=['content'],
+             query_properties=['file', 'content'],
              alpha=alpha,
              limit=limit,
              filter=None,
              return_properties=return_properties,
              return_raw=False)

-         return [res['content'] for res in response]
+         return [(res['file'], res['content'], res['score']) for res in response]
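Note: a minimal sketch of driving the reworked store directly (the call order and query are illustrative; hybrid_search now returns (file, content, score) tuples):

    from app.engine.vectorstore import VectorStore

    vs = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
    vs.create_collection()      # defaults to the COLLECTION name chosen above
    vs.index_data()             # assumes text_vectors.parquet was produced by chunk_vectorize
    for fname, content, score in vs.hybrid_search("what is Amazon loss", limit=10, alpha=0.8):
        print(fname, f"{score:.2f}", content[:80])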
app/main_reflex.py ADDED
@@ -0,0 +1,246 @@
+ # this is the original main.py file, but without the call to fastapi
+ # since it is done by reflex's own fast api server
+
+ import os, random, logging, pickle, shutil
+ from dotenv import load_dotenv, find_dotenv
+ from typing import Optional
+ from pydantic import BaseModel, Field
+
+ from fastapi import FastAPI, HTTPException, File, UploadFile, status
+ from fastapi.responses import HTMLResponse
+ from fastapi.middleware.cors import CORSMiddleware
+
+ try:
+     load_dotenv(find_dotenv('env'))
+
+ except Exception as e:
+     pass
+
+ from .engine.processing import (  # << creates the collection already
+     process_pdf,
+     process_txt,
+     index_data,
+     empty_collection,
+     vector_search,
+     vector_search_raw
+ )
+ from .rag.rag import rag_it
+
+ from .engine.logger import logger
+
+ from .settings import datadir, datadir2
+
+ if not os.path.exists(datadir):
+     os.makedirs(datadir, exist_ok=True)
+
+ if not os.path.exists(datadir2):
+     os.makedirs(datadir2, exist_ok=True)
+
+ os.makedirs(datadir, exist_ok=True)
+
+ EXTENSIONS = ["pdf", "txt"]
+
+ app = FastAPI()
+
+ environment = os.getenv("ENVIRONMENT", "dev")  # created by dockerfile
+
+ # replaced by cors_allowed_origins=['*'] in rxconfig.py when using Reflex endpoint
+ # if environment == "dev":
+ #     logger("Running in development mode - allowing CORS for all origins")
+ #     app.add_middleware(
+ #         CORSMiddleware,
+ #         allow_origins=["*"],
+ #         allow_credentials=True,
+ #         allow_methods=["*"],
+ #         allow_headers=["*"],
+ #     )
+
+
+ # not used when using Reflex endpoint
+ @app.get("/", response_class=HTMLResponse)
+ def read_root():
+     logger("Title displayed on home page")
+     return """
+     <html>
+         <body>
+             <h1>Welcome to MultiRAG, a RAG system designed by JP Bianchi!</h1>
+         </body>
+     </html>
+     """
+
+ # already provided by Reflex
+ @app.get("/ping/")
+ def ping():
+     """ Testing """
+     logger("Someone is pinging the server")
+     return {"answer": str(int(random.random() * 100))}
+
+
+ @app.delete("/erase_data/")
+ def erase_data():
+     """ Erase all files in the data directory at the first level only,
+         (in case we would like to use it for something else)
+         but not the vector store or the parquet file.
+         We can do it since the embeddings are in the parquet file already.
+     """
+     if len(os.listdir(datadir)) == 0:
+         logger("No data to erase")
+         return {"message": "No data to erase"}
+
+     # if we try to rmtree datadir, it looks like /data can't be deleted on HF
+     for f in os.listdir(datadir):
+         if f == '.DS_Store' or f.split('.')[-1].lower() in EXTENSIONS:
+             print(f"Removing {f}")
+             os.remove(os.path.join(datadir, f))
+             # we don't remove the parquet file, create_index does that
+
+     logger("All data has been erased")
+     return {"message": "All data has been erased"}
+
+
+ @app.delete("/empty_collection/")
+ def delete_vectors():
+     """ Empty the collection in the vector store """
+     try:
+         status = empty_collection()
+         return {"message": f"Collection{'' if status else ' NOT'} erased!"}
+     except Exception as e:
+         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+ @app.get("/list_files/")
+ def list_files():
+     """ List all files in the data directory """
+     print("Listing files")
+     files = os.listdir(datadir)
+     logger(f"Files in data directory: {files}")
+     return {"files": files}
+
+
+ @app.post("/upload/")
+ # @limiter.limit("5/minute") see 'slowapi' for rate limiting
+ async def upload_file(file: UploadFile = File(...)):
+     """ Uploads a file in data directory, for later indexing """
+     try:
+         filepath = os.path.join(datadir, file.filename)
+         logger(f"Filename detected: {file.filename}")
+         if os.path.exists(filepath):
+             logger(f"File {file.filename} already exists: no processing done")
+             return {"message": f"File {file.filename} already exists: no processing done"}
+
+         else:
+             logger(f"Receiving file: {file.filename}")
+             contents = await file.read()
+             logger(f"File reception complete!")
+
+     except Exception as e:
+         logger(f"Error during file upload: {str(e)}")
+         return {"message": f"Error during file upload: {str(e)}"}
+
+     if file.filename.endswith('.pdf'):
+
+         # let's save the file in /data even if it's temp storage on HF
+         with open(filepath, 'wb') as f:
+             f.write(contents)
+
+         # save it also in assets/data because data can be cleared
+         filepath2 = os.path.join(datadir2, file.filename)
+         with open(filepath2, 'wb') as f:
+             f.write(contents)
+
+         try:
+             logger(f"Starting to process {file.filename}")
+             new_content = process_pdf(filepath)
+             success = {"message": f"Successfully uploaded {file.filename}"}
+             success.update(new_content)
+             return success
+
+         except Exception as e:
+             return {"message": f"Failed to extract text from PDF: {str(e)}"}
+
+     elif file.filename.endswith('.txt'):
+
+         with open(filepath, 'wb') as f:
+             f.write(contents)
+
+         filepath2 = os.path.join(datadir2, file.filename)
+         with open(filepath2, 'wb') as f:
+             f.write(contents)
+
+         try:
+             logger(f"Reading {file.filename}")
+             new_content = process_txt(filepath)
+             success = {"message": f"Successfully uploaded {file.filename}"}
+             success.update(new_content)
+             return success
+
+         except Exception as e:
+             return {"message": f"Failed to extract text from TXT: {str(e)}"}
+
+     else:
+         return {"message": "Only PDF & txt files are accepted"}
+
+
+ @app.post("/create_index/")
+ async def create_index():
+     """ Create an index for the uploaded files """
+
+     logger("Creating index for uploaded files")
+     try:
+         msg = index_data()
+         return {"message": msg}
+     except Exception as e:
+         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+ class Question(BaseModel):
+     question: str
+
+ @app.post("/ask/")
+ async def hybrid_search(question: Question):
+     logger(f"Processing question: {question.question}")
+     try:
+         search_results = vector_search(question.question)
+         logger(f"Answer: {search_results}")
+         return {"answer": search_results}
+     except Exception as e:
+         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+ @app.post("/ragit/")
+ async def ragit(question: Question):
+     logger(f"Processing question: {question.question}")
+     try:
+         search_results = vector_search_raw(question.question)
+         logger(f"Search results generated: {search_results}")
+
+         answer = rag_it(question.question, search_results)
+
+         logger(f"Answer: {answer}")
+         return {"answer": answer}
+     except Exception as e:
+         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+ if __name__ == '__main__':
+     import uvicorn
+     from os import getenv
+     port = int(getenv("PORT", 80))
+     print(f"Starting server on port {port}")
+     reload = True if environment == "dev" else False
+     uvicorn.run("main:app", host="0.0.0.0", port=port, reload=reload)
+
+
+ # Examples:
+ # curl -X POST "http://localhost:8001/upload" -F "file=@<your_file.pdf>"
+ # curl -X DELETE "http://localhost:8001/erase_data/"
+ # curl -X GET "http://localhost:8001/list_files/"
+
+ # hf space is at https://jpbianchi-multirag.hf.space/
+ # code given by https://jpbianchi-multirag.hf.space/docs
+ # Space must be public
+ # curl -X POST "https://jpbianchi-multirag.hf.space/upload/" -F "file=@<your_file.pdf>"
+
+ # curl -X POST http://localhost:80/ask/ -H "Content-Type: application/json" -d '{"question": "what is Amazon loss"}'
+ # curl -X POST http://localhost:80/ragit/ -H "Content-Type: application/json" -d '{"question": "Does ATT have postpaid phone customers?"}'
+ # see more in notebook upload_index.ipynb
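Note: the curl examples above translate directly to Python; a minimal sketch against a local run (base URL, port and file name are illustrative):

    import requests

    base = "http://localhost:8000"
    with open("report.pdf", "rb") as fh:
        print(requests.post(f"{base}/upload/", files={"file": fh}).json())
    print(requests.post(f"{base}/create_index/").json())
    print(requests.post(f"{base}/ask/", json={"question": "what is Amazon loss"}).json())
    print(requests.post(f"{base}/ragit/", json={"question": "Does ATT have postpaid phone customers?"}).json())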
app/notebooks/upload_index.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
app/rag/rag.py CHANGED
@@ -37,7 +37,7 @@ def rag_it(question: str,
      1. If the context does not provide enough information to answer the question, then
      state that you cannot answer the question with the provided context.
      Pay great attention to making sure your answer is relevant to the question
-     (for instance, never answer a question about a topic or company that are not explicitely mentioned in the context)
+     For instance, never answer a question about a topic or company that is not either explicitly mentioned in the context or implied by the context.
      2. Do not use any external knowledge or resources to answer the question.
      3. Answer the question directly and with as much detail as possible, within the limits of the context.
      4. Avoid mentioning 'search results' in the answer.
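Note: rag_it consumes this prompt; a minimal sketch of how the /ragit/ endpoint wires it up (the question is illustrative):

    from app.engine.processing import vector_search_raw
    from app.rag.rag import rag_it

    question = "Does ATT have postpaid phone customers?"
    search_results = vector_search_raw(question)
    answer = rag_it(question, search_results)  # answer grounded in the retrieved chunks
    print(answer)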
app/settings.py CHANGED
@@ -1,4 +1,6 @@
  import os

- datadir = '../data' # will be used in main.py
+ datadir = 'data'  # will be used in main.py
+ datadir2 = 'assets/data'  # backup since data can be emptied
+
  parquet_file = os.path.join(datadir, 'text_vectors.parquet')  # used by the files in 'engine'
assets/IO_logo.webp ADDED
assets/OI_logo.jpg ADDED
assets/amazon_forecast.jpg ADDED
assets/amazon_idiot.jpg ADDED
assets/favicon.ico ADDED
assets/homepage.jpg ADDED
assets/irrelevant_amazon.jpg ADDED