Asaad Almutareb committed
Commit • 6f2843a
1 Parent(s): fb95c43

corrected S3 vectorstore
- None +0 -0
- app_gui.py +1 -1
- example.env +6 -3
- rag_app/{react_agent.py → agents/react_agent.py} +0 -0
- rag_app/{simple_qa_chain.py → chains/simple_qa_chain.py} +0 -0
- rag_app/{build_vector_store.py → knowledge_base/build_vector_store.py} +8 -6
- rag_app/{create_embedding.py → knowledge_base/create_embedding.py} +6 -2
- rag_app/{load_vector_stores.py → loading_data/load_S3_vector_stores.py} +5 -2
- rag_app/structured_tools/structured_tools.py +2 -2
- rag_app/{generate_summary.py → utils/generate_summary.py} +0 -0
- test_this.py +8 -5
None
ADDED
Binary file (12.3 kB)
app_gui.py
CHANGED
@@ -1,6 +1,6 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
-from rag_app.react_agent import agent_executor
+from rag_app.agents.react_agent import agent_executor
 # need to import the qa!
 
 # Function to add a new input to the chat history
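The only change here is the import path, following the move of react_agent.py into rag_app/agents/. For context, a minimal sketch of how such an agent_executor is typically wired into a Gradio chat UI; the respond handler and the ChatInterface wiring below are illustrative assumptions, not code from this commit:

import gradio as gr
from rag_app.agents.react_agent import agent_executor

def respond(message, history):
    # a LangChain AgentExecutor returns a dict whose "output" key holds the final answer
    result = agent_executor.invoke({"input": message})
    return result["output"]

gr.ChatInterface(respond).launch()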
example.env
CHANGED
@@ -5,8 +5,10 @@ GOOGLE_API_KEY=""
 
 # Vectorstore storage on S3 and locally
 S3_LOCATION="rad-rag-demos"
-
-
+#faiss-insurance-agent-mpnet-1500.zip
+FAISS_VS_NAME="vectorstores/faiss-insurance-agent-MiniLM-1500.zip"
+#chroma-insurance-agent-mpnet-1500.zip
+CHROMA_VS_NAME="vectorstore/chroma-insurance-agent-MiniLM-1500.zip"
 FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
 CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
 
@@ -18,7 +20,8 @@ CONVERSATION_COLLECTION_NAME="ConversationMemory"
 
 # llm and embedding models
 #EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
-EMBEDDING_MODEL="
+EMBEDDING_MODEL="sentence-transformers/distiluse-base-multilingual-cased-v2" #512 dims
+#EMBEDDING_MODEL="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" #384 dims
 LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
 LLM_MODEL_ARGS=
 
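The new FAISS_VS_NAME and CHROMA_VS_NAME keys name the zipped vector stores inside the S3 bucket, separately from the local extraction paths. A minimal sketch of reading these variables at runtime, assuming python-dotenv; the loading code itself is not part of this commit:

import os
from dotenv import load_dotenv

load_dotenv()  # copy .env entries into the process environment

S3_LOCATION = os.getenv("S3_LOCATION")            # S3 bucket name
FAISS_VS_NAME = os.getenv("FAISS_VS_NAME")        # zip key inside the bucket
FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")  # local extraction target
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")    # must match the model the index was built with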
rag_app/{react_agent.py → agents/react_agent.py}
RENAMED
File without changes
rag_app/{simple_qa_chain.py → chains/simple_qa_chain.py}
RENAMED
File without changes
rag_app/{build_vector_store.py → knowledge_base/build_vector_store.py}
RENAMED
@@ -1,15 +1,17 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import DirectoryLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+#from langchain_community.document_loaders import DirectoryLoader
+#from langchain_text_splitters import RecursiveCharacterTextSplitter
+#from langchain_community.embeddings.sentence_transformer import (
+#    SentenceTransformerEmbeddings,
+#)
 from langchain_community.retrievers import BM25Retriever
-from rag_app.create_embedding import create_embeddings
-from rag_app.generate_summary import generate_description, generate_keywords
+from rag_app.knowledge_base.create_embedding import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
 import time
 import os
-from dotenv import load_dotenv
+#from dotenv import load_dotenv
 
 def build_vector_store(
     docs: list,
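With chunking and embedding delegated to rag_app.knowledge_base.create_embedding, the unused loader, splitter, and embedding imports are commented out rather than deleted. A hedged sketch of the FAISS build path these imports support; the body of build_vector_store is not shown in this diff, so everything past the imports is an assumption:

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.documents import Document

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"
)
docs = [Document(page_content="Example insurance product description.")]
vectorstore = FAISS.from_documents(docs, embeddings)  # chunk-level documents in, index out
vectorstore.save_local("./vectorstore/faiss-insurance-agent-500")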
rag_app/{create_embedding.py → knowledge_base/create_embedding.py}
RENAMED
@@ -3,7 +3,10 @@
 #from langchain_community.document_loaders import ReadTheDocsLoader
 #from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
 import time
 from langchain_core.documents import Document
 
@@ -44,7 +47,8 @@ def create_embeddings(
     print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
-    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    #embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
     print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
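SentenceTransformerEmbeddings in langchain_community is, as far as I know, a thin alias of the HuggingFace sentence-transformers wrapper, so this swap keeps behavior while dropping the separate langchain-huggingface dependency. A small usage sketch with the model configured in example.env; the query string is illustrative:

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"  # 512 dims per example.env
)
vector = embeddings.embed_query("Which household insurance fits a small apartment?")
print(len(vector))  # expect 512 for this model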
rag_app/{load_vector_stores.py → loading_data/load_S3_vector_stores.py}
RENAMED
@@ -40,14 +40,14 @@ def get_faiss_vs():
     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
     try:
         # Download the pre-prepared vectorized index from the S3 bucket
-        print("Downloading the pre-prepared vectorized index from S3...")
+        print("Downloading the pre-prepared FAISS vectorized index from S3...")
         s3.download_file(S3_LOCATION, FAISS_VS_NAME, VS_DESTINATION)
 
         # Extract the downloaded zip file
         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
             zip_ref.extractall('./vectorstore/')
         print("Download and extraction completed.")
-        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+        return FAISS.load_local(FAISS_INDEX_PATH, embeddings,allow_dangerous_deserialization=True)
 
     except Exception as e:
         print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
@@ -61,9 +61,12 @@ def get_chroma_vs():
 
     VS_DESTINATION = CHROMA_DIRECTORY+".zip"
     try:
+        # Download the pre-prepared vectorized index from the S3 bucket
+        print("Downloading the pre-prepared chroma vectorstore from S3...")
         s3.download_file(S3_LOCATION, CHROMA_VS_NAME, VS_DESTINATION)
         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
             zip_ref.extractall('./vectorstore/')
+        print("Download and extraction completed.")
         chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
         chromadb.get()
     except Exception as e:
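The allow_dangerous_deserialization=True flag is required by recent langchain_community releases because FAISS.load_local unpickles the saved docstore; it is only appropriate for archives you produced yourself, as with this bucket. A standalone sketch of the corrected download-and-load flow; bucket, key, and paths are taken from example.env, and the boto3 client setup is assumed:

import zipfile
import boto3
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

s3 = boto3.client("s3")
s3.download_file(
    "rad-rag-demos",                                       # S3_LOCATION
    "vectorstores/faiss-insurance-agent-MiniLM-1500.zip",  # FAISS_VS_NAME
    "./vectorstore/faiss-insurance-agent-500.zip",         # local zip destination
)
with zipfile.ZipFile("./vectorstore/faiss-insurance-agent-500.zip", "r") as zip_ref:
    zip_ref.extractall("./vectorstore/")

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"
)
faiss_vs = FAISS.load_local(
    "./vectorstore/faiss-insurance-agent-500",
    embeddings,
    allow_dangerous_deserialization=True,  # safe only because we built this archive ourselves
)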
rag_app/structured_tools/structured_tools.py
CHANGED
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
 
 @tool
 def knowledgeBase_search(query:str) -> str:
-    """
+    """Search the internal database for matching insurance products and information about the insurance policies"""
     # Since we have more than one collections we should change the name of this tool
     client = chromadb.PersistentClient(
         path=persist_directory,
@@ -82,7 +82,7 @@ def knowledgeBase_search(query:str) -> str:
 
 @tool
 def google_search(query: str) -> str:
-    """
+    """Improve the results with a search on the insurer's website. Formulate a new search query to improve the chances of success."""
     global all_sources
 
     websearch = GoogleSearchAPIWrapper()
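With LangChain's @tool decorator, the docstring becomes the tool description the ReAct agent reads when deciding which tool to call, and recent versions raise a ValueError when a tool has no description, so these additions are functional rather than cosmetic. A minimal illustration of the pattern; the body is a placeholder:

from langchain_core.tools import tool

@tool
def knowledgeBase_search(query: str) -> str:
    """Search the internal database for matching insurance products."""
    # the docstring above is surfaced to the agent as the tool description
    return f"results for {query!r}"

print(knowledgeBase_search.description)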
rag_app/{generate_summary.py → utils/generate_summary.py}
RENAMED
File without changes
test_this.py
CHANGED
@@ -1,7 +1,9 @@
 from rag_app.loading_data.load_urls_recurisvely import load_docs_from_urls
-from rag_app.create_embedding import create_embeddings
-from rag_app.generate_summary import generate_description, generate_keywords
-from rag_app.build_vector_store import build_vector_store
+from rag_app.knowledge_base.create_embedding import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+from rag_app.knowledge_base.build_vector_store import build_vector_store
+from rag_app.loading_data.scrap_website import scrap_website
+from rag_app.loading_data.load_S3_vector_stores import get_chroma_vs, get_faiss_vs
 
 # 1. load the urls
 # 2. build the vectorstore -> the function will create the chunking and embeddings
@@ -22,6 +24,7 @@ from rag_app.build_vector_store import build_vector_store
 
 # print(create_embeddings(docs))
 
-from rag_app.loading_data.scrap_website import scrap_website
 
-print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))
+#print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))
+get_faiss_vs()
+#get_chroma_vs()