Asaad Almutareb committed
Commit 6f2843a • 1 Parent(s): fb95c43

corrected S3 vectorstore

None ADDED
Binary file (12.3 kB).
 
app_gui.py CHANGED
@@ -1,6 +1,6 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
-from rag_app.react_agent import agent_executor
+from rag_app.agents.react_agent import agent_executor
 # need to import the qa!
 
 # Function to add a new input to the chat history
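The only code change in app_gui.py is the import path: agent_executor now lives in rag_app/agents/. As a rough illustration of how that executor could be wired into the Gradio chat UI, here is a minimal, hypothetical sketch (the respond handler and the "input"/"output" keys are assumptions, not code from this repository):

import gradio as gr
from rag_app.agents.react_agent import agent_executor

def respond(message, chat_history):
    # AgentExecutor exposes .invoke(); the exact input/output keys depend on
    # how the agent prompt is defined, so "input" and "output" are assumed here.
    result = agent_executor.invoke({"input": message})
    chat_history.append((message, result["output"]))
    return "", chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask about an insurance product...")
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch()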
example.env CHANGED
@@ -5,8 +5,10 @@ GOOGLE_API_KEY=""
 
 # Vectorstore storage on S3 and locally
 S3_LOCATION="rad-rag-demos"
-FAISS_VS_NAME="vectorstores/faiss-insurance-agent-500.zip"
-CHROMA_VS_NAME=""
+#faiss-insurance-agent-mpnet-1500.zip
+FAISS_VS_NAME="vectorstores/faiss-insurance-agent-MiniLM-1500.zip"
+#chroma-insurance-agent-mpnet-1500.zip
+CHROMA_VS_NAME="vectorstore/chroma-insurance-agent-MiniLM-1500.zip"
 FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
 CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
 
@@ -18,7 +20,8 @@ CONVERSATION_COLLECTION_NAME="ConversationMemory"
 
 # llm and embedding models
 #EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
-EMBEDDING_MODEL="microsoft/Multilingual-MiniLM-L12-H384"
+EMBEDDING_MODEL="sentence-transformers/distiluse-base-multilingual-cased-v2" #512 dims
+#EMBEDDING_MODEL="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" #384 dims
 LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
 LLM_MODEL_ARGS=
 
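The new keys pair an S3 object (the zipped vectorstore) with the local directory it is extracted into, and the active EMBEDDING_MODEL has to be the same model the downloaded store was built with (the comments note 512 dimensions for the distiluse model versus 384 for the MiniLM one). A small sketch of how these values are typically read with python-dotenv; the variable names come from the file above, the code itself is illustrative:

import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env copied from example.env

S3_LOCATION = os.getenv("S3_LOCATION")          # S3 bucket holding the zipped stores
FAISS_VS_NAME = os.getenv("FAISS_VS_NAME")      # object key of the zipped FAISS index
CHROMA_VS_NAME = os.getenv("CHROMA_VS_NAME")    # object key of the zipped Chroma store
FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
CHROMA_DIRECTORY = os.getenv("CHROMA_DIRECTORY")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")  # must match the model used to build the stores

print(f"FAISS zip {FAISS_VS_NAME} from bucket {S3_LOCATION} -> {FAISS_INDEX_PATH}")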
rag_app/{react_agent.py → agents/react_agent.py} RENAMED
File without changes
rag_app/{simple_qa_chain.py → chains/simple_qa_chain.py} RENAMED
File without changes
rag_app/{build_vector_store.py → knowledge_base/build_vector_store.py} RENAMED
@@ -1,15 +1,17 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import DirectoryLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
+#from langchain_community.document_loaders import DirectoryLoader
+#from langchain_text_splitters import RecursiveCharacterTextSplitter
+#from langchain_community.embeddings.sentence_transformer import (
+#    SentenceTransformerEmbeddings,
+#)
 from langchain_community.retrievers import BM25Retriever
-from rag_app.create_embedding import create_embeddings
-from rag_app.generate_summary import generate_description, generate_keywords
+from rag_app.knowledge_base.create_embedding import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
 import time
 import os
-from dotenv import load_dotenv
+#from dotenv import load_dotenv
 
 def build_vector_store(
     docs: list,
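The body of build_vector_store is not part of this hunk, so the following is only a sketch of how the imported pieces could fit together; the build_faiss_store name and its signature are hypothetical, and create_embeddings is assumed to return an (embeddings, chunks) pair as in create_embedding.py below:

from langchain_community.vectorstores import FAISS
from rag_app.knowledge_base.create_embedding import create_embeddings

def build_faiss_store(docs: list, embedding_model: str, index_path: str):
    # Chunk the documents and construct the embedding function in one step
    embeddings, chunks = create_embeddings(docs, embedding_model)
    # Index the chunks and persist the FAISS store at the configured path
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(index_path)
    return vectorstore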
rag_app/{create_embedding.py → knowledge_base/create_embedding.py} RENAMED
@@ -3,7 +3,10 @@
 #from langchain_community.document_loaders import ReadTheDocsLoader
 #from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
 import time
 from langchain_core.documents import Document
 
@@ -44,7 +47,8 @@ def create_embeddings(
     print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
-    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    #embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
     print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
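SentenceTransformerEmbeddings from langchain_community is the sentence-transformers backed wrapper (effectively an alias of HuggingFaceEmbeddings), so the change mainly swaps where the class is imported from. A minimal usage sketch with one of the models configured in example.env:

from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"
)
vector = embeddings.embed_query("Which household insurance fits a family of four?")
print(len(vector))  # this model produces 512-dimensional vectors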
rag_app/{load_vector_stores.py → loading_data/load_S3_vector_stores.py} RENAMED
@@ -40,14 +40,14 @@ def get_faiss_vs():
     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
     try:
         # Download the pre-prepared vectorized index from the S3 bucket
-        print("Downloading the pre-prepared vectorized index from S3...")
+        print("Downloading the pre-prepared FAISS vectorized index from S3...")
         s3.download_file(S3_LOCATION, FAISS_VS_NAME, VS_DESTINATION)
 
         # Extract the downloaded zip file
         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
             zip_ref.extractall('./vectorstore/')
         print("Download and extraction completed.")
-        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+        return FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
 
     except Exception as e:
         print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
@@ -61,9 +61,12 @@ def get_chroma_vs():
 
     VS_DESTINATION = CHROMA_DIRECTORY+".zip"
     try:
+        # Download the pre-prepared vectorized index from the S3 bucket
+        print("Downloading the pre-prepared chroma vectorstore from S3...")
         s3.download_file(S3_LOCATION, CHROMA_VS_NAME, VS_DESTINATION)
         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
             zip_ref.extractall('./vectorstore/')
+        print("Download and extraction completed.")
         chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
         chromadb.get()
     except Exception as e:
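Both loaders follow the same corrected pattern: download the zipped store from the S3 bucket, extract it under ./vectorstore/, then open it with an embedding function that matches the index. The added allow_dangerous_deserialization=True flag is needed on recent LangChain versions because FAISS.load_local unpickles the stored index.pkl. A self-contained sketch of the FAISS path, with bucket, key, and paths taken from example.env (the default boto3 credential lookup and the extracted folder name matching FAISS_INDEX_PATH are assumptions):

import zipfile
import boto3
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

S3_LOCATION = "rad-rag-demos"
FAISS_VS_NAME = "vectorstores/faiss-insurance-agent-MiniLM-1500.zip"
FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"

s3 = boto3.client("s3")  # credentials/region are resolved from the environment
destination = FAISS_INDEX_PATH + ".zip"
s3.download_file(S3_LOCATION, FAISS_VS_NAME, destination)

with zipfile.ZipFile(destination, "r") as zip_ref:
    zip_ref.extractall("./vectorstore/")

# Must be the same model the downloaded index was built with.
embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
faiss_vs = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
print(faiss_vs.similarity_search("household insurance", k=2))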
rag_app/structured_tools/structured_tools.py CHANGED
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
 
 @tool
 def knowledgeBase_search(query:str) -> str:
-    """Search the internal knowledge base for research papers and relevant chunks"""
+    """Search the internal database for matching insurance products and information about the insurance policies."""
    # Since we have more than one collections we should change the name of this tool
     client = chromadb.PersistentClient(
         path=persist_directory,
@@ -82,7 +82,7 @@ def knowledgeBase_search(query:str) -> str:
 
 @tool
 def google_search(query: str) -> str:
-    """Search Google for additional results when you can't answer questions using arxiv search or wikipedia search."""
+    """Improve the results with a search over the insurer's website. Formulate a new search query to improve the chances of success."""
     global all_sources
 
     websearch = GoogleSearchAPIWrapper()
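For a ReAct agent, a tool's docstring is the description the LLM reads when deciding which tool to call, so this commit replaces the generic descriptions with insurance-specific ones. A minimal, illustrative sketch of how the @tool decorator exposes that docstring (the demo function below is not from the repository):

from langchain_core.tools import tool

@tool
def knowledge_base_search_demo(query: str) -> str:
    """Search the internal database for matching insurance products and related information."""
    # A real implementation would query the Chroma/FAISS store here.
    return f"results for: {query}"

print(knowledge_base_search_demo.name)         # tool name the agent sees
print(knowledge_base_search_demo.description)  # derived from the docstring above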
rag_app/{generate_summary.py → utils/generate_summary.py} RENAMED
File without changes
test_this.py CHANGED
@@ -1,7 +1,9 @@
 from rag_app.loading_data.load_urls_recurisvely import load_docs_from_urls
-from rag_app.create_embedding import create_embeddings
-from rag_app.generate_summary import generate_description, generate_keywords
-from rag_app.build_vector_store import build_vector_store
+from rag_app.knowledge_base.create_embedding import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+from rag_app.knowledge_base.build_vector_store import build_vector_store
+from rag_app.loading_data.scrap_website import scrap_website
+from rag_app.loading_data.load_S3_vector_stores import get_chroma_vs, get_faiss_vs
 
 # 1. load the urls
 # 2. build the vectorstore -> the function will create the chunking and embeddings
@@ -22,6 +24,7 @@ from rag_app.build_vector_store import build_vector_store
 
 # print(create_embeddings(docs))
 
-from rag_app.loading_data.scrap_website import scrap_website
-
-print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))
+
+#print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))
+get_faiss_vs()
+#get_chroma_vs()