Asaad Almutareb committed
Commit • 6f2843a
1 Parent(s): fb95c43

corrected S3 vectorstore
- None +0 -0
- app_gui.py +1 -1
- example.env +6 -3
- rag_app/{react_agent.py → agents/react_agent.py} +0 -0
- rag_app/{simple_qa_chain.py → chains/simple_qa_chain.py} +0 -0
- rag_app/{build_vector_store.py → knowledge_base/build_vector_store.py} +8 -6
- rag_app/{create_embedding.py → knowledge_base/create_embedding.py} +6 -2
- rag_app/{load_vector_stores.py → loading_data/load_S3_vector_stores.py} +5 -2
- rag_app/structured_tools/structured_tools.py +2 -2
- rag_app/{generate_summary.py → utils/generate_summary.py} +0 -0
- test_this.py +8 -5
None
ADDED
Binary file (12.3 kB)
app_gui.py
CHANGED
@@ -1,6 +1,6 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
-from rag_app.react_agent import agent_executor
+from rag_app.agents.react_agent import agent_executor
 # need to import the qa!
 
 # Function to add a new input to the chat history
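The only change here is the import path, following the move of react_agent.py into rag_app/agents/. For context, a minimal sketch of how such an agent_executor is typically wired into a Gradio chat UI; the respond handler and the ChatInterface wiring below are illustrative assumptions, not code from this commit:

import gradio as gr
from rag_app.agents.react_agent import agent_executor

def respond(message, history):
    # a LangChain AgentExecutor returns a dict whose "output" key holds the final answer
    result = agent_executor.invoke({"input": message})
    return result["output"]

gr.ChatInterface(respond).launch()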
example.env
CHANGED
@@ -5,8 +5,10 @@ GOOGLE_API_KEY=""
 
 # Vectorstore storage on S3 and locally
 S3_LOCATION="rad-rag-demos"
-
-
+#faiss-insurance-agent-mpnet-1500.zip
+FAISS_VS_NAME="vectorstores/faiss-insurance-agent-MiniLM-1500.zip"
+#chroma-insurance-agent-mpnet-1500.zip
+CHROMA_VS_NAME="vectorstore/chroma-insurance-agent-MiniLM-1500.zip"
 FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
 CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
 
@@ -18,7 +20,8 @@ CONVERSATION_COLLECTION_NAME="ConversationMemory"
 
 # llm and embedding models
 #EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
-EMBEDDING_MODEL="
+EMBEDDING_MODEL="sentence-transformers/distiluse-base-multilingual-cased-v2" #512 dims
+#EMBEDDING_MODEL="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" #384 dims
 LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
 LLM_MODEL_ARGS=
 
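The new FAISS_VS_NAME and CHROMA_VS_NAME keys name the zipped vector stores inside the S3 bucket, separately from the local extraction paths. A minimal sketch of reading these variables at runtime, assuming python-dotenv; the loading code itself is not part of this commit:

import os
from dotenv import load_dotenv

load_dotenv()  # copy .env entries into the process environment

S3_LOCATION = os.getenv("S3_LOCATION")            # S3 bucket name
FAISS_VS_NAME = os.getenv("FAISS_VS_NAME")        # zip key inside the bucket
FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")  # local extraction target
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")    # must match the model the index was built with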
rag_app/{react_agent.py → agents/react_agent.py}
RENAMED
File without changes
rag_app/{simple_qa_chain.py → chains/simple_qa_chain.py}
RENAMED
File without changes
rag_app/{build_vector_store.py → knowledge_base/build_vector_store.py}
RENAMED
@@ -1,15 +1,17 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import DirectoryLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+#from langchain_community.document_loaders import DirectoryLoader
+#from langchain_text_splitters import RecursiveCharacterTextSplitter
+#from langchain_community.embeddings.sentence_transformer import (
+#    SentenceTransformerEmbeddings,
+#)
 from langchain_community.retrievers import BM25Retriever
-from rag_app.create_embedding import create_embeddings
-from rag_app.generate_summary import generate_description, generate_keywords
+from rag_app.knowledge_base.create_embedding import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
 import time
 import os
-from dotenv import load_dotenv
+#from dotenv import load_dotenv
 
 def build_vector_store(
     docs: list,
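With chunking and embedding delegated to rag_app.knowledge_base.create_embedding, the unused loader, splitter, and embedding imports are commented out rather than deleted. A hedged sketch of the FAISS build path these imports support; the body of build_vector_store is not shown in this diff, so everything past the imports is an assumption:

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.documents import Document

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"
)
docs = [Document(page_content="Example insurance product description.")]
vectorstore = FAISS.from_documents(docs, embeddings)  # chunk-level documents in, index out
vectorstore.save_local("./vectorstore/faiss-insurance-agent-500")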
rag_app/{create_embedding.py → knowledge_base/create_embedding.py}
RENAMED
@@ -3,7 +3,10 @@
 #from langchain_community.document_loaders import ReadTheDocsLoader
 #from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
 import time
 from langchain_core.documents import Document
 
@@ -44,7 +47,8 @@ def create_embeddings(
     print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
-    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    #embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
     print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
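SentenceTransformerEmbeddings in langchain_community is, as far as I know, a thin alias of the HuggingFace sentence-transformers wrapper, so this swap keeps behavior while dropping the separate langchain-huggingface dependency. A small usage sketch with the model configured in example.env; the query string is illustrative:

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"  # 512 dims per example.env
)
vector = embeddings.embed_query("Which household insurance fits a small apartment?")
print(len(vector))  # expect 512 for this model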
rag_app/{load_vector_stores.py → loading_data/load_S3_vector_stores.py}
RENAMED
@@ -40,14 +40,14 @@ def get_faiss_vs():
     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
     try:
         # Download the pre-prepared vectorized index from the S3 bucket
-        print("Downloading the pre-prepared vectorized index from S3...")
+        print("Downloading the pre-prepared FAISS vectorized index from S3...")
         s3.download_file(S3_LOCATION, FAISS_VS_NAME, VS_DESTINATION)
 
         # Extract the downloaded zip file
         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
             zip_ref.extractall('./vectorstore/')
         print("Download and extraction completed.")
-        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+        return FAISS.load_local(FAISS_INDEX_PATH, embeddings,allow_dangerous_deserialization=True)
 
     except Exception as e:
         print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
@@ -61,9 +61,12 @@ def get_chroma_vs():
 
     VS_DESTINATION = CHROMA_DIRECTORY+".zip"
     try:
+        # Download the pre-prepared vectorized index from the S3 bucket
+        print("Downloading the pre-prepared chroma vectorstore from S3...")
         s3.download_file(S3_LOCATION, CHROMA_VS_NAME, VS_DESTINATION)
         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
             zip_ref.extractall('./vectorstore/')
+        print("Download and extraction completed.")
         chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
         chromadb.get()
     except Exception as e:
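The allow_dangerous_deserialization=True flag is required by recent langchain_community releases because FAISS.load_local unpickles the saved docstore; it is only appropriate for archives you produced yourself, as with this bucket. A standalone sketch of the corrected download-and-load flow; bucket, key, and paths are taken from example.env, and the boto3 client setup is assumed:

import zipfile
import boto3
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

s3 = boto3.client("s3")
s3.download_file(
    "rad-rag-demos",                                       # S3_LOCATION
    "vectorstores/faiss-insurance-agent-MiniLM-1500.zip",  # FAISS_VS_NAME
    "./vectorstore/faiss-insurance-agent-500.zip",         # local zip destination
)
with zipfile.ZipFile("./vectorstore/faiss-insurance-agent-500.zip", "r") as zip_ref:
    zip_ref.extractall("./vectorstore/")

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v2"
)
faiss_vs = FAISS.load_local(
    "./vectorstore/faiss-insurance-agent-500",
    embeddings,
    allow_dangerous_deserialization=True,  # safe only because we built this archive ourselves
)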
rag_app/structured_tools/structured_tools.py
CHANGED
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
 
 @tool
 def knowledgeBase_search(query:str) -> str:
-    """
+    """Search the internal database for matching insurance products and information about the insurance policies"""
     # Since we have more than one collections we should change the name of this tool
     client = chromadb.PersistentClient(
         path=persist_directory,
@@ -82,7 +82,7 @@ def knowledgeBase_search(query:str) -> str:
 
 @tool
 def google_search(query: str) -> str:
-    """
+    """Improve the results with a search on the insurer's website. Formulate a new search query to improve the chances of success."""
     global all_sources
 
     websearch = GoogleSearchAPIWrapper()
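With LangChain's @tool decorator, the docstring becomes the tool description the ReAct agent reads when deciding which tool to call, and recent versions raise a ValueError when a tool has no description, so these additions are functional rather than cosmetic. A minimal illustration of the pattern; the body is a placeholder:

from langchain_core.tools import tool

@tool
def knowledgeBase_search(query: str) -> str:
    """Search the internal database for matching insurance products."""
    # the docstring above is surfaced to the agent as the tool description
    return f"results for {query!r}"

print(knowledgeBase_search.description)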
rag_app/{generate_summary.py → utils/generate_summary.py}
RENAMED
File without changes
test_this.py
CHANGED
@@ -1,7 +1,9 @@
 from rag_app.loading_data.load_urls_recurisvely import load_docs_from_urls
-from rag_app.create_embedding import create_embeddings
-from rag_app.generate_summary import generate_description, generate_keywords
-from rag_app.build_vector_store import build_vector_store
+from rag_app.knowledge_base.create_embedding import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+from rag_app.knowledge_base.build_vector_store import build_vector_store
+from rag_app.loading_data.scrap_website import scrap_website
+from rag_app.loading_data.load_S3_vector_stores import get_chroma_vs, get_faiss_vs
 
 # 1. load the urls
 # 2. build the vectorstore -> the function will create the chunking and embeddings
@@ -22,6 +24,7 @@ from rag_app.build_vector_store import build_vector_store
 
 # print(create_embeddings(docs))
 
-from rag_app.loading_data.scrap_website import scrap_website
 
-print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))
+#print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))
+get_faiss_vs()
+#get_chroma_vs()