Commit f5d22a4
Asaad Almutareb committed
Parent(s): 5c11631

migrated notebook to python code
Files changed:
- .gitignore +3 -0
- app_gui.py +65 -0
- core-langchain-rag.py +78 -30
- example.env +15 -0
- rag-system-anatomy/build_vector_store.py +0 -46
- rag-system-anatomy/load_example_embeddings.py +0 -37
- {rag-system-anatomy → rag_app}/create_embedding.py +13 -11
- rag_app/generate_summary.py +69 -0
- {rag-system-anatomy → rag_app}/get_db_retriever.py +0 -0
- rag_app/handle_vector_store.py +82 -0
- {rag-system-anatomy → rag_app}/load_data_from_urls.py +23 -5
- rag_app/load_vector_stores.py +70 -0
- rag_app/react_agent.py +0 -0
- rag_app/simple_qa_chain.py +0 -0
- requirements.txt +2 -1
- test_this.py +17 -0
.gitignore CHANGED
@@ -158,3 +158,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*.zip
+*.pkl
+*.faiss
app_gui.py ADDED
@@ -0,0 +1,65 @@
+# Import Gradio for UI, along with other necessary libraries
+import gradio as gr
+# need to import the qa!
+
+# Function to add a new input to the chat history
+def add_text(history, text):
+    # Append the new text to the history with a placeholder for the response
+    history = history + [(text, None)]
+    return history, ""
+
+# Function representing the bot's response mechanism
+def bot(history):
+    # Obtain the response from the 'infer' function using the latest input
+    response = infer(history[-1][0], history)
+    sources = [doc.metadata.get("source") for doc in response['source_documents']]
+    src_list = '\n'.join(sources)
+    print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
+
+
+    history[-1][1] = print_this #response['answer']
+    # Update the history with the bot's response
+    #history[-1][1] = response['result']
+    return history
+
+# Function to infer the response using the RAG model
+def infer(question, history):
+    # Use the question and history to query the RAG model
+    result = qa({"query": question, "history": history, "question": question})
+    return result
+
+# CSS styling for the Gradio interface
+css = """
+#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
+"""
+
+# HTML content for the Gradio interface title
+title = """
+<div style="text-align:left;">
+<p>Hello, I BotTina 2.0, your intelligent AI assistant. I can help you explore Wuerttembergische Versicherungs products.<br />
+</div>
+"""
+
+# Building the Gradio interface
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    with gr.Column(elem_id="col-container"):
+        gr.HTML(title)  # Add the HTML title to the interface
+        chatbot = gr.Chatbot([], elem_id="chatbot",
+                             bubble_full_width=False,
+                             avatar_images=(None, "https://dacodi-production.s3.amazonaws.com/store/87bc00b6727589462954f2e3ff6f531c.png"),
+                             height=680,)  # Initialize the chatbot component
+        clear = gr.Button("Clear")  # Add a button to clear the chat
+
+        # Create a row for the question input
+        with gr.Row():
+            question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
+
+        # Define the action when the question is submitted
+        question.submit(add_text, [chatbot, question], [chatbot, question], queue=False).then(
+            bot, chatbot, chatbot
+        )
+        # Define the action for the clear button
+        clear.click(lambda: None, None, chatbot, queue=False)
+
+# Launch the Gradio demo interface
+demo.launch(share=False, debug=True)
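Note that `qa` is used in `infer` but never defined or imported in this file (the `# need to import the qa!` comment acknowledges this). A minimal sketch of one way it could be wired up, assuming the same environment variables that core-langchain-rag.py reads (EMBEDDING_MODEL, FAISS_INDEX_PATH, LLM_MODEL) and a FAISS index that already exists locally; this is not part of the commit:

# Hypothetical wiring for the missing `qa` object (illustration only)
import os
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain.chains import RetrievalQA

embeddings = HuggingFaceEmbeddings(model_name=os.getenv("EMBEDDING_MODEL"))
db = FAISS.load_local(os.getenv("FAISS_INDEX_PATH"), embeddings)
llm = HuggingFaceEndpoint(repo_id=os.getenv("LLM_MODEL"), temperature=0.1, max_new_tokens=1024)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=True,  # bot() reads response['source_documents']
)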
core-langchain-rag.py CHANGED
@@ -97,29 +97,31 @@ config = load_dotenv(".env")
 # Retrieve the Hugging Face API token from environment variables
 HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 S3_LOCATION = os.getenv("S3_LOCATION")
+S3_FILE_NAME = os.getenv("FAISS_VS_NAME")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
 
-try:
-
-
+# try:
+#     # Initialize an S3 client with unsigned configuration for public access
+#     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
 
-
-
-
+#     # Define the FAISS index path and the destination for the downloaded file
+#     #FAISS_INDEX_PATH = './vectorstore/lc-faiss-multi-mpnet-500-markdown'
+#     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
 
-
-
-
+#     # Download the pre-prepared vectorized index from the S3 bucket
+#     print("Downloading the pre-prepared vectorized index from S3...")
+#     s3.download_file(S3_LOCATION, S3_FILE_NAME, VS_DESTINATION)
 
-
-
-
-
+#     # Extract the downloaded zip file
+#     with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+#         zip_ref.extractall('./vectorstore/')
+#     print("Download and extraction completed.")
 
-except Exception as e:
-
+# except Exception as e:
+#     print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
 
 # Define the model name for embeddings
-model_name = "
+model_name = os.getenv("EMBEDDING_MODEL")
 
 try:
     # Initialize HuggingFace embeddings with the specified model
@@ -135,11 +137,13 @@ except Exception as e:
 from langchain_huggingface import HuggingFaceEndpoint
 
 # Initialize the vector store as a retriever for the RAG pipeline
-retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.25})
+retriever = db.as_retriever()#search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.25})
+
+llm_model = os.getenv("LLM_MODEL")
 
 try:
     # Load the model from the Hugging Face Hub
-    model_id = HuggingFaceEndpoint(repo_id=
+    model_id = HuggingFaceEndpoint(repo_id=llm_model,
         temperature=0.1,  # Controls randomness in response generation (lower value means less random)
         max_new_tokens=1024,  # Maximum number of new tokens to generate in responses
         repetition_penalty=1.2,  # Penalty for repeating the same words (higher value increases penalty)
@@ -153,16 +157,19 @@ except Exception as e:
 
 # Importing necessary modules for retrieval-based question answering and prompt handling
 from langchain.chains import RetrievalQA
-from langchain.
+from langchain.chains import LLMChain
+from langchain_core.prompts import PromptTemplate
 from langchain.memory import ConversationBufferMemory
+from langchain_core.output_parsers import StrOutputParser
 
 # Declare a global variable 'qa' for the retrieval-based question answering system
 global qa
 
 # Define a prompt template for guiding the model's responses
 template = """
-You are
-
+You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
+You help the user find the answers to all his questions queries. Answer in short and simple terms and offer to explain the product and terms to the user.\
+Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to help find the best product for the user:
 ------
 <ctx>
 {context}
@@ -200,10 +207,46 @@ qa = RetrievalQA.from_chain_type(
     }
 )
 
+def generate_qa_retriever(history: dict, question: str, llm_model:HuggingFaceEndpoint = model_id) -> dict:
+    """ Generare a response to queries using the retriever"""
+
+    # Define a prompt template for guiding the model's responses
+    template = """
+    You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
+    You help the user find the answers to all his questions. Answer in short and simple terms and offer to explain the product and terms to the user.\
+    Respond only using the provided context (delimited by <ctx></ctx>) and only in German or Englisch, depending on the question's language.
+    Use the chat history (delimited by <hs></hs>) to help find the best product for the user:
+    ------
+    <ctx>
+    {context}
+    </ctx>
+    ------
+    <hs>
+    {history}
+    </hs>
+    ------
+    {question}
+    Answer:
+    """
+
+    # Create a PromptTemplate object with specified input variables and the defined template
+    prompt = PromptTemplate.from_template(
+        template=template,  # The prompt template as defined above
+    )
+    prompt.format(context="context", history="history", question="question")
+    # Create a memory buffer to manage conversation history
+    memory = ConversationBufferMemory(
+        memory_key="history",  # Key for storing the conversation history
+        input_key="question"  # Key for the input question
+    )
+
+    llm_chain = prompt | llm_model
+    result = llm_chain.invoke({"context": retriever, "history": history, "question": question})
+    print(result)
+    return result
+
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
-import random
-import time
 
 # Function to add a new input to the chat history
 def add_text(history, text):
@@ -220,6 +263,7 @@ def bot(history):
     print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
 
 
+    #history[-1][1] = response #print_this #response['answer']
     history[-1][1] = print_this #response['answer']
     # Update the history with the bot's response
     #history[-1][1] = response['result']
@@ -228,7 +272,9 @@ def bot(history):
 # Function to infer the response using the RAG model
 def infer(question, history):
     # Use the question and history to query the RAG model
+    #result = generate_qa_retriever(history, question)
     result = qa({"query": question, "history": history, "question": question})
+    print(*result)
     return result
 
 # CSS styling for the Gradio interface
@@ -238,18 +284,20 @@ css = """
 
 # HTML content for the Gradio interface title
 title = """
-<div style="text-align:
-<
-<p style="text-align: center;">Chat with LangChain Documentation, <br />
-You can ask questions about the LangChain docu ;)</p>
+<div style="text-align:left;">
+<p>Hello, I BotTina 2.0, your intelligent AI assistant. I can help you explore Wuerttembergische Versicherungs products.<br />
 </div>
 """
 
 # Building the Gradio interface
-with gr.Blocks(
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML(title)  # Add the HTML title to the interface
-        chatbot = gr.Chatbot([], elem_id="chatbot"
+        chatbot = gr.Chatbot([], elem_id="chatbot",
+                             label="BotTina 2.0",
+                             bubble_full_width=False,
+                             avatar_images=(None, "https://dacodi-production.s3.amazonaws.com/store/87bc00b6727589462954f2e3ff6f531c.png"),
+                             height=680,)  # Initialize the chatbot component
         clear = gr.Button("Clear")  # Add a button to clear the chat
 
         # Create a row for the question input
@@ -264,4 +312,4 @@ with gr.Blocks(css=css) as demo:
         clear.click(lambda: None, None, chatbot, queue=False)
 
 # Launch the Gradio demo interface
-demo.launch(
+demo.launch(debug=True)
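The Gradio callbacks in this script still route every question through the global `qa` chain. A minimal smoke test of that chain outside the UI, assuming the script above has already been executed so that `qa` is in scope and the environment variables it reads are set (the sample question is invented):

# Hypothetical smoke test of the qa chain defined above (illustration only)
query = "Which household insurance products do you offer?"
response = qa({"query": query, "history": [], "question": query})
print(response["result"])
for doc in response["source_documents"]:
    print(doc.metadata.get("source"))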
example.env ADDED
@@ -0,0 +1,15 @@
+# API Keys for services
+HUGGINGFACEHUB_API_TOKEN=""
+GOOGLE_CSE_ID=""
+GOOGLE_API_KEY=""
+
+# AWS S3 object storage
+S3_LOCATION=""
+S3_FILE_NAME=""
+
+# Local vectorstore storage
+FAISS_INDEX_PATH = ""
+
+# llm and embedding models
+embedding_model=""
+llm_model=""
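These settings are read at runtime with python-dotenv. A minimal sketch of how the rest of the code consumes them; note that core-langchain-rag.py and rag_app/load_vector_stores.py query upper-case names such as EMBEDDING_MODEL, LLM_MODEL and FAISS_VS_NAME, so the keys in a local .env have to match whatever names the code actually looks up:

# Minimal sketch, assuming a .env file based on example.env sits next to the code
import os
from dotenv import load_dotenv

load_dotenv()  # copies the key=value pairs from .env into the process environment
embedding_model = os.getenv("EMBEDDING_MODEL")
llm_model = os.getenv("LLM_MODEL")
faiss_index_path = os.getenv("FAISS_INDEX_PATH")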
rag-system-anatomy/build_vector_store.py DELETED
@@ -1,46 +0,0 @@
-# vectorization functions
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import ReadTheDocsLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
-from create_embedding import create_embeddings
-import time
-
-def build_vector_store(
-    docs: list,
-    db_path: str,
-    embedding_model: str,
-    new_db:bool=False,
-    chunk_size:int=500,
-    chunk_overlap:int=50,
-    ):
-    """
-
-    """
-
-    if db_path is None:
-        FAISS_INDEX_PATH = "./vectorstore/py-faiss-multi-mpnet-500"
-    else:
-        FAISS_INDEX_PATH = db_path
-
-    embeddings,chunks = create_embeddings(docs, embedding_model, chunk_size, chunk_overlap)
-
-    #load chunks into vector store
-    print(f'Loading chunks into faiss vector store ...')
-    st = time.time()
-    if new_db:
-        db_faiss = FAISS.from_documents(chunks, embeddings)
-    else:
-        db_faiss = FAISS.add_documents(chunks, embeddings)
-    db_faiss.save_local(FAISS_INDEX_PATH)
-    et = time.time() - st
-    print(f'Time taken: {et} seconds.')
-
-    #print(f'Loading chunks into chroma vector store ...')
-    #st = time.time()
-    #persist_directory='./vectorstore/py-chroma-multi-mpnet-500'
-    #db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
-    #et = time.time() - st
-    #print(f'Time taken: {et} seconds.')
-    result = f"built vectore store at {FAISS_INDEX_PATH}"
-    return result
rag-system-anatomy/load_example_embeddings.py DELETED
@@ -1,37 +0,0 @@
-# preprocessed vectorstore retrieval
-import boto3
-from botocore import UNSIGNED
-from botocore.client import Config
-import zipfile
-from langchain.vectorstores import FAISS
-from langchain.vectorstores import Chroma
-from langchain.embeddings import HuggingFaceEmbeddings
-
-# access .env file
-
-s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
-
-model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-#model_kwargs = {"device": "cuda"}
-
-embeddings = HuggingFaceEmbeddings(
-    model_name=model_name,
-    # model_kwargs=model_kwargs
-    )
-
-## FAISS
-FAISS_INDEX_PATH='./vectorstore/lc-faiss-multi-mpnet-500-markdown'
-VS_DESTINATION = FAISS_INDEX_PATH+".zip"
-s3.download_file('rad-rag-demos', 'vectorstores/lc-faiss-multi-mpnet-500-markdown.zip', VS_DESTINATION)
-with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-    zip_ref.extractall('./vectorstore/')
-faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
-
-## Chroma DB
-chroma_directory="./vectorstore/lc-chroma-multi-mpnet-500-markdown"
-VS_DESTINATION = chroma_directory+".zip"
-s3.download_file('rad-rag-demos', 'vectorstores/lc-chroma-multi-mpnet-500-markdown.zip', VS_DESTINATION)
-with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-    zip_ref.extractall('./vectorstore/')
-chromadb = Chroma(persist_directory=chroma_directory, embedding_function=embeddings)
-chromadb.get()
{rag-system-anatomy → rag_app}/create_embedding.py RENAMED
@@ -1,16 +1,17 @@
 # embeddings functions
-from
-from
-from
-from
+#from langchain_community.vectorstores import FAISS
+#from langchain_community.document_loaders import ReadTheDocsLoader
+#from langchain_community.vectorstores.utils import filter_complex_metadata
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
 import time
 from langchain_core.documents import Document
 
 
 def create_embeddings(
     docs: list[Document],
-    chunk_size:int,
-    chunk_overlap:int,
+    chunk_size:int = 500,
+    chunk_overlap:int = 50,
     embedding_model: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
     ):
     """given a sequence of `Document` objects this fucntion will
@@ -18,8 +19,8 @@ def create_embeddings(
 
     ## argument
     :params docs (list[Document]) -> list of `list[Document]`
-    :params chunk_size (int) -> chunk size in which documents are chunks
-    :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks
+    :params chunk_size (int) -> chunk size in which documents are chunks, defaults to 500
+    :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks, defaults to 50
     :params embedding_model (str) -> the huggingspace model that will embed the documents
     ## Return
     Tuple of embedding and chunks
@@ -35,14 +36,15 @@ def create_embeddings(
 
     # Stage one: read all the docs, split them into chunks.
     st = time.time()
-    print('Loading documents ...')
+    print('Loading documents and creating chunks ...')
 
+    # Split each document into chunks using the configured text splitter
     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
     et = time.time() - st
-    print(f'Time taken: {et} seconds.')
+    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
     embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-    print(f"
+    print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
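A brief usage sketch of the renamed helper with its new default chunking parameters; the sample document and its content are invented for illustration and are not part of the commit:

# Hypothetical usage of create_embeddings (illustration only)
from langchain_core.documents import Document
from rag_app.create_embedding import create_embeddings

docs = [Document(page_content="Household insurance covers fire and water damage.", metadata={"source": "example"})]
embeddings, chunks = create_embeddings(docs)  # chunk_size=500, chunk_overlap=50 by default
print(f"{len(chunks)} chunks ready for indexing")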
rag_app/generate_summary.py ADDED
@@ -0,0 +1,69 @@
+from langchain_huggingface import HuggingFaceEndpoint
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import json
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+HF_API_TOKEN = os.getenv('HUGGINGFACE_API_TOKEN')
+model_id=os.getenv('LLM_MODEL')
+
+LLM = HuggingFaceEndpoint(
+    repo_id=model_id,
+    temperature=0.1,
+    max_new_tokens=512,
+    repetition_penalty=1.2,
+    return_full_text=False,
+    huggingfacehub_api_token=HF_API_TOKEN)
+
+def generate_keywords(document:dict,
+                      llm_model:HuggingFaceEndpoint = LLM) -> str:
+    """ Generate a meaningful list of meta keywords for the provided document or chunk"""
+
+    template = (
+        """
+        You are a SEO expert bot. Your task is to craft a meaningful list of 5 keywords to organize documents.
+        The keywords should help us in searching and retrieving the documents later.
+
+        You will only respond with the clear, concise and meaningful 5 of keywords separated by comma.
+
+        <<<
+        Document: {document}
+        >>>
+
+        Keywords:
+        """
+    )
+
+    prompt = PromptTemplate.from_template(template=template)
+
+    chain = prompt | llm_model | StrOutputParser()
+    result = chain.invoke({'document': document})
+    return result.strip()
+
+def generate_description(document:dict,
+                         llm_model:HuggingFaceEndpoint = LLM) -> str:
+    """ Generate a meaningful document description based on document content """
+
+    template = (
+        """
+        You are a SEO expert bot. Your task is to craft a meaningful summary to descripe and organize documents.
+        The description should be a meaningful summary of the document's content and help us in searching and retrieving the documents later.
+
+        You will only respond with the clear, concise and meaningful description.
+
+        <<<
+        Document: {document}
+        >>>
+
+        Description:
+        """
+    )
+
+    prompt = PromptTemplate.from_template(template=template)
+
+    chain = prompt | llm_model | StrOutputParser()
+    result = chain.invoke({'document': document})
+    return result.strip()
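These helpers are used by handle_vector_store.py and test_this.py to enrich document metadata. A short usage sketch, assuming the environment variables above are set; the sample content is invented:

# Hypothetical usage of the summary helpers (illustration only)
from langchain_core.documents import Document
from rag_app.generate_summary import generate_keywords, generate_description

doc = Document(page_content="Our household insurance covers fire, water and storm damage.")
doc.metadata["keywords"] = generate_keywords(doc)
doc.metadata["description"] = generate_description(doc)
print(doc.metadata)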
{rag-system-anatomy → rag_app}/get_db_retriever.py RENAMED
File without changes
rag_app/handle_vector_store.py ADDED
@@ -0,0 +1,82 @@
+# vectorization functions
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import ReadTheDocsLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.retrievers import BM25Retriever
+from rag_app.create_embedding import create_embeddings
+from rag_app.generate_summary import generate_description, generate_keywords
+import time
+import os
+from dotenv import load_dotenv
+
+def build_vector_store(
+    docs: list,
+    db_path: str,
+    embedding_model: str,
+    new_db:bool=False,
+    chunk_size:int=500,
+    chunk_overlap:int=50,
+    ):
+    """
+
+    """
+
+    if db_path is None:
+        FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+    else:
+        FAISS_INDEX_PATH = db_path
+
+    embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
+    for chunk in chunks:
+        keywords=generate_keywords(chunk)
+        description=generate_description(chunk)
+        chunk.metadata['chunk_keywords']=keywords
+        chunk.metadata['chunk_description']=description
+
+    #load chunks into vector store
+    print(f'Loading chunks into faiss vector store ...')
+    st = time.time()
+    if new_db:
+        db_faiss = FAISS.from_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.from_documents(chunks)
+    else:
+        db_faiss = FAISS.add_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.add_documents(chunks)
+    db_faiss.save_local(FAISS_INDEX_PATH)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+
+    print(f'Loading chunks into chroma vector store ...')
+    st = time.time()
+    persist_directory='./vectorstore/chroma-insurance-agent-1500'
+    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+    result = f"built vectore store at {FAISS_INDEX_PATH}"
+    return result
+
+
+# # Path for saving the FAISS index
+# FAISS_INDEX_PATH = "./vectorstore/lc-faiss-multi-mpnet-500"
+
+# try:
+#     # Stage two: Vectorization of the document chunks
+#     model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # Model used for embedding
+
+#     # Initialize HuggingFace embeddings with the specified model
+#     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+
+#     print(f'Loading chunks into vector store ...')
+#     st = time.time() # Start time for performance measurement
+#     # Create a FAISS vector store from the document chunks and save it locally
+#     db = FAISS.from_documents(filter_complex_metadata(chunks), embeddings)
+#     db.save_local(FAISS_INDEX_PATH)
+#     et = time.time() - st # Calculate time taken for vectorization
+#     print(f'Time taken for vectorization and saving: {et} seconds.')
+# except Exception as e:
+#     print(f"Error during vectorization or FAISS index saving: {e}", file=sys.stderr)
+
+# alternatively download a preparaed vectorized index from S3 and load the index into vectorstore
+# Import necessary libraries for AWS S3 interaction, file handling, and FAISS vector stores
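A short end-to-end sketch of how this builder is driven; it mirrors test_this.py below, and the URL, output path and chunk sizes are just examples rather than anything the commit prescribes:

# Hypothetical end-to-end build (illustration only)
from rag_app.load_data_from_urls import load_docs_from_urls
from rag_app.handle_vector_store import build_vector_store

docs = load_docs_from_urls(["https://docs.python.org/3/"], 2)
build_vector_store(docs, "./vectorstore/faiss-example", "sentence-transformers/multi-qa-mpnet-base-dot-v1", True, 500, 50)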
{rag-system-anatomy → rag_app}/load_data_from_urls.py RENAMED
@@ -1,8 +1,13 @@
 # documents loader function
-from
+from langchain_community.document_loaders import RecursiveUrlLoader
 from bs4 import BeautifulSoup as Soup
 from validators import url as url_validator
 from langchain_core.documents import Document
+import time
+import logging
+import sys
+
+logger = logging.getLogger(__name__)
 
 def load_docs_from_urls(
     urls: list = ["https://docs.python.org/3/"],
@@ -21,12 +26,25 @@ def load_docs_from_urls(
     ## Raises:
     ValueError: If any URL in the provided list is invalid.
     """
-
+    stf = time.time() # Start time for performance measurement
     docs = []
     for url in urls:
+        st = time.time() # Start time for outer performance measurement
         if not url_validator(url):
             raise ValueError(f"Invalid URL: {url}")
-
-
-
+        try:
+            st = time.time() # Start time for inner performance measurement
+            loader = RecursiveUrlLoader(url=url, max_depth=max_depth, extractor=lambda x: Soup(x, "html.parser").text)
+            docs.extend(loader.load())
+
+            et = time.time() - st # Calculate time taken for splitting
+            logMessage=f'Time taken for downloading documents from {url}: {et} seconds.'
+            logger.info(logMessage)
+            print(logMessage)
+        except Exception as e:
+            logMessage=f"Failed to load or parse the URL {url}. Error: {e}"
+            logger.error(logMessage)
+            print(logMessage, file=sys.stderr)
+    etf = time.time() - stf # Calculate time taken for scrapping all URLs
+    print(f'Total time taken for downloading {len(docs)} documents: {etf} seconds.')
     return docs
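A quick standalone usage sketch of the loader; the URL and depth are arbitrary, and the second positional argument is the crawl depth, as in the call in test_this.py:

# Hypothetical standalone use of the URL loader (illustration only)
from rag_app.load_data_from_urls import load_docs_from_urls

docs = load_docs_from_urls(["https://docs.python.org/3/"], 2)
for doc in docs[:3]:
    print(doc.metadata.get("source"), len(doc.page_content))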
rag_app/load_vector_stores.py ADDED
@@ -0,0 +1,70 @@
+# preprocessed vectorstore retrieval
+import boto3
+from botocore import UNSIGNED
+from botocore.client import Config
+import zipfile
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+from dotenv import load_dotenv
+import os
+import sys
+import logging
+
+# Load environment variables from a .env file
+config = load_dotenv(".env")
+
+# Retrieve the Hugging Face API token from environment variables
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+S3_LOCATION = os.getenv("S3_LOCATION")
+FAISS_VS_NAME = os.getenv("FAISS_VS_NAME")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+CHROMA_DIRECTORY = os.getenv("CHROMA_DIRECTORY")
+CHROMA_VS_NAME = os.getenv("CHROMA_VS_NAME")
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
+
+model_name = EMBEDDING_MODEL
+#model_kwargs = {"device": "cuda"}
+
+embeddings = HuggingFaceEmbeddings(
+    model_name=model_name,
+    # model_kwargs=model_kwargs
+    )
+
+## FAISS
+def get_faiss_vs():
+    # Initialize an S3 client with unsigned configuration for public access
+    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+
+    # Define the destination for the downloaded file
+    VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
+    try:
+        # Download the pre-prepared vectorized index from the S3 bucket
+        print("Downloading the pre-prepared vectorized index from S3...")
+        s3.download_file(S3_LOCATION, FAISS_VS_NAME, VS_DESTINATION)
+
+        # Extract the downloaded zip file
+        with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+            zip_ref.extractall('./vectorstore/')
+        print("Download and extraction completed.")
+        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+
+    except Exception as e:
+        print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
+#faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+
+
+## Chroma DB
+def get_chroma_vs():
+    # Initialize an S3 client with unsigned configuration for public access
+    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+
+    VS_DESTINATION = CHROMA_DIRECTORY+".zip"
+    try:
+        s3.download_file(S3_LOCATION, CHROMA_VS_NAME, VS_DESTINATION)
+        with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+            zip_ref.extractall('./vectorstore/')
+        chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
+        chromadb.get()
+    except Exception as e:
+        print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
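A brief sketch of how the downloaded FAISS store could back a retriever, assuming the S3 and path variables from example.env are filled in and the download succeeds (the query string is invented):

# Hypothetical use of the FAISS loader (illustration only)
from rag_app.load_vector_stores import get_faiss_vs

faiss_db = get_faiss_vs()  # downloads, unzips and loads the index
retriever = faiss_db.as_retriever(search_kwargs={"k": 3})
print(retriever.get_relevant_documents("household insurance")[0].metadata)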
rag_app/react_agent.py ADDED
File without changes

rag_app/simple_qa_chain.py ADDED
File without changes
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 langchain
 langchain-community
-langchain-
+langchain-HuggingFace
+langchain-text-splitters
 beautifulsoup4
 faiss-cpu
 chromadb
test_this.py ADDED
@@ -0,0 +1,17 @@
+from rag_app.load_data_from_urls import load_docs_from_urls
+from rag_app.create_embedding import create_embeddings
+from rag_app.generate_summary import generate_description, generate_keywords
+from rag_app.handle_vector_store import build_vector_store
+
+docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],5)
+
+for doc in docs:
+    keywords=generate_keywords(doc)
+    description=generate_description(doc)
+    doc.metadata['keywords']=keywords
+    doc.metadata['description']=description
+
+build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
+
+
+#print(create_embeddings(docs))