Asaad Almutareb committed on
Commit f5d22a4
1 Parent(s): 5c11631

migrated notebook to python code

.gitignore CHANGED
@@ -158,3 +158,6 @@ cython_debug/
  # and can be added to the global gitignore or merged into this file. For a more nuclear
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
  #.idea/
+ *.zip
+ *.pkl
+ *.faiss
app_gui.py ADDED
@@ -0,0 +1,65 @@
+ # Import Gradio for UI, along with other necessary libraries
+ import gradio as gr
+ # need to import the qa!
+
+ # Function to add a new input to the chat history
+ def add_text(history, text):
+     # Append the new text to the history with a placeholder for the response
+     history = history + [(text, None)]
+     return history, ""
+
+ # Function representing the bot's response mechanism
+ def bot(history):
+     # Obtain the response from the 'infer' function using the latest input
+     response = infer(history[-1][0], history)
+     sources = [doc.metadata.get("source") for doc in response['source_documents']]
+     src_list = '\n'.join(sources)
+     print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
+
+
+     history[-1][1] = print_this #response['answer']
+     # Update the history with the bot's response
+     #history[-1][1] = response['result']
+     return history
+
+ # Function to infer the response using the RAG model
+ def infer(question, history):
+     # Use the question and history to query the RAG model
+     result = qa({"query": question, "history": history, "question": question})
+     return result
+
+ # CSS styling for the Gradio interface
+ css = """
+ #col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
+ """
+
+ # HTML content for the Gradio interface title
+ title = """
+ <div style="text-align:left;">
+ <p>Hello, I am BotTina 2.0, your intelligent AI assistant. I can help you explore Wuerttembergische Versicherungs products.<br />
+ </div>
+ """
+
+ # Building the Gradio interface
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.HTML(title) # Add the HTML title to the interface
+         chatbot = gr.Chatbot([], elem_id="chatbot",
+                              bubble_full_width=False,
+                              avatar_images=(None, "https://dacodi-production.s3.amazonaws.com/store/87bc00b6727589462954f2e3ff6f531c.png"),
+                              height=680,) # Initialize the chatbot component
+         clear = gr.Button("Clear") # Add a button to clear the chat
+
+         # Create a row for the question input
+         with gr.Row():
+             question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
+
+         # Define the action when the question is submitted
+         question.submit(add_text, [chatbot, question], [chatbot, question], queue=False).then(
+             bot, chatbot, chatbot
+         )
+         # Define the action for the clear button
+         clear.click(lambda: None, None, chatbot, queue=False)
+
+ # Launch the Gradio demo interface
+ demo.launch(share=False, debug=True)
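Note that app_gui.py queries a global `qa` chain that is never imported here (the `# need to import the qa!` comment flags this). A minimal sketch of how that chain could be supplied, mirroring the RetrievalQA setup in core-langchain-rag.py; the environment variable names come from this commit, but the wiring itself is an assumption, not part of the committed file:

import os
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain.chains import RetrievalQA

# Sketch only: rebuild the pieces core-langchain-rag.py defines at module level
embeddings = HuggingFaceEmbeddings(model_name=os.getenv("EMBEDDING_MODEL"))
db = FAISS.load_local(os.getenv("FAISS_INDEX_PATH"), embeddings)  # recent LangChain versions may also need allow_dangerous_deserialization=True
llm = HuggingFaceEndpoint(repo_id=os.getenv("LLM_MODEL"), temperature=0.1, max_new_tokens=1024)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=True,  # bot() reads response['source_documents']
)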
core-langchain-rag.py CHANGED
@@ -97,29 +97,31 @@ config = load_dotenv(".env")
  # Retrieve the Hugging Face API token from environment variables
  HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
  S3_LOCATION = os.getenv("S3_LOCATION")
+ S3_FILE_NAME = os.getenv("FAISS_VS_NAME")
+ FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")

- try:
-     # Initialize an S3 client with unsigned configuration for public access
-     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+ # try:
+ #     # Initialize an S3 client with unsigned configuration for public access
+ #     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

-     # Define the FAISS index path and the destination for the downloaded file
-     FAISS_INDEX_PATH = './vectorstore/lc-faiss-multi-mpnet-500-markdown'
-     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
+ #     # Define the FAISS index path and the destination for the downloaded file
+ #     #FAISS_INDEX_PATH = './vectorstore/lc-faiss-multi-mpnet-500-markdown'
+ #     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"

-     # Download the pre-prepared vectorized index from the S3 bucket
-     print("Downloading the pre-prepared vectorized index from S3...")
-     s3.download_file(S3_LOCATION, 'vectorstores/lc-faiss-multi-mpnet-500-markdown.zip', VS_DESTINATION)
+ #     # Download the pre-prepared vectorized index from the S3 bucket
+ #     print("Downloading the pre-prepared vectorized index from S3...")
+ #     s3.download_file(S3_LOCATION, S3_FILE_NAME, VS_DESTINATION)

-     # Extract the downloaded zip file
-     with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-         zip_ref.extractall('./vectorstore/')
-     print("Download and extraction completed.")
+ #     # Extract the downloaded zip file
+ #     with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+ #         zip_ref.extractall('./vectorstore/')
+ #     print("Download and extraction completed.")

- except Exception as e:
-     print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
+ # except Exception as e:
+ #     print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)

  # Define the model name for embeddings
- model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+ model_name = os.getenv("EMBEDDING_MODEL")

  try:
      # Initialize HuggingFace embeddings with the specified model
@@ -135,11 +137,13 @@ except Exception as e:
  from langchain_huggingface import HuggingFaceEndpoint

  # Initialize the vector store as a retriever for the RAG pipeline
- retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.25})
+ retriever = db.as_retriever() #search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.25})
+
+ llm_model = os.getenv("LLM_MODEL")

  try:
      # Load the model from the Hugging Face Hub
-     model_id = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+     model_id = HuggingFaceEndpoint(repo_id=llm_model,
          temperature=0.1, # Controls randomness in response generation (lower value means less random)
          max_new_tokens=1024, # Maximum number of new tokens to generate in responses
          repetition_penalty=1.2, # Penalty for repeating the same words (higher value increases penalty)
@@ -153,16 +157,19 @@

  # Importing necessary modules for retrieval-based question answering and prompt handling
  from langchain.chains import RetrievalQA
- from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain_core.prompts import PromptTemplate
  from langchain.memory import ConversationBufferMemory
+ from langchain_core.output_parsers import StrOutputParser

  # Declare a global variable 'qa' for the retrieval-based question answering system
  global qa

  # Define a prompt template for guiding the model's responses
  template = """
- You are the friendly documentation buddy Arti, if you don't know the answer say 'I don't know' and don't make things up.\
- Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the question :
+ You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
+ You help the user find the answers to all his questions and queries. Answer in short and simple terms and offer to explain the product and terms to the user.\
+ Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to help find the best product for the user:
  ------
  <ctx>
  {context}
@@ -200,10 +207,46 @@ qa = RetrievalQA.from_chain_type(
      }
  )

+ def generate_qa_retriever(history: dict, question: str, llm_model: HuggingFaceEndpoint = model_id) -> dict:
+     """ Generate a response to queries using the retriever """
+
+     # Define a prompt template for guiding the model's responses
+     template = """
+     You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
+     You help the user find the answers to all his questions. Answer in short and simple terms and offer to explain the product and terms to the user.\
+     Respond only using the provided context (delimited by <ctx></ctx>) and only in German or English, depending on the question's language.
+     Use the chat history (delimited by <hs></hs>) to help find the best product for the user:
+     ------
+     <ctx>
+     {context}
+     </ctx>
+     ------
+     <hs>
+     {history}
+     </hs>
+     ------
+     {question}
+     Answer:
+     """
+
+     # Create a PromptTemplate object with specified input variables and the defined template
+     prompt = PromptTemplate.from_template(
+         template=template, # The prompt template as defined above
+     )
+     prompt.format(context="context", history="history", question="question")
+     # Create a memory buffer to manage conversation history
+     memory = ConversationBufferMemory(
+         memory_key="history", # Key for storing the conversation history
+         input_key="question" # Key for the input question
+     )
+
+     llm_chain = prompt | llm_model
+     result = llm_chain.invoke({"context": retriever, "history": history, "question": question})
+     print(result)
+     return result
+
  # Import Gradio for UI, along with other necessary libraries
  import gradio as gr
- import random
- import time

  # Function to add a new input to the chat history
  def add_text(history, text):
@@ -220,6 +263,7 @@ def bot(history):
      print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list


+     #history[-1][1] = response #print_this #response['answer']
      history[-1][1] = print_this #response['answer']
      # Update the history with the bot's response
      #history[-1][1] = response['result']
@@ -228,7 +272,9 @@ def bot(history):
  # Function to infer the response using the RAG model
  def infer(question, history):
      # Use the question and history to query the RAG model
+     #result = generate_qa_retriever(history, question)
      result = qa({"query": question, "history": history, "question": question})
+     print(*result)
      return result

  # CSS styling for the Gradio interface
@@ -238,18 +284,20 @@ css = """

  # HTML content for the Gradio interface title
  title = """
- <div style="text-align: center;max-width: 700px;">
- <h1>Chat with your Documentation</h1>
- <p style="text-align: center;">Chat with LangChain Documentation, <br />
- You can ask questions about the LangChain docu ;)</p>
+ <div style="text-align:left;">
+ <p>Hello, I am BotTina 2.0, your intelligent AI assistant. I can help you explore Wuerttembergische Versicherungs products.<br />
  </div>
  """

  # Building the Gradio interface
- with gr.Blocks(css=css) as demo:
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      with gr.Column(elem_id="col-container"):
          gr.HTML(title) # Add the HTML title to the interface
-         chatbot = gr.Chatbot([], elem_id="chatbot") # Initialize the chatbot component
+         chatbot = gr.Chatbot([], elem_id="chatbot",
+                              label="BotTina 2.0",
+                              bubble_full_width=False,
+                              avatar_images=(None, "https://dacodi-production.s3.amazonaws.com/store/87bc00b6727589462954f2e3ff6f531c.png"),
+                              height=680,) # Initialize the chatbot component
          clear = gr.Button("Clear") # Add a button to clear the chat

          # Create a row for the question input
@@ -264,4 +312,4 @@ with gr.Blocks(css=css) as demo:
      clear.click(lambda: None, None, chatbot, queue=False)

  # Launch the Gradio demo interface
- demo.launch(share=False)
+ demo.launch(debug=True)
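A side note on the new `generate_qa_retriever` helper: it pipes the prompt straight into the LLM and passes the retriever object itself as `{context}`. The usual LCEL pattern first retrieves documents for the question and joins their text before filling the prompt; a sketch under that assumption, reusing the module-level `template`, `retriever` and `model_id` objects from this file (not part of the commit):

from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

def format_docs(docs):
    # Join the retrieved Documents into one context string
    return "\n\n".join(doc.page_content for doc in docs)

prompt = PromptTemplate.from_template(template)  # the module-level template defined above
rag_chain = (
    {
        "context": retriever | RunnableLambda(format_docs),
        "history": lambda _: "",  # or feed ConversationBufferMemory contents here
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_id
    | StrOutputParser()
)
answer = rag_chain.invoke("Welche Produkte bietet die Württembergische an?")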
example.env ADDED
@@ -0,0 +1,15 @@
+ # API Keys for services
+ HUGGINGFACEHUB_API_TOKEN=""
+ GOOGLE_CSE_ID=""
+ GOOGLE_API_KEY=""
+
+ # AWS S3 object storage
+ S3_LOCATION=""
+ S3_FILE_NAME=""
+
+ # Local vectorstore storage
+ FAISS_INDEX_PATH = ""
+
+ # llm and embedding models
+ embedding_model=""
+ llm_model=""
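One thing to watch: the code added in this commit looks configuration up under `EMBEDDING_MODEL`, `LLM_MODEL`, `FAISS_VS_NAME`, `FAISS_INDEX_PATH`, `CHROMA_VS_NAME` and `CHROMA_DIRECTORY` (see core-langchain-rag.py and rag_app/load_vector_stores.py), while this template defines `embedding_model`, `llm_model` and `S3_FILE_NAME`. A template aligned with the names the code actually reads might look like the sketch below (values are placeholders, not part of the commit):

HUGGINGFACEHUB_API_TOKEN=""
S3_LOCATION=""
FAISS_VS_NAME=""
FAISS_INDEX_PATH=""
CHROMA_VS_NAME=""
CHROMA_DIRECTORY=""
EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
LLM_MODEL=""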
rag-system-anatomy/build_vector_store.py DELETED
@@ -1,46 +0,0 @@
- # vectorization functions
- from langchain.vectorstores import FAISS
- from langchain.document_loaders import ReadTheDocsLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.embeddings import HuggingFaceEmbeddings
- from create_embedding import create_embeddings
- import time
-
- def build_vector_store(
-     docs: list,
-     db_path: str,
-     embedding_model: str,
-     new_db:bool=False,
-     chunk_size:int=500,
-     chunk_overlap:int=50,
- ):
-     """
-
-     """
-
-     if db_path is None:
-         FAISS_INDEX_PATH = "./vectorstore/py-faiss-multi-mpnet-500"
-     else:
-         FAISS_INDEX_PATH = db_path
-
-     embeddings,chunks = create_embeddings(docs, embedding_model, chunk_size, chunk_overlap)
-
-     #load chunks into vector store
-     print(f'Loading chunks into faiss vector store ...')
-     st = time.time()
-     if new_db:
-         db_faiss = FAISS.from_documents(chunks, embeddings)
-     else:
-         db_faiss = FAISS.add_documents(chunks, embeddings)
-     db_faiss.save_local(FAISS_INDEX_PATH)
-     et = time.time() - st
-     print(f'Time taken: {et} seconds.')
-
-     #print(f'Loading chunks into chroma vector store ...')
-     #st = time.time()
-     #persist_directory='./vectorstore/py-chroma-multi-mpnet-500'
-     #db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
-     #et = time.time() - st
-     #print(f'Time taken: {et} seconds.')
-     result = f"built vectore store at {FAISS_INDEX_PATH}"
-     return result
rag-system-anatomy/load_example_embeddings.py DELETED
@@ -1,37 +0,0 @@
- # preprocessed vectorstore retrieval
- import boto3
- from botocore import UNSIGNED
- from botocore.client import Config
- import zipfile
- from langchain.vectorstores import FAISS
- from langchain.vectorstores import Chroma
- from langchain.embeddings import HuggingFaceEmbeddings
-
- # access .env file
-
- s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
-
- model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
- #model_kwargs = {"device": "cuda"}
-
- embeddings = HuggingFaceEmbeddings(
-     model_name=model_name,
-     # model_kwargs=model_kwargs
- )
-
- ## FAISS
- FAISS_INDEX_PATH='./vectorstore/lc-faiss-multi-mpnet-500-markdown'
- VS_DESTINATION = FAISS_INDEX_PATH+".zip"
- s3.download_file('rad-rag-demos', 'vectorstores/lc-faiss-multi-mpnet-500-markdown.zip', VS_DESTINATION)
- with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-     zip_ref.extractall('./vectorstore/')
- faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
-
- ## Chroma DB
- chroma_directory="./vectorstore/lc-chroma-multi-mpnet-500-markdown"
- VS_DESTINATION = chroma_directory+".zip"
- s3.download_file('rad-rag-demos', 'vectorstores/lc-chroma-multi-mpnet-500-markdown.zip', VS_DESTINATION)
- with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-     zip_ref.extractall('./vectorstore/')
- chromadb = Chroma(persist_directory=chroma_directory, embedding_function=embeddings)
- chromadb.get()
{rag-system-anatomy → rag_app}/create_embedding.py RENAMED
@@ -1,16 +1,17 @@
  # embeddings functions
- from langchain.vectorstores import FAISS
- from langchain.document_loaders import ReadTheDocsLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.embeddings import HuggingFaceEmbeddings
+ #from langchain_community.vectorstores import FAISS
+ #from langchain_community.document_loaders import ReadTheDocsLoader
+ #from langchain_community.vectorstores.utils import filter_complex_metadata
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
  import time
  from langchain_core.documents import Document


  def create_embeddings(
      docs: list[Document],
-     chunk_size:int,
-     chunk_overlap:int,
+     chunk_size:int = 500,
+     chunk_overlap:int = 50,
      embedding_model: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
  ):
      """given a sequence of `Document` objects this function will
@@ -18,8 +19,8 @@ def create_embeddings(

      ## argument
      :params docs (list[Document]) -> list of `list[Document]`
-     :params chunk_size (int) -> chunk size in which documents are chunks
-     :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks
+     :params chunk_size (int) -> chunk size in which documents are chunked, defaults to 500
+     :params chunk_overlap (int) -> the amount of tokens that will be overlapped between chunks, defaults to 50
      :params embedding_model (str) -> the Hugging Face model that will embed the documents
      ## Return
      Tuple of embedding and chunks
@@ -35,14 +36,15 @@ def create_embeddings(

      # Stage one: read all the docs, split them into chunks.
      st = time.time()
-     print('Loading documents ...')
+     print('Loading documents and creating chunks ...')

+     # Split each document into chunks using the configured text splitter
      chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
      et = time.time() - st
-     print(f'Time taken: {et} seconds.')
+     print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')

      #Stage two: embed the docs.
      embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-     print(f"create a total of {len(chunks)}")
+     print(f"created a total of {len(chunks)} chunks")

      return embeddings,chunks
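With the new defaults, `create_embeddings` can be called with just the documents; a short usage sketch combining it with the renamed loader module (the URL and parameter values are illustrative only):

from rag_app.load_data_from_urls import load_docs_from_urls
from rag_app.create_embedding import create_embeddings

# Load pages recursively, then chunk them and prepare the embedding function
docs = load_docs_from_urls(["https://docs.python.org/3/"], 2)
embeddings, chunks = create_embeddings(docs, chunk_size=500, chunk_overlap=50)
print(f"{len(chunks)} chunks ready for indexing")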
rag_app/generate_summary.py ADDED
@@ -0,0 +1,69 @@
+ from langchain_huggingface import HuggingFaceEndpoint
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ import json
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+ HF_API_TOKEN = os.getenv('HUGGINGFACE_API_TOKEN')
+ model_id=os.getenv('LLM_MODEL')
+
+ LLM = HuggingFaceEndpoint(
+     repo_id=model_id,
+     temperature=0.1,
+     max_new_tokens=512,
+     repetition_penalty=1.2,
+     return_full_text=False,
+     huggingfacehub_api_token=HF_API_TOKEN)
+
+ def generate_keywords(document:dict,
+                       llm_model:HuggingFaceEndpoint = LLM) -> str:
+     """ Generate a meaningful list of meta keywords for the provided document or chunk"""
+
+     template = (
+         """
+         You are an SEO expert bot. Your task is to craft a meaningful list of 5 keywords to organize documents.
+         The keywords should help us in searching and retrieving the documents later.
+
+         You will only respond with the clear, concise and meaningful 5 keywords, separated by commas.
+
+         <<<
+         Document: {document}
+         >>>
+
+         Keywords:
+         """
+     )
+
+     prompt = PromptTemplate.from_template(template=template)
+
+     chain = prompt | llm_model | StrOutputParser()
+     result = chain.invoke({'document': document})
+     return result.strip()
+
+ def generate_description(document:dict,
+                          llm_model:HuggingFaceEndpoint = LLM) -> str:
+     """ Generate a meaningful document description based on document content """
+
+     template = (
+         """
+         You are an SEO expert bot. Your task is to craft a meaningful summary to describe and organize documents.
+         The description should be a meaningful summary of the document's content and help us in searching and retrieving the documents later.
+
+         You will only respond with the clear, concise and meaningful description.
+
+         <<<
+         Document: {document}
+         >>>
+
+         Description:
+         """
+     )
+
+     prompt = PromptTemplate.from_template(template=template)
+
+     chain = prompt | llm_model | StrOutputParser()
+     result = chain.invoke({'document': document})
+     return result.strip()
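Two small notes on generate_summary.py: it reads `HUGGINGFACE_API_TOKEN`, while example.env and the rest of the code define `HUGGINGFACEHUB_API_TOKEN`, and the two helpers are meant to be applied per document or chunk, as rag_app/handle_vector_store.py does. A usage sketch, assuming `chunks` comes from create_embeddings:

from rag_app.generate_summary import generate_keywords, generate_description

# Enrich each chunk's metadata so the keywords/description can aid later retrieval
for chunk in chunks:
    chunk.metadata['chunk_keywords'] = generate_keywords(chunk)
    chunk.metadata['chunk_description'] = generate_description(chunk)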
{rag-system-anatomy → rag_app}/get_db_retriever.py RENAMED
File without changes
rag_app/handle_vector_store.py ADDED
@@ -0,0 +1,82 @@
+ # vectorization functions
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.document_loaders import ReadTheDocsLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.retrievers import BM25Retriever
+ from rag_app.create_embedding import create_embeddings
+ from rag_app.generate_summary import generate_description, generate_keywords
+ import time
+ import os
+ from dotenv import load_dotenv
+
+ def build_vector_store(
+     docs: list,
+     db_path: str,
+     embedding_model: str,
+     new_db:bool=False,
+     chunk_size:int=500,
+     chunk_overlap:int=50,
+ ):
+     """
+
+     """
+
+     if db_path is None:
+         FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+     else:
+         FAISS_INDEX_PATH = db_path
+
+     embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
+     for chunk in chunks:
+         keywords=generate_keywords(chunk)
+         description=generate_description(chunk)
+         chunk.metadata['chunk_keywords']=keywords
+         chunk.metadata['chunk_description']=description
+
+     #load chunks into vector store
+     print(f'Loading chunks into faiss vector store ...')
+     st = time.time()
+     if new_db:
+         db_faiss = FAISS.from_documents(chunks, embeddings)
+         bm25_retriever = BM25Retriever.from_documents(chunks)
+     else:
+         db_faiss = FAISS.add_documents(chunks, embeddings)
+         bm25_retriever = BM25Retriever.add_documents(chunks)
+     db_faiss.save_local(FAISS_INDEX_PATH)
+     et = time.time() - st
+     print(f'Time taken: {et} seconds.')
+
+     print(f'Loading chunks into chroma vector store ...')
+     st = time.time()
+     persist_directory='./vectorstore/chroma-insurance-agent-1500'
+     db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
+     et = time.time() - st
+     print(f'Time taken: {et} seconds.')
+     result = f"built vector store at {FAISS_INDEX_PATH}"
+     return result
+
+
+ # # Path for saving the FAISS index
+ # FAISS_INDEX_PATH = "./vectorstore/lc-faiss-multi-mpnet-500"
+
+ # try:
+ #     # Stage two: Vectorization of the document chunks
+ #     model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # Model used for embedding
+
+ #     # Initialize HuggingFace embeddings with the specified model
+ #     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+
+ #     print(f'Loading chunks into vector store ...')
+ #     st = time.time() # Start time for performance measurement
+ #     # Create a FAISS vector store from the document chunks and save it locally
+ #     db = FAISS.from_documents(filter_complex_metadata(chunks), embeddings)
+ #     db.save_local(FAISS_INDEX_PATH)
+ #     et = time.time() - st # Calculate time taken for vectorization
+ #     print(f'Time taken for vectorization and saving: {et} seconds.')
+ # except Exception as e:
+ #     print(f"Error during vectorization or FAISS index saving: {e}", file=sys.stderr)
+
+ # alternatively download a prepared vectorized index from S3 and load the index into vectorstore
+ # Import necessary libraries for AWS S3 interaction, file handling, and FAISS vector stores
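In `build_vector_store`, the `new_db=False` branch calls `FAISS.add_documents` and `BM25Retriever.add_documents` on the classes rather than on instances; in LangChain these are instance methods, and BM25 has no incremental add. Extending an existing index would normally look more like the sketch below (an assumption, given an index at FAISS_INDEX_PATH built earlier with the same embedding model):

# Sketch of the "extend an existing index" branch, not the committed code
db_faiss = FAISS.load_local(FAISS_INDEX_PATH, embeddings)  # may need allow_dangerous_deserialization=True
db_faiss.add_documents(chunks)                              # instance method: embeds and appends the new chunks
db_faiss.save_local(FAISS_INDEX_PATH)
bm25_retriever = BM25Retriever.from_documents(chunks)       # BM25 index is rebuilt from the documents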
{rag-system-anatomy → rag_app}/load_data_from_urls.py RENAMED
@@ -1,8 +1,13 @@
  # documents loader function
- from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
+ from langchain_community.document_loaders import RecursiveUrlLoader
  from bs4 import BeautifulSoup as Soup
  from validators import url as url_validator
  from langchain_core.documents import Document
+ import time
+ import logging
+ import sys
+
+ logger = logging.getLogger(__name__)

  def load_docs_from_urls(
      urls: list = ["https://docs.python.org/3/"],
@@ -21,12 +26,25 @@ def load_docs_from_urls(
      ## Raises:
      ValueError: If any URL in the provided list is invalid.
      """
-
+     stf = time.time() # Start time for performance measurement
      docs = []
      for url in urls:
+         st = time.time() # Start time for outer performance measurement
          if not url_validator(url):
              raise ValueError(f"Invalid URL: {url}")
-         loader = RecursiveUrlLoader(url=url, max_depth=max_depth, extractor=lambda x: Soup(x, "html.parser").text)
-         docs.extend(loader.load())
-         print(f"loaded {len(docs)} pages")
+         try:
+             st = time.time() # Start time for inner performance measurement
+             loader = RecursiveUrlLoader(url=url, max_depth=max_depth, extractor=lambda x: Soup(x, "html.parser").text)
+             docs.extend(loader.load())
+
+             et = time.time() - st # Calculate time taken for downloading
+             logMessage=f'Time taken for downloading documents from {url}: {et} seconds.'
+             logger.info(logMessage)
+             print(logMessage)
+         except Exception as e:
+             logMessage=f"Failed to load or parse the URL {url}. Error: {e}"
+             logger.error(logMessage)
+             print(logMessage, file=sys.stderr)
+     etf = time.time() - stf # Calculate time taken for scraping all URLs
+     print(f'Total time taken for downloading {len(docs)} documents: {etf} seconds.')
      return docs
rag_app/load_vector_stores.py ADDED
@@ -0,0 +1,70 @@
+ # preprocessed vectorstore retrieval
+ import boto3
+ from botocore import UNSIGNED
+ from botocore.client import Config
+ import zipfile
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.vectorstores import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from dotenv import load_dotenv
+ import os
+ import sys
+ import logging
+
+ # Load environment variables from a .env file
+ config = load_dotenv(".env")
+
+ # Retrieve the Hugging Face API token from environment variables
+ HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+ S3_LOCATION = os.getenv("S3_LOCATION")
+ FAISS_VS_NAME = os.getenv("FAISS_VS_NAME")
+ FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+ CHROMA_DIRECTORY = os.getenv("CHROMA_DIRECTORY")
+ CHROMA_VS_NAME = os.getenv("CHROMA_VS_NAME")
+ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
+
+ model_name = EMBEDDING_MODEL
+ #model_kwargs = {"device": "cuda"}
+
+ embeddings = HuggingFaceEmbeddings(
+     model_name=model_name,
+     # model_kwargs=model_kwargs
+ )
+
+ ## FAISS
+ def get_faiss_vs():
+     # Initialize an S3 client with unsigned configuration for public access
+     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+
+     # Define the destination for the downloaded file
+     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
+     try:
+         # Download the pre-prepared vectorized index from the S3 bucket
+         print("Downloading the pre-prepared vectorized index from S3...")
+         s3.download_file(S3_LOCATION, FAISS_VS_NAME, VS_DESTINATION)
+
+         # Extract the downloaded zip file
+         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+             zip_ref.extractall('./vectorstore/')
+         print("Download and extraction completed.")
+         return FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+
+     except Exception as e:
+         print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
+     #faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+
+
+ ## Chroma DB
+ def get_chroma_vs():
+     # Initialize an S3 client with unsigned configuration for public access
+     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+
+     VS_DESTINATION = CHROMA_DIRECTORY+".zip"
+     try:
+         s3.download_file(S3_LOCATION, CHROMA_VS_NAME, VS_DESTINATION)
+         with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+             zip_ref.extractall('./vectorstore/')
+         chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
+         chromadb.get()
+     except Exception as e:
+         print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
rag_app/react_agent.py ADDED
File without changes
rag_app/simple_qa_chain.py ADDED
File without changes
requirements.txt CHANGED
@@ -1,6 +1,7 @@
  langchain
  langchain-community
- langchain-huggingface
+ langchain-HuggingFace
+ langchain-text-splitters
  beautifulsoup4
  faiss-cpu
  chromadb
test_this.py ADDED
@@ -0,0 +1,17 @@
+ from rag_app.load_data_from_urls import load_docs_from_urls
+ from rag_app.create_embedding import create_embeddings
+ from rag_app.generate_summary import generate_description, generate_keywords
+ from rag_app.handle_vector_store import build_vector_store
+
+ docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],5)
+
+ for doc in docs:
+     keywords=generate_keywords(doc)
+     description=generate_description(doc)
+     doc.metadata['keywords']=keywords
+     doc.metadata['description']=description
+
+ build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
+
+
+ #print(create_embeddings(docs))