Commit f5d22a4
Asaad Almutareb committed
Parent(s): 5c11631

migrated notebook to python code
Files changed:
- .gitignore +3 -0
- app_gui.py +65 -0
- core-langchain-rag.py +78 -30
- example.env +15 -0
- rag-system-anatomy/build_vector_store.py +0 -46
- rag-system-anatomy/load_example_embeddings.py +0 -37
- {rag-system-anatomy → rag_app}/create_embedding.py +13 -11
- rag_app/generate_summary.py +69 -0
- {rag-system-anatomy → rag_app}/get_db_retriever.py +0 -0
- rag_app/handle_vector_store.py +82 -0
- {rag-system-anatomy → rag_app}/load_data_from_urls.py +23 -5
- rag_app/load_vector_stores.py +70 -0
- rag_app/react_agent.py +0 -0
- rag_app/simple_qa_chain.py +0 -0
- requirements.txt +2 -1
- test_this.py +17 -0
.gitignore CHANGED
@@ -158,3 +158,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*.zip
+*.pkl
+*.faiss
app_gui.py ADDED
@@ -0,0 +1,65 @@
+# Import Gradio for UI, along with other necessary libraries
+import gradio as gr
+# need to import the qa!
+
+# Function to add a new input to the chat history
+def add_text(history, text):
+    # Append the new text to the history with a placeholder for the response
+    history = history + [(text, None)]
+    return history, ""
+
+# Function representing the bot's response mechanism
+def bot(history):
+    # Obtain the response from the 'infer' function using the latest input
+    response = infer(history[-1][0], history)
+    sources = [doc.metadata.get("source") for doc in response['source_documents']]
+    src_list = '\n'.join(sources)
+    print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
+
+
+    history[-1][1] = print_this #response['answer']
+    # Update the history with the bot's response
+    #history[-1][1] = response['result']
+    return history
+
+# Function to infer the response using the RAG model
+def infer(question, history):
+    # Use the question and history to query the RAG model
+    result = qa({"query": question, "history": history, "question": question})
+    return result
+
+# CSS styling for the Gradio interface
+css = """
+#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
+"""
+
+# HTML content for the Gradio interface title
+title = """
+<div style="text-align:left;">
+<p>Hello, I BotTina 2.0, your intelligent AI assistant. I can help you explore Wuerttembergische Versicherungs products.<br />
+</div>
+"""
+
+# Building the Gradio interface
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    with gr.Column(elem_id="col-container"):
+        gr.HTML(title)  # Add the HTML title to the interface
+        chatbot = gr.Chatbot([], elem_id="chatbot",
+                             bubble_full_width=False,
+                             avatar_images=(None, "https://dacodi-production.s3.amazonaws.com/store/87bc00b6727589462954f2e3ff6f531c.png"),
+                             height=680,)  # Initialize the chatbot component
+        clear = gr.Button("Clear")  # Add a button to clear the chat
+
+        # Create a row for the question input
+        with gr.Row():
+            question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
+
+        # Define the action when the question is submitted
+        question.submit(add_text, [chatbot, question], [chatbot, question], queue=False).then(
+            bot, chatbot, chatbot
+        )
+        # Define the action for the clear button
+        clear.click(lambda: None, None, chatbot, queue=False)
+
+# Launch the Gradio demo interface
+demo.launch(share=False, debug=True)
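Note that `qa` is used in `infer` but never defined or imported in this file (the `# need to import the qa!` comment acknowledges this). A minimal sketch of one way it could be wired up, assuming the same environment variables that core-langchain-rag.py reads (EMBEDDING_MODEL, FAISS_INDEX_PATH, LLM_MODEL) and a FAISS index that already exists locally; this is not part of the commit:

# Hypothetical wiring for the missing `qa` object (illustration only)
import os
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain.chains import RetrievalQA

embeddings = HuggingFaceEmbeddings(model_name=os.getenv("EMBEDDING_MODEL"))
db = FAISS.load_local(os.getenv("FAISS_INDEX_PATH"), embeddings)
llm = HuggingFaceEndpoint(repo_id=os.getenv("LLM_MODEL"), temperature=0.1, max_new_tokens=1024)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=True,  # bot() reads response['source_documents']
)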
core-langchain-rag.py CHANGED
@@ -97,29 +97,31 @@ config = load_dotenv(".env")
 # Retrieve the Hugging Face API token from environment variables
 HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 S3_LOCATION = os.getenv("S3_LOCATION")
+S3_FILE_NAME = os.getenv("FAISS_VS_NAME")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
 
-try:
-
-
+# try:
+#     # Initialize an S3 client with unsigned configuration for public access
+#     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
 
-
-
-
+#     # Define the FAISS index path and the destination for the downloaded file
+#     #FAISS_INDEX_PATH = './vectorstore/lc-faiss-multi-mpnet-500-markdown'
+#     VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
 
-
-
-
+#     # Download the pre-prepared vectorized index from the S3 bucket
+#     print("Downloading the pre-prepared vectorized index from S3...")
+#     s3.download_file(S3_LOCATION, S3_FILE_NAME, VS_DESTINATION)
 
-
-
-
-
+#     # Extract the downloaded zip file
+#     with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+#         zip_ref.extractall('./vectorstore/')
+#     print("Download and extraction completed.")
 
-except Exception as e:
-
+# except Exception as e:
+#     print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
 
 # Define the model name for embeddings
-model_name = "
+model_name = os.getenv("EMBEDDING_MODEL")
 
 try:
     # Initialize HuggingFace embeddings with the specified model
@@ -135,11 +137,13 @@ except Exception as e:
 from langchain_huggingface import HuggingFaceEndpoint
 
 # Initialize the vector store as a retriever for the RAG pipeline
-retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.25})
+retriever = db.as_retriever()#search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.25})
+
+llm_model = os.getenv("LLM_MODEL")
 
 try:
     # Load the model from the Hugging Face Hub
-    model_id = HuggingFaceEndpoint(repo_id=
+    model_id = HuggingFaceEndpoint(repo_id=llm_model,
         temperature=0.1,  # Controls randomness in response generation (lower value means less random)
         max_new_tokens=1024,  # Maximum number of new tokens to generate in responses
         repetition_penalty=1.2,  # Penalty for repeating the same words (higher value increases penalty)
@@ -153,16 +157,19 @@ except Exception as e:
 
 # Importing necessary modules for retrieval-based question answering and prompt handling
 from langchain.chains import RetrievalQA
-from langchain.
+from langchain.chains import LLMChain
+from langchain_core.prompts import PromptTemplate
 from langchain.memory import ConversationBufferMemory
+from langchain_core.output_parsers import StrOutputParser
 
 # Declare a global variable 'qa' for the retrieval-based question answering system
 global qa
 
 # Define a prompt template for guiding the model's responses
 template = """
-You are
-
+You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
+You help the user find the answers to all his questions queries. Answer in short and simple terms and offer to explain the product and terms to the user.\
+Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to help find the best product for the user:
 ------
 <ctx>
 {context}
@@ -200,10 +207,46 @@ qa = RetrievalQA.from_chain_type(
     }
 )
 
+def generate_qa_retriever(history: dict, question: str, llm_model:HuggingFaceEndpoint = model_id) -> dict:
+    """ Generare a response to queries using the retriever"""
+
+    # Define a prompt template for guiding the model's responses
+    template = """
+    You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
+    You help the user find the answers to all his questions. Answer in short and simple terms and offer to explain the product and terms to the user.\
+    Respond only using the provided context (delimited by <ctx></ctx>) and only in German or Englisch, depending on the question's language.
+    Use the chat history (delimited by <hs></hs>) to help find the best product for the user:
+    ------
+    <ctx>
+    {context}
+    </ctx>
+    ------
+    <hs>
+    {history}
+    </hs>
+    ------
+    {question}
+    Answer:
+    """
+
+    # Create a PromptTemplate object with specified input variables and the defined template
+    prompt = PromptTemplate.from_template(
+        template=template,  # The prompt template as defined above
+    )
+    prompt.format(context="context", history="history", question="question")
+    # Create a memory buffer to manage conversation history
+    memory = ConversationBufferMemory(
+        memory_key="history",  # Key for storing the conversation history
+        input_key="question"  # Key for the input question
+    )
+
+    llm_chain = prompt | llm_model
+    result = llm_chain.invoke({"context": retriever, "history": history, "question": question})
+    print(result)
+    return result
+
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
-import random
-import time
 
 # Function to add a new input to the chat history
 def add_text(history, text):
@@ -220,6 +263,7 @@ def bot(history):
     print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
 
 
+    #history[-1][1] = response #print_this #response['answer']
     history[-1][1] = print_this #response['answer']
     # Update the history with the bot's response
     #history[-1][1] = response['result']
@@ -228,7 +272,9 @@ def bot(history):
 # Function to infer the response using the RAG model
 def infer(question, history):
     # Use the question and history to query the RAG model
+    #result = generate_qa_retriever(history, question)
     result = qa({"query": question, "history": history, "question": question})
+    print(*result)
     return result
 
 # CSS styling for the Gradio interface
@@ -238,18 +284,20 @@ css = """
 
 # HTML content for the Gradio interface title
 title = """
-<div style="text-align:
-<
-<p style="text-align: center;">Chat with LangChain Documentation, <br />
-You can ask questions about the LangChain docu ;)</p>
+<div style="text-align:left;">
+<p>Hello, I BotTina 2.0, your intelligent AI assistant. I can help you explore Wuerttembergische Versicherungs products.<br />
 </div>
 """
 
 # Building the Gradio interface
-with gr.Blocks(
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML(title)  # Add the HTML title to the interface
-        chatbot = gr.Chatbot([], elem_id="chatbot"
+        chatbot = gr.Chatbot([], elem_id="chatbot",
+                             label="BotTina 2.0",
+                             bubble_full_width=False,
+                             avatar_images=(None, "https://dacodi-production.s3.amazonaws.com/store/87bc00b6727589462954f2e3ff6f531c.png"),
+                             height=680,)  # Initialize the chatbot component
         clear = gr.Button("Clear")  # Add a button to clear the chat
 
         # Create a row for the question input
@@ -264,4 +312,4 @@ with gr.Blocks(css=css) as demo:
         clear.click(lambda: None, None, chatbot, queue=False)
 
 # Launch the Gradio demo interface
-demo.launch(
+demo.launch(debug=True)
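The Gradio callbacks in this script still route every question through the global `qa` chain. A minimal smoke test of that chain outside the UI, assuming the script above has already been executed so that `qa` is in scope and the environment variables it reads are set (the sample question is invented):

# Hypothetical smoke test of the qa chain defined above (illustration only)
query = "Which household insurance products do you offer?"
response = qa({"query": query, "history": [], "question": query})
print(response["result"])
for doc in response["source_documents"]:
    print(doc.metadata.get("source"))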
example.env ADDED
@@ -0,0 +1,15 @@
+# API Keys for services
+HUGGINGFACEHUB_API_TOKEN=""
+GOOGLE_CSE_ID=""
+GOOGLE_API_KEY=""
+
+# AWS S3 object storage
+S3_LOCATION=""
+S3_FILE_NAME=""
+
+# Local vectorstore storage
+FAISS_INDEX_PATH = ""
+
+# llm and embedding models
+embedding_model=""
+llm_model=""
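These settings are read at runtime with python-dotenv. A minimal sketch of how the rest of the code consumes them; note that core-langchain-rag.py and rag_app/load_vector_stores.py query upper-case names such as EMBEDDING_MODEL, LLM_MODEL and FAISS_VS_NAME, so the keys in a local .env have to match whatever names the code actually looks up:

# Minimal sketch, assuming a .env file based on example.env sits next to the code
import os
from dotenv import load_dotenv

load_dotenv()  # copies the key=value pairs from .env into the process environment
embedding_model = os.getenv("EMBEDDING_MODEL")
llm_model = os.getenv("LLM_MODEL")
faiss_index_path = os.getenv("FAISS_INDEX_PATH")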
rag-system-anatomy/build_vector_store.py DELETED
@@ -1,46 +0,0 @@
-# vectorization functions
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import ReadTheDocsLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
-from create_embedding import create_embeddings
-import time
-
-def build_vector_store(
-    docs: list,
-    db_path: str,
-    embedding_model: str,
-    new_db:bool=False,
-    chunk_size:int=500,
-    chunk_overlap:int=50,
-    ):
-    """
-
-    """
-
-    if db_path is None:
-        FAISS_INDEX_PATH = "./vectorstore/py-faiss-multi-mpnet-500"
-    else:
-        FAISS_INDEX_PATH = db_path
-
-    embeddings,chunks = create_embeddings(docs, embedding_model, chunk_size, chunk_overlap)
-
-    #load chunks into vector store
-    print(f'Loading chunks into faiss vector store ...')
-    st = time.time()
-    if new_db:
-        db_faiss = FAISS.from_documents(chunks, embeddings)
-    else:
-        db_faiss = FAISS.add_documents(chunks, embeddings)
-    db_faiss.save_local(FAISS_INDEX_PATH)
-    et = time.time() - st
-    print(f'Time taken: {et} seconds.')
-
-    #print(f'Loading chunks into chroma vector store ...')
-    #st = time.time()
-    #persist_directory='./vectorstore/py-chroma-multi-mpnet-500'
-    #db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
-    #et = time.time() - st
-    #print(f'Time taken: {et} seconds.')
-    result = f"built vectore store at {FAISS_INDEX_PATH}"
-    return result
rag-system-anatomy/load_example_embeddings.py DELETED
@@ -1,37 +0,0 @@
-# preprocessed vectorstore retrieval
-import boto3
-from botocore import UNSIGNED
-from botocore.client import Config
-import zipfile
-from langchain.vectorstores import FAISS
-from langchain.vectorstores import Chroma
-from langchain.embeddings import HuggingFaceEmbeddings
-
-# access .env file
-
-s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
-
-model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-#model_kwargs = {"device": "cuda"}
-
-embeddings = HuggingFaceEmbeddings(
-    model_name=model_name,
-    # model_kwargs=model_kwargs
-    )
-
-## FAISS
-FAISS_INDEX_PATH='./vectorstore/lc-faiss-multi-mpnet-500-markdown'
-VS_DESTINATION = FAISS_INDEX_PATH+".zip"
-s3.download_file('rad-rag-demos', 'vectorstores/lc-faiss-multi-mpnet-500-markdown.zip', VS_DESTINATION)
-with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-    zip_ref.extractall('./vectorstore/')
-faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
-
-## Chroma DB
-chroma_directory="./vectorstore/lc-chroma-multi-mpnet-500-markdown"
-VS_DESTINATION = chroma_directory+".zip"
-s3.download_file('rad-rag-demos', 'vectorstores/lc-chroma-multi-mpnet-500-markdown.zip', VS_DESTINATION)
-with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
-    zip_ref.extractall('./vectorstore/')
-chromadb = Chroma(persist_directory=chroma_directory, embedding_function=embeddings)
-chromadb.get()
{rag-system-anatomy → rag_app}/create_embedding.py RENAMED
@@ -1,16 +1,17 @@
 # embeddings functions
-from
-from
-from
-from
+#from langchain_community.vectorstores import FAISS
+#from langchain_community.document_loaders import ReadTheDocsLoader
+#from langchain_community.vectorstores.utils import filter_complex_metadata
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
 import time
 from langchain_core.documents import Document
 
 
 def create_embeddings(
     docs: list[Document],
-    chunk_size:int,
-    chunk_overlap:int,
+    chunk_size:int = 500,
+    chunk_overlap:int = 50,
     embedding_model: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
     ):
     """given a sequence of `Document` objects this fucntion will
@@ -18,8 +19,8 @@ def create_embeddings(
 
     ## argument
     :params docs (list[Document]) -> list of `list[Document]`
-    :params chunk_size (int) -> chunk size in which documents are chunks
-    :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks
+    :params chunk_size (int) -> chunk size in which documents are chunks, defaults to 500
+    :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks, defaults to 50
     :params embedding_model (str) -> the huggingspace model that will embed the documents
     ## Return
     Tuple of embedding and chunks
@@ -35,14 +36,15 @@ def create_embeddings(
 
     # Stage one: read all the docs, split them into chunks.
     st = time.time()
-    print('Loading documents ...')
+    print('Loading documents and creating chunks ...')
 
+    # Split each document into chunks using the configured text splitter
     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
     et = time.time() - st
-    print(f'Time taken: {et} seconds.')
+    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
     embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-    print(f"
+    print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
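A brief usage sketch of the renamed helper with its new default chunking parameters; the sample document and its content are invented for illustration and are not part of the commit:

# Hypothetical usage of create_embeddings (illustration only)
from langchain_core.documents import Document
from rag_app.create_embedding import create_embeddings

docs = [Document(page_content="Household insurance covers fire and water damage.", metadata={"source": "example"})]
embeddings, chunks = create_embeddings(docs)  # chunk_size=500, chunk_overlap=50 by default
print(f"{len(chunks)} chunks ready for indexing")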
rag_app/generate_summary.py ADDED
@@ -0,0 +1,69 @@
+from langchain_huggingface import HuggingFaceEndpoint
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import json
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+HF_API_TOKEN = os.getenv('HUGGINGFACE_API_TOKEN')
+model_id=os.getenv('LLM_MODEL')
+
+LLM = HuggingFaceEndpoint(
+    repo_id=model_id,
+    temperature=0.1,
+    max_new_tokens=512,
+    repetition_penalty=1.2,
+    return_full_text=False,
+    huggingfacehub_api_token=HF_API_TOKEN)
+
+def generate_keywords(document:dict,
+                      llm_model:HuggingFaceEndpoint = LLM) -> str:
+    """ Generate a meaningful list of meta keywords for the provided document or chunk"""
+
+    template = (
+        """
+        You are a SEO expert bot. Your task is to craft a meaningful list of 5 keywords to organize documents.
+        The keywords should help us in searching and retrieving the documents later.
+
+        You will only respond with the clear, concise and meaningful 5 of keywords separated by comma.
+
+        <<<
+        Document: {document}
+        >>>
+
+        Keywords:
+        """
+    )
+
+    prompt = PromptTemplate.from_template(template=template)
+
+    chain = prompt | llm_model | StrOutputParser()
+    result = chain.invoke({'document': document})
+    return result.strip()
+
+def generate_description(document:dict,
+                         llm_model:HuggingFaceEndpoint = LLM) -> str:
+    """ Generate a meaningful document description based on document content """
+
+    template = (
+        """
+        You are a SEO expert bot. Your task is to craft a meaningful summary to descripe and organize documents.
+        The description should be a meaningful summary of the document's content and help us in searching and retrieving the documents later.
+
+        You will only respond with the clear, concise and meaningful description.
+
+        <<<
+        Document: {document}
+        >>>
+
+        Description:
+        """
+    )
+
+    prompt = PromptTemplate.from_template(template=template)
+
+    chain = prompt | llm_model | StrOutputParser()
+    result = chain.invoke({'document': document})
+    return result.strip()
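These helpers are used by handle_vector_store.py and test_this.py to enrich document metadata. A short usage sketch, assuming the environment variables above are set; the sample content is invented:

# Hypothetical usage of the summary helpers (illustration only)
from langchain_core.documents import Document
from rag_app.generate_summary import generate_keywords, generate_description

doc = Document(page_content="Our household insurance covers fire, water and storm damage.")
doc.metadata["keywords"] = generate_keywords(doc)
doc.metadata["description"] = generate_description(doc)
print(doc.metadata)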
{rag-system-anatomy → rag_app}/get_db_retriever.py RENAMED
File without changes
rag_app/handle_vector_store.py ADDED
@@ -0,0 +1,82 @@
+# vectorization functions
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import ReadTheDocsLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.retrievers import BM25Retriever
+from rag_app.create_embedding import create_embeddings
+from rag_app.generate_summary import generate_description, generate_keywords
+import time
+import os
+from dotenv import load_dotenv
+
+def build_vector_store(
+    docs: list,
+    db_path: str,
+    embedding_model: str,
+    new_db:bool=False,
+    chunk_size:int=500,
+    chunk_overlap:int=50,
+    ):
+    """
+
+    """
+
+    if db_path is None:
+        FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+    else:
+        FAISS_INDEX_PATH = db_path
+
+    embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
+    for chunk in chunks:
+        keywords=generate_keywords(chunk)
+        description=generate_description(chunk)
+        chunk.metadata['chunk_keywords']=keywords
+        chunk.metadata['chunk_description']=description
+
+    #load chunks into vector store
+    print(f'Loading chunks into faiss vector store ...')
+    st = time.time()
+    if new_db:
+        db_faiss = FAISS.from_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.from_documents(chunks)
+    else:
+        db_faiss = FAISS.add_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.add_documents(chunks)
+    db_faiss.save_local(FAISS_INDEX_PATH)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+
+    print(f'Loading chunks into chroma vector store ...')
+    st = time.time()
+    persist_directory='./vectorstore/chroma-insurance-agent-1500'
+    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+    result = f"built vectore store at {FAISS_INDEX_PATH}"
+    return result
+
+
+# # Path for saving the FAISS index
+# FAISS_INDEX_PATH = "./vectorstore/lc-faiss-multi-mpnet-500"
+
+# try:
+#     # Stage two: Vectorization of the document chunks
+#     model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # Model used for embedding
+
+#     # Initialize HuggingFace embeddings with the specified model
+#     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+
+#     print(f'Loading chunks into vector store ...')
+#     st = time.time() # Start time for performance measurement
+#     # Create a FAISS vector store from the document chunks and save it locally
+#     db = FAISS.from_documents(filter_complex_metadata(chunks), embeddings)
+#     db.save_local(FAISS_INDEX_PATH)
+#     et = time.time() - st # Calculate time taken for vectorization
+#     print(f'Time taken for vectorization and saving: {et} seconds.')
+# except Exception as e:
+#     print(f"Error during vectorization or FAISS index saving: {e}", file=sys.stderr)
+
+# alternatively download a preparaed vectorized index from S3 and load the index into vectorstore
+# Import necessary libraries for AWS S3 interaction, file handling, and FAISS vector stores
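A short end-to-end sketch of how this builder is driven; it mirrors test_this.py below, and the URL, output path and chunk sizes are just examples rather than anything the commit prescribes:

# Hypothetical end-to-end build (illustration only)
from rag_app.load_data_from_urls import load_docs_from_urls
from rag_app.handle_vector_store import build_vector_store

docs = load_docs_from_urls(["https://docs.python.org/3/"], 2)
build_vector_store(docs, "./vectorstore/faiss-example", "sentence-transformers/multi-qa-mpnet-base-dot-v1", True, 500, 50)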
{rag-system-anatomy → rag_app}/load_data_from_urls.py RENAMED
@@ -1,8 +1,13 @@
 # documents loader function
-from
+from langchain_community.document_loaders import RecursiveUrlLoader
 from bs4 import BeautifulSoup as Soup
 from validators import url as url_validator
 from langchain_core.documents import Document
+import time
+import logging
+import sys
+
+logger = logging.getLogger(__name__)
 
 def load_docs_from_urls(
     urls: list = ["https://docs.python.org/3/"],
@@ -21,12 +26,25 @@ def load_docs_from_urls(
     ## Raises:
     ValueError: If any URL in the provided list is invalid.
     """
-
+    stf = time.time() # Start time for performance measurement
     docs = []
     for url in urls:
+        st = time.time() # Start time for outer performance measurement
         if not url_validator(url):
             raise ValueError(f"Invalid URL: {url}")
-
-
-
+        try:
+            st = time.time() # Start time for inner performance measurement
+            loader = RecursiveUrlLoader(url=url, max_depth=max_depth, extractor=lambda x: Soup(x, "html.parser").text)
+            docs.extend(loader.load())
+
+            et = time.time() - st # Calculate time taken for splitting
+            logMessage=f'Time taken for downloading documents from {url}: {et} seconds.'
+            logger.info(logMessage)
+            print(logMessage)
+        except Exception as e:
+            logMessage=f"Failed to load or parse the URL {url}. Error: {e}"
+            logger.error(logMessage)
+            print(logMessage, file=sys.stderr)
+    etf = time.time() - stf # Calculate time taken for scrapping all URLs
+    print(f'Total time taken for downloading {len(docs)} documents: {etf} seconds.')
     return docs
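A quick standalone usage sketch of the loader; the URL and depth are arbitrary, and the second positional argument is the crawl depth, as in the call in test_this.py:

# Hypothetical standalone use of the URL loader (illustration only)
from rag_app.load_data_from_urls import load_docs_from_urls

docs = load_docs_from_urls(["https://docs.python.org/3/"], 2)
for doc in docs[:3]:
    print(doc.metadata.get("source"), len(doc.page_content))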
rag_app/load_vector_stores.py ADDED
@@ -0,0 +1,70 @@
+# preprocessed vectorstore retrieval
+import boto3
+from botocore import UNSIGNED
+from botocore.client import Config
+import zipfile
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+from dotenv import load_dotenv
+import os
+import sys
+import logging
+
+# Load environment variables from a .env file
+config = load_dotenv(".env")
+
+# Retrieve the Hugging Face API token from environment variables
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+S3_LOCATION = os.getenv("S3_LOCATION")
+FAISS_VS_NAME = os.getenv("FAISS_VS_NAME")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+CHROMA_DIRECTORY = os.getenv("CHROMA_DIRECTORY")
+CHROMA_VS_NAME = os.getenv("CHROMA_VS_NAME")
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
+
+model_name = EMBEDDING_MODEL
+#model_kwargs = {"device": "cuda"}
+
+embeddings = HuggingFaceEmbeddings(
+    model_name=model_name,
+    # model_kwargs=model_kwargs
+    )
+
+## FAISS
+def get_faiss_vs():
+    # Initialize an S3 client with unsigned configuration for public access
+    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+
+    # Define the destination for the downloaded file
+    VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
+    try:
+        # Download the pre-prepared vectorized index from the S3 bucket
+        print("Downloading the pre-prepared vectorized index from S3...")
+        s3.download_file(S3_LOCATION, FAISS_VS_NAME, VS_DESTINATION)
+
+        # Extract the downloaded zip file
+        with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+            zip_ref.extractall('./vectorstore/')
+        print("Download and extraction completed.")
+        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+
+    except Exception as e:
+        print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
+#faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
+
+
+## Chroma DB
+def get_chroma_vs():
+    # Initialize an S3 client with unsigned configuration for public access
+    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
+
+    VS_DESTINATION = CHROMA_DIRECTORY+".zip"
+    try:
+        s3.download_file(S3_LOCATION, CHROMA_VS_NAME, VS_DESTINATION)
+        with zipfile.ZipFile(VS_DESTINATION, 'r') as zip_ref:
+            zip_ref.extractall('./vectorstore/')
+        chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
+        chromadb.get()
+    except Exception as e:
+        print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
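A brief sketch of how the downloaded FAISS store could back a retriever, assuming the S3 and path variables from example.env are filled in and the download succeeds (the query string is invented):

# Hypothetical use of the FAISS loader (illustration only)
from rag_app.load_vector_stores import get_faiss_vs

faiss_db = get_faiss_vs()  # downloads, unzips and loads the index
retriever = faiss_db.as_retriever(search_kwargs={"k": 3})
print(retriever.get_relevant_documents("household insurance")[0].metadata)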
rag_app/react_agent.py ADDED
File without changes

rag_app/simple_qa_chain.py ADDED
File without changes
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 langchain
 langchain-community
-langchain-
+langchain-HuggingFace
+langchain-text-splitters
 beautifulsoup4
 faiss-cpu
 chromadb
test_this.py ADDED
@@ -0,0 +1,17 @@
+from rag_app.load_data_from_urls import load_docs_from_urls
+from rag_app.create_embedding import create_embeddings
+from rag_app.generate_summary import generate_description, generate_keywords
+from rag_app.handle_vector_store import build_vector_store
+
+docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],5)
+
+for doc in docs:
+    keywords=generate_keywords(doc)
+    description=generate_description(doc)
+    doc.metadata['keywords']=keywords
+    doc.metadata['description']=description
+
+build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
+
+
+#print(create_embeddings(docs))