Removed committed files, added git copies, modified poetry
- Dockerfile +11 -3
- config/config.json +0 -44
- config/index_data.json +0 -13
- data/AMS/AMS_1996.pdf +0 -3
- data/AMS/AMS_1997.pdf +0 -3
- data/AMS/AMS_1998.pdf +0 -3
- data/AMS/AMS_1999.pdf +0 -3
- data/AMS/AMS_2000.pdf +0 -3
- data/AMS/AMS_2001.pdf +0 -3
- data/AMS/AMS_2002.pdf +0 -3
- data/AMS/AMS_2004.pdf +0 -3
- data/AMS/AMS_2006.pdf +0 -3
- data/AMS/AMS_2008.pdf +0 -3
- data/AMS/AMS_2010.pdf +0 -3
- data/AMS/AMS_2012.pdf +0 -3
- data/AMS/AMS_2014.pdf +0 -3
- data/AMS/AMS_2016.pdf +0 -3
- data/AMS/AMS_2018.pdf +0 -3
- data/AMS/AMS_2020.pdf +0 -3
- data/AMS/AMS_2022.pdf +0 -3
- data/AMS/README.txt +0 -18
- data/AMS/ams_data-400-0-50.json +0 -0
- data/AMS/ams_data-400-0-all.json +0 -3
- data/AMS/ams_data-400-0.jsonl +0 -3
- data/AMS/ams_data-5000-0.jsonl +0 -3
- poetry.lock +0 -0
- pyproject.toml +3 -3
- scripts/Start.py +0 -41
- scripts/data_import.py +0 -278
- scripts/pages/1_Chatbot_AMS_Modular.py +0 -160
- scripts/pages/2_Document_Upload.py +0 -112
- scripts/pages/3_Visualize_Data.py +0 -123
- scripts/pages/4_Clean_and_Question.py +0 -86
- scripts/prompts.py +0 -12
- scripts/queries.py +0 -278
- scripts/setup.py +0 -168
Dockerfile
CHANGED
@@ -6,6 +6,11 @@ FROM python:3.11.5-bookworm
 RUN useradd -m -u 1000 user
 USER user
 
+# Clone aerospace-chatbot github repository
+RUN apt-get update && apt-get install -y git
+WORKDIR /app
+RUN git clone -b rag_study https://github.com/dan-s-mueller/aerospace_chatbot.git .
+
 # Set home to the user's home directory
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH
@@ -18,7 +23,7 @@ WORKDIR $HOME
 RUN pip3 install poetry==1.7.1
 
 # Copy poetry files
-COPY --chown=user pyproject.toml poetry.lock* $HOME
+COPY --chown=user /app/aerospace_chatbot/pyproject.toml /app/aerospace_chatbot/poetry.lock* $HOME
 
 # Disable virtual environments creation by Poetry
 # as the Docker container itself is an isolated environment
@@ -34,7 +39,9 @@ ENV PATH="$HOME/.venv/bin:$PATH"
 RUN poetry install --no-dev
 
 # Copy the rest of your application code
-COPY --chown=user
+COPY --chown=user /app/aerospace_chatbot/src $HOME/src
+COPY --chown=user /app/aerospace_chatbot/data $HOME/data
+COPY --chown=user /app/aerospace_chatbot/config $HOME/config
 
 # Expose the port Streamlit runs on
 EXPOSE 8501
@@ -43,10 +50,11 @@ EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
 # Update working directory to be consistent with where Start.py is
-WORKDIR $HOME/
+WORKDIR $HOME/src
 
 # An ENTRYPOINT allows you to configure a container that will run as an executable. Here, it also contains the entire streamlit run command for your app, so you don’t have to call it from the command line
 ENTRYPOINT ["streamlit", "run", "Start.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
+# To run remotely
 # docker run -it -p 7860:7860 --platform=linux/amd64 \
 # registry.hf.space/ai-aerospace-aerospace-chatbots:latest
config/config.json
DELETED
@@ -1,44 +0,0 @@
-{
-    "databases": [
-        {
-            "name": "Pinecone",
-            "embedding_models": ["Openai", "Voyage"]
-        },
-        {
-            "name": "ChromaDB",
-            "embedding_models": ["Openai"]
-        },
-        {
-            "name": "RAGatouille",
-            "hf_rag_models": [
-                "colbert-ir/colbertv2.0"
-            ]
-        }
-    ],
-    "llms": [
-        {
-            "name": "OpenAI",
-            "models": [
-                "gpt-3.5-turbo-1106",
-                "gpt-3.5-turbo-instruct",
-                "gpt-4",
-                "gpt-4-32k",
-                "gpt-4-1106-preview"
-            ]
-        },
-        {
-            "name": "Hugging Face",
-            "models": [
-                "mistralai/Mixtral-8x7B-Instruct-v0.1",
-                "ai-aerospace/autotrain-ams_v0.1_100_Mistral-7B-Instruct-v0.1",
-                "meta-llama/Llama-2-7b-chat-hf"
-            ]
-        }
-    ],
-    "rag_types": [
-        "Standard",
-        "Parent-Child",
-        "Hypothetical Questions",
-        "Summaries"
-    ]
-}
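Note: these two deleted config files fed the sidebar dropdowns in the scripts below (via setup.load_sidebar). A minimal sketch of consuming a file with this shape, using only the standard library; pick_database is a hypothetical helper, not part of the repo's setup.py:

import json

def pick_database(config_path: str, name: str) -> dict:
    # Load the settings file and return the entry for one database backend.
    with open(config_path) as f:
        config = json.load(f)
    for db in config['databases']:
        if db['name'] == name:
            return db
    raise KeyError(f'No database named {name} in {config_path}')

db = pick_database('config/config.json', 'Pinecone')
print(db.get('embedding_models', []))  # ['Openai', 'Voyage']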
config/index_data.json
DELETED
@@ -1,13 +0,0 @@
-{
-    "Pinecone": {
-        "Openai": "pinecone-openai-ams",
-        "Voyage": "pinecone-voyage-ams"
-    },
-    "ChromaDB": {
-        "Openai": "chromadb-openai-ams",
-        "Voyage": "chromadb-voyage-ams"
-    },
-    "RAGatouille": {
-        "colbert-ir/colbertv2.0": "RAGatouille-colbertv2.0-ams"
-    }
-}
data/AMS/AMS_1996.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3626fd4a0769b8a73a12ee79a1bec7c264c541a5bf90df6f6c13c1ff00011b24
-size 152158068

data/AMS/AMS_1997.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:34442bbc794415ea8d778ebd57e1dd368e20c5e6f65aff35fa008af54dbb900a
-size 22719877

data/AMS/AMS_1998.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1523ca03cd1254b81cd0cb285182b7ac40208cba7932972ca00e0942e43f3539
-size 122280718

data/AMS/AMS_1999.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1c631364761565d749e6bafb0ab1e84611e773ccdb640ab08f6b32b1fcc49e1e
-size 27850919

data/AMS/AMS_2000.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ddf89c5cd9ddbd225e77198b19274535d4f003fdc20b5823239f51ad48230549
-size 24061146

data/AMS/AMS_2001.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c63b2bba5a892759a7298097ee2388f353cc974285a73bfd8635d48af9f7d945
-size 23264984

data/AMS/AMS_2002.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1b8b60c30ea9843face46e021a80bd1072901596b8e0f98a63601b31ecac2076
-size 41615570

data/AMS/AMS_2004.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:986a7f046ba336d35d9d0db974931940543d612dad2c9bb6d5976d778777b659
-size 28914300

data/AMS/AMS_2006.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af4fb8e67c1ebf7b51fddd947d531d68ab05ff187fe915528811676ae0083d55
-size 61039456

data/AMS/AMS_2008.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3d74dcd8ef68ae324f9246a35e1ccf538c4fd676d8b1ae733191c8ad6a055c90
-size 31961158

data/AMS/AMS_2010.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:30d9ad0b75d0d41c75926dd97361f1548b79920df61d8d7486978d4b69a00ef6
-size 30161812

data/AMS/AMS_2012.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e26a981f74c9d0c3526ad5152c83ad9fabde8f197f69cb24a0fd1d4004c1f026
-size 31088140

data/AMS/AMS_2014.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:73dea6c8c45d0103404e3e3bd764e6efcd0f5bf5f45d505ce98e6c07528d9322
-size 35199422

data/AMS/AMS_2016.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f7d8a0e558abd59b94abcbe013f263755f3c525eaf73702662293a3d8b5e2ec5
-size 35244294

data/AMS/AMS_2018.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7b929f0c6d71116e23d4f52011e82eda07280aabb177300e37419ca38b047c60
-size 30251124

data/AMS/AMS_2020.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb6aaaa2cb700bc7d460a1f222756e6a795b629780087a477acd9713982fc0b9
-size 45426669

data/AMS/AMS_2022.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ccc90819f501fca9445d415c1ceca8d3991300f8e08724cf7043f1a103aa4231
-size 17636761
data/AMS/README.txt
DELETED
@@ -1,18 +0,0 @@
-Documents are not uploaded to git. The list of documents which were uploaded to pinecone database AMS:
-AMS_1996, https://ntrs.nasa.gov/citations/19960025595
-AMS_1997, https://ntrs.nasa.gov/citations/19970021613
-AMS_1998, https://ntrs.nasa.gov/citations/19980193156
-AMS_1999, https://ntrs.nasa.gov/citations/19990053852
-AMS_2000, https://ntrs.nasa.gov/citations/20000048380
-AMS_2001, https://ntrs.nasa.gov/citations/20010071164
-AMS_2002, https://ntrs.nasa.gov/citations/20020050182
-AMS_2004, https://ntrs.nasa.gov/citations/20040084272
-AMS_2006, https://ntrs.nasa.gov/citations/20060028221
-AMS_2008, https://ntrs.nasa.gov/citations/20080023060
-AMS_2010, https://ntrs.nasa.gov/citations/20100021914
-AMS_2012, https://ntrs.nasa.gov/citations/20130008824
-AMS_2014, https://ntrs.nasa.gov/citations/20140008875
-AMS_2016, https://ntrs.nasa.gov/citations/20160004038
-AMS_2018, https://ntrs.nasa.gov/citations/20180002828
-AMS_2020, https://ntrs.nasa.gov/citations/20205009766
-AMS_2022, https://ntrs.nasa.gov/citations/20220006415
data/AMS/ams_data-400-0-50.json
DELETED
(The diff for this file is too large to render.)
data/AMS/ams_data-400-0-all.json
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:923f4efbb6bcfa932cad87520177cb65bcf4b3df7fbc7446285df7ef070fa3eb
-size 36094453

data/AMS/ams_data-400-0.jsonl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ef248f60645d1def4d3624351c90cbb5d91554d0a8bfd35615514f4a71a20159
-size 18183603

data/AMS/ams_data-5000-0.jsonl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0472930c89ad2c13f997789b070049c99640c6ddcd114cc635110409854435b5
-size 17283048
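Note: each deleted PDF and data file above is a Git LFS pointer (version/oid/size), not the document itself; only the three-line pointer ever lived in the repository. A minimal sketch of checking a locally downloaded copy against a pointer's recorded sha256 and byte size, using only the standard library:

import hashlib
import os

def verify_lfs_object(path: str, expected_sha256: str, expected_size: int) -> bool:
    # Compare the on-disk file against the oid and size from a git-lfs pointer.
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(1 << 20), b''):
            h.update(block)
    return h.hexdigest() == expected_sha256

# Example with the AMS_2022.pdf pointer values above:
print(verify_lfs_object('data/AMS/AMS_2022.pdf',
                        'ccc90819f501fca9445d415c1ceca8d3991300f8e08724cf7043f1a103aa4231',
                        17636761))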
poetry.lock
CHANGED
(The diff for this file is too large to render.)
pyproject.toml
CHANGED
@@ -5,12 +5,11 @@ description = ""
 authors = ["dsmueller <[email protected]>"]
 
 [tool.poetry.dependencies]
-python = "
+python = ">=3.11,<3.13"
 python-dotenv = "^1.0.0"
 ipykernel = "^6.28.0"
 ipywidgets = "^8.1.1"
 langchainhub = "^0.1.14"
-pinecone-client = "^2.2.4"
 tiktoken = "^0.5.2"
 watchdog = "^3.0.0"
 chromadb = "^0.4.22"
@@ -25,8 +24,9 @@ langchain-openai = "^0.0.2.post1"
 sentence-transformers = "^2.2.2"
 ragatouille = "^0.0.4b2"
 nbformat = "^5.9.2"
-ragxplorer = {git = "https://github.com/dsmueller3760/RAGxplorer.git", rev = "load_db"}
 pydantic = "^2.6.0"
+RAGxplorer = { git = "https://github.com/dan-s-mueller/RAGxplorer.git", branch = "load_options" }
+pinecone-client = "^3.0.2"
 
 
 [build-system]
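Note: pinecone-client ^2.2.4 to ^3.0.2 is a breaking upgrade. The module-level pinecone.init(...) / pinecone.Index(...) pattern used in the deleted scripts below was removed in the 3.x client in favor of a client object. A minimal sketch of the 3.x equivalent (the index name is illustrative, taken from the deleted index_data.json above):

import os
from pinecone import Pinecone

# 2.x style, as in the deleted scripts below:
#   pinecone.init(api_key=...); index = pinecone.Index('pinecone-openai-ams')
# 3.x style: construct a client object instead of calling a module-level init.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index('pinecone-openai-ams')
print(index.describe_index_stats())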
scripts/Start.py
DELETED
@@ -1,41 +0,0 @@
-import streamlit as st
-import os
-
-# Set up page
-st.set_page_config(
-    page_title="Aerospace Chatbot: AMS",
-)
-st.title("Aerospace Chatbot Homepage")
-st.markdown("Code base: https://github.com/dsmueller3760/aerospace_chatbot/tree/rag_study")
-st.markdown('---')
-st.markdown("""
-This space contains chatbots and tools for exploring data in the aerospace mechanisms symposia, using all available papers published since 2000.
-""")
-st.subheader("Running Locally")
-'''
-It is recommended to run this streamlit app locally for improved performance. The hosted hugging face version is for proof of concept.
-You must have poetry installed locally to manage depdenencies. To run locally, clone the repository and run the following commands.
-
-poetry config virtualenvs.in-project true
-poetry install
-source .venv/bin/activate
-cd ./scripts
-streamlit run Start.py
-'''
-
-st.subheader("Aerospace Mechanisms Symposia (AMS)")
-'''
-This chatbot will look up from all Aerospace Mechanism Symposia in the following location: https://github.com/dsmueller3760/aerospace_chatbot/tree/main/data/AMS
-* Available models: https://platform.openai.com/docs/models
-* Model parameters: https://platform.openai.com/docs/api-reference/chat/create
-* Pinecone: https://docs.pinecone.io/docs/projects#api-keys
-* OpenAI API: https://platform.openai.com/api-keys
-'''
-
-st.subheader("API Key Links")
-'''
-* OpenAI: https://platform.openai.com/api-keys
-* Pinecone: https://www.pinecone.io
-* Hugging Face: https://huggingface.co/settings/tokens
-* Voyage: https://dash.voyageai.com/api-keys
-'''
scripts/data_import.py
DELETED
@@ -1,278 +0,0 @@
-import os
-import re
-import logging
-import shutil
-import string
-
-import pinecone
-import chromadb
-
-import json, jsonlines
-from tqdm import tqdm
-
-from langchain_community.vectorstores import Pinecone
-from langchain_community.vectorstores import Chroma
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_core.documents import Document as lancghain_Document
-
-from ragatouille import RAGPretrainedModel
-
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-
-# Set secrets from environment file
-OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')
-VOYAGE_API_KEY=os.getenv('VOYAGE_API_KEY')
-PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
-HUGGINGFACEHUB_API_TOKEN=os.getenv('HUGGINGFACEHUB_API_TOKEN')
-
-def chunk_docs(docs,
-               chunk_method='tiktoken_recursive',
-               file=None,
-               chunk_size=500,
-               chunk_overlap=0,
-               use_json=False):
-    docs_out=[]
-    if file:
-        logging.info('Jsonl file to be used: '+file)
-    if use_json and os.path.exists(file):
-        logging.info('Jsonl file found, using this instead of parsing docs.')
-        with open(file, "r") as file_in:
-            file_data = [json.loads(line) for line in file_in]
-        # Process the file data and put it into the same format as docs_out
-        for line in file_data:
-            doc_temp = lancghain_Document(page_content=line['page_content'],
-                                          source=line['metadata']['source'],
-                                          page=line['metadata']['page'],
-                                          metadata=line['metadata'])
-            if has_meaningful_content(doc_temp):
-                docs_out.append(doc_temp)
-        logging.info('Parsed: '+file)
-        logging.info('Number of entries: '+str(len(docs_out)))
-        logging.info('Sample entries:')
-        logging.info(str(docs_out[0]))
-        logging.info(str(docs_out[-1]))
-    else:
-        logging.info('No jsonl found. Reading and parsing docs.')
-        logging.info('Chunk size (tokens): '+str(chunk_size))
-        logging.info('Chunk overlap (tokens): '+str(chunk_overlap))
-        for doc in tqdm(docs,desc='Reading and parsing docs'):
-            logging.info('Parsing: '+doc)
-            loader = PyPDFLoader(doc)
-            data = loader.load_and_split()
-
-            if chunk_method=='tiktoken_recursive':
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-            else:
-                raise NotImplementedError
-            pages = text_splitter.split_documents(data)
-
-            # Tidy up text by removing unnecessary characters
-            for page in pages:
-                page.metadata['source']=os.path.basename(page.metadata['source'])   # Strip path
-                page.metadata['page']=int(page.metadata['page'])+1   # Pages are 0 based, update
-                page.page_content=re.sub(r"(\w+)-\n(\w+)", r"\1\2", page.page_content)   # Merge hyphenated words
-                page.page_content = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", page.page_content.strip())  # Fix newlines in the middle of sentences
-                page.page_content = re.sub(r"\n\s*\n", "\n\n", page.page_content)   # Remove multiple newlines
-                # Add metadata to the end of the page content, some RAG models don't have metadata.
-                page.page_content += str(page.metadata)
-                doc_temp=lancghain_Document(page_content=page.page_content,
-                                            source=page.metadata['source'],
-                                            page=page.metadata['page'],
-                                            metadata=page.metadata)
-                if has_meaningful_content(page):
-                    docs_out.append(doc_temp)
-            logging.info('Parsed: '+doc)
-        logging.info('Sample entries:')
-        logging.info(str(docs_out[0]))
-        logging.info(str(docs_out[-1]))
-        if file:
-            # Write to a jsonl file, save it.
-            logging.info('Writing to jsonl file: '+file)
-            with jsonlines.open(file, mode='w') as writer:
-                for doc in docs_out:
-                    writer.write(doc.dict())
-            logging.info('Written: '+file)
-    return docs_out
-def load_docs(index_type,
-              docs,
-              query_model,
-              index_name=None,
-              chunk_method='tiktoken_recursive',
-              chunk_size=500,
-              chunk_overlap=0,
-              clear=False,
-              use_json=False,
-              file=None,
-              batch_size=50,
-              local_db_path='../db'):
-    """
-    Loads PDF documents. If index_name is blank, it will return a list of the data (texts). If it is a name of a pinecone storage, it will return the vector_store.
-    """
-    # Chunk docs
-    docs_out=chunk_docs(docs,
-                        chunk_method=chunk_method,
-                        file=file,
-                        chunk_size=chunk_size,
-                        chunk_overlap=chunk_overlap,
-                        use_json=use_json)
-    # Initialize client
-    if index_name:
-        if index_type=="Pinecone":
-            # Import and initialize Pinecone client
-            pinecone.init(
-                api_key=PINECONE_API_KEY
-            )
-            # Find the existing index, clear for new start
-            if clear:
-                try:
-                    pinecone.describe_index(index_name)
-                except:
-                    raise Exception(f"Cannot clear index {index_name} because it does not exist.")
-                index=pinecone.Index(index_name)
-                index.delete(delete_all=True) # Clear the index first, then upload
-                logging.info('Cleared database '+index_name)
-            # Upsert docs
-            try:
-                pinecone.describe_index(index_name)
-            except:
-                logging.info(f"Index {index_name} does not exist. Creating new index.")
-                logging.info('Size of embedding used: '+str(embedding_size(query_model)))  # TODO: set this to be backed out of the embedding size
-                pinecone.create_index(index_name,dimension=embedding_size(query_model))
-                logging.info(f"Index {index_name} created. Adding {len(docs_out)} entries to index.")
-                pass
-            else:
-                logging.info(f"Index {index_name} exists. Adding {len(docs_out)} entries to index.")
-            index = pinecone.Index(index_name)
-            vectorstore = Pinecone(index, query_model, "page_content") # Set the vector store to calculate embeddings on page_content
-            vectorstore = batch_upsert(index_type,
-                                       vectorstore,
-                                       docs_out,
-                                       batch_size=batch_size)
-        elif index_type=="ChromaDB":
-            # Upsert docs. Defaults to putting this in the local_db_path directory
-            logging.info(f"Creating new index {index_name}.")
-            persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-            vectorstore = Chroma(client=persistent_client,
-                                 collection_name=index_name,
-                                 embedding_function=query_model)
-            logging.info(f"Index {index_name} created. Adding {len(docs_out)} entries to index.")
-            vectorstore = batch_upsert(index_type,
-                                       vectorstore,
-                                       docs_out,
-                                       batch_size=batch_size)
-            logging.info("Documents upserted to f{index_name}.")
-            # Test query
-            test_query = vectorstore.similarity_search('What are examples of aerosapce adhesives to avoid?')
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-        elif index_type=="RAGatouille":
-            logging.info(f'Setting up RAGatouille model {query_model}')
-            vectorstore = RAGPretrainedModel.from_pretrained(query_model)
-            logging.info('RAGatouille model set: '+str(vectorstore))
-
-            # Create an index from the vectorstore.
-            docs_out_colbert = [doc.page_content for doc in docs_out]
-            if chunk_size>500:
-                raise ValueError("RAGatouille cannot handle chunks larger than 500 tokens. Reduce token count.")
-            vectorstore.index(
-                collection=docs_out_colbert,
-                index_name=index_name,
-                max_document_length=chunk_size,
-                overwrite_index=True,
-                split_documents=True,
-            )
-            logging.info(f"Index created: {vectorstore}")
-
-            # Move the directory to the db folder
-            logging.info(f"Moving RAGatouille index to {local_db_path}")
-            ragatouille_path = os.path.join(local_db_path, '.ragatouille')
-            if os.path.exists(ragatouille_path):
-                shutil.rmtree(ragatouille_path)
-                logging.info(f"RAGatouille index deleted from {ragatouille_path}")
-            shutil.move('./.ragatouille', local_db_path)
-            logging.info(f"RAGatouille index created in {local_db_path}:"+str(vectorstore))
-
-    # Return vectorstore or docs
-    if index_name:
-        return vectorstore
-    else:
-        return docs_out
-def delete_index(index_type,index_name,
-                 local_db_path='../db'):
-    """
-    Deletes an existing Pinecone index with the given index_name.
-    """
-    if index_type=="Pinecone":
-        # Import and initialize Pinecone client
-        pinecone.init(
-            api_key=PINECONE_API_KEY
-        )
-        try:
-            pinecone.describe_index(index_name)
-            logging.info(f"Index {index_name} exists.")
-        except:
-            raise Exception(f"Index {index_name} does not exist, cannot delete.")
-        else:
-            pinecone.delete_index(index_name)
-            logging.info(f"Index {index_name} deleted.")
-    elif index_type=="ChromaDB":
-        # Delete existing collection
-        logging.info(f"Deleting index {index_name}.")
-        persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-        persistent_client.delete_collection(name=index_name)
-        logging.info("Index deleted.")
-    elif index_type=="RAGatouille":
-        raise NotImplementedError
-def batch_upsert(index_type,vectorstore,docs_out,batch_size=50):
-    # Batch insert the chunks into the vector store
-    for i in range(0, len(docs_out), batch_size):
-        chunk_batch = docs_out[i:i + batch_size]
-        if index_type=="Pinecone":
-            vectorstore.add_documents(chunk_batch)
-        elif index_type=="ChromaDB":
-            vectorstore.add_documents(chunk_batch)  # Happens to be same for chroma/pinecone, leaving if statement just in case
-    return vectorstore
-def has_meaningful_content(page):
-    """
-    Test whether the page has more than 30% words and is more than 5 words.
-    """
-    text=page.page_content
-    num_words = len(text.split())
-    alphanumeric_pct = sum(c.isalnum() for c in text) / len(text)
-    if num_words < 5 or alphanumeric_pct < 0.3:
-        return False
-    else:
-        return True
-def embedding_size(embedding_model):
-    """
-    Returns the embedding size of the model.
-    """
-    if isinstance(embedding_model,OpenAIEmbeddings):
-        return 1536 # https://platform.openai.com/docs/models/embeddings, test-embedding-ada-002
-    elif isinstance(embedding_model,VoyageEmbeddings):
-        return 1024 # https://docs.voyageai.com/embeddings/, voyage-02
-    else:
-        raise NotImplementedError
-def process_chunk(json_file,llm,
-                  clean_data=False,tag_data=False,question_data=False):
-    docs_out=[]
-    with open(json_file, "r") as file_in:
-        file_data = [json.loads(line) for line in file_in]
-    # Process the file data and put it into the same format as docs_out
-    for line in file_data:
-        doc_temp = lancghain_Document(page_content=line['page_content'],
-                                      source=line['metadata']['source'],
-                                      page=line['metadata']['page'],
-                                      metadata=line['metadata'])
-        docs_out.append(doc_temp)
-    # clean data
-    # tag data
-    # question data
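Note: a minimal usage sketch of how the deleted data_import.py entry points fit together, mirroring the calls made by 2_Document_Upload.py below; the paths, index name (from the deleted index_data.json), and embedding model name are illustrative, and OPENAI_API_KEY must be set in the environment:

import glob
from langchain_openai import OpenAIEmbeddings
import data_import  # the deleted module above

# Chunk every AMS PDF to a jsonl file, then upsert into a local ChromaDB index.
docs = glob.glob('../data/AMS/*.pdf')
query_model = OpenAIEmbeddings(model='text-embedding-ada-002')
data_import.load_docs('ChromaDB',
                      docs,
                      query_model=query_model,
                      index_name='chromadb-openai-ams',
                      chunk_size=500,
                      chunk_overlap=0,
                      file='../data/AMS/ams_data.jsonl',
                      local_db_path='../db')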
scripts/pages/1_Chatbot_AMS_Modular.py
DELETED
@@ -1,160 +0,0 @@
-import queries, setup
-
-import os
-import time
-import logging
-import json
-
-import pinecone
-import openai
-
-from langchain_community.vectorstores import Pinecone
-from langchain_community.vectorstores import Chroma
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from langchain_openai import OpenAI, ChatOpenAI
-from langchain_community.llms import HuggingFaceHub
-
-from ragatouille import RAGPretrainedModel
-
-import streamlit as st
-
-# Set up the page, enable logging, read environment variables
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_1_chatbot_ams_modular.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Aerospace Chatbot: Modular',
-    layout='wide'
-)
-st.title('Aerospace Mechanisms Chatbot')
-with st.expander('''What's under the hood?'''):
-    st.markdown('''
-    This chatbot will look up from all Aerospace Mechanism Symposia in the following location: https://github.com/dsmueller3760/aerospace_chatbot/tree/main/data/AMS
-    Example questions:
-    * What are examples of latch failures which have occurred due to improper fitup?
-    * What are examples of lubricants which should be avoided for space mechanism applications?
-    ''')
-filter_toggle=st.checkbox('Filter response with last received sources?')
-
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      vector_databases=True,
-                      embeddings=True,
-                      rag_type=True,
-                      index_name=True,
-                      llm=True,
-                      model_options=True,
-                      secret_keys=True)
-
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# Set up chat history
-if 'qa_model_obj' not in st.session_state:
-    st.session_state.qa_model_obj = []
-if 'message_id' not in st.session_state:
-    st.session_state.message_id = 0
-if 'messages' not in st.session_state:
-    st.session_state.messages = []
-for message in st.session_state.messages:
-    with st.chat_message(message['role']):
-        st.markdown(message['content'])
-
-# Define chat
-if prompt := st.chat_input('Prompt here'):
-    # User prompt
-    st.session_state.messages.append({'role': 'user', 'content': prompt})
-    with st.chat_message('user'):
-        st.markdown(prompt)
-    # Assistant response
-    with st.chat_message('assistant'):
-        message_placeholder = st.empty()
-
-        with st.status('Generating response...') as status:
-            t_start=time.time()
-
-            st.session_state.message_id += 1
-            st.write('Starting reponse generation for message: '+str(st.session_state.message_id))
-            logging.info('Starting reponse generation for message: '+str(st.session_state.message_id))
-
-            # Process some items
-            if sb['model_options']['output_level'] == 'Concise':
-                out_token = 50
-            else:
-                out_token = 516
-            logging.info('Output tokens: '+str(out_token))
-
-            if st.session_state.message_id==1:
-                # Define embeddings
-                if sb['query_model']=='Openai':
-                    query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-                elif sb['query_model']=='Voyage':
-                    query_model=VoyageEmbeddings(model=sb['embedding_name'],voyage_api_key=secrets['VOYAGE_API_KEY'])
-                elif sb['index_type']=='RAGatouille':
-                    query_model=RAGPretrainedModel.from_index(sb['keys']['LOCAL_DB_PATH']+'/.ragatouille/colbert/indexes/'+sb['index_name'])
-                logging.info('Query model set: '+str(query_model))
-
-                # Define LLM
-                if sb['llm_source']=='OpenAI':
-                    llm = ChatOpenAI(model_name=sb['llm_model'],
-                                     temperature=sb['model_options']['temperature'],
-                                     openai_api_key=secrets['OPENAI_API_KEY'],
-                                     max_tokens=out_token)
-                elif sb['llm_source']=='Hugging Face':
-                    llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                                         model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-                logging.info('LLM model set: '+str(llm))
-
-                # Initialize QA model object
-                if 'search_type' in sb['model_options']:
-                    search_type=sb['model_options']['search_type']
-                else:
-                    search_type=None
-                st.session_state.qa_model_obj=queries.QA_Model(sb['index_type'],
-                                                               sb['index_name'],
-                                                               query_model,
-                                                               llm,
-                                                               k=sb['model_options']['k'],
-                                                               search_type=search_type,
-                                                               filter_arg=False,
-                                                               local_db_path=sb['keys']['LOCAL_DB_PATH'])
-                logging.info('QA model object set: '+str(st.session_state.qa_model_obj))
-            if st.session_state.message_id>1:
-                logging.info('Updating model with sidebar settings...')
-                # Update LLM
-                if sb['llm_source']=='OpenAI':
-                    llm = ChatOpenAI(model_name=sb['llm_model'],
-                                     temperature=sb['model_options']['temperature'],
-                                     openai_api_key=secrets['OPENAI_API_KEY'],
-                                     max_tokens=out_token)
-                elif sb['llm_source']=='Hugging Face':
-                    llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                                         model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-                logging.info('LLM model set: '+str(llm))
-
-                st.session_state.qa_model_obj.update_model(llm,
-                                                           k=sb['model_options']['k'],
-                                                           search_type=sb['model_options']['search_type'],
-                                                           filter_arg=filter_toggle)
-                logging.info('QA model object updated: '+str(st.session_state.qa_model_obj))
-
-            st.write('Searching vector database, generating prompt...')
-            logging.info('Searching vector database, generating prompt...')
-            st.session_state.qa_model_obj.query_docs(prompt)
-            ai_response=st.session_state.qa_model_obj.result['answer'].content
-            message_placeholder.markdown(ai_response)
-            t_delta=time.time() - t_start
-            status.update(label='Prompt generated in '+"{:10.3f}".format(t_delta)+' seconds', state='complete', expanded=False)
-
-    st.session_state.messages.append({'role': 'assistant', 'content': ai_response})
-    logging.info(f'Messaging complete for {st.session_state.message_id}.')
-
-# Add reset button
-if st.button('Restart session'):
-    st.session_state.qa_model_obj = []
-    st.session_state.message_id = 0
-    st.session_state.messages = []
scripts/pages/2_Document_Upload.py
DELETED
@@ -1,112 +0,0 @@
-import data_import, setup
-
-import os
-import time
-import logging
-import glob
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from ragatouille import RAGPretrainedModel
-
-import streamlit as st
-
-# Set up the page, enable logging, read environment variables
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_2_document_upload.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Upload PDFs',
-    layout='wide'
-)
-st.title('Upload PDFs')
-
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      vector_databases=True,
-                      embeddings=True,
-                      index_name=True,
-                      secret_keys=True)
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# Populate the main screen
-logging.info(f'index_type test, {sb["index_type"]}')
-
-if sb["index_type"]=='RAGatouille':
-    logging.info('Set hugging face model for queries.')
-    query_model=sb['query_model']
-elif sb['query_model']=='Openai' or 'Voyage':
-    logging.info('Set embeddings model for queries.')
-    if sb['query_model']=='Openai':
-        query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-    elif sb['query_model']=='Voyage':
-        query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
-logging.info('Query model set: '+str(query_model))
-
-# Find docs
-index_name_md=st.markdown('Enter a directory relative to the current directory, or an absolute path.')
-data_folder = st.text_input('Enter a directory','../data/AMS/')
-if not os.path.isdir(data_folder):
-    st.error('The entered directory does not exist')
-docs = glob.glob(data_folder+'*.pdf')   # Only get the PDFs in the directory
-st.markdown('PDFs found: '+str(docs))
-st.markdown('Number of PDFs found: ' + str(len(docs)))
-logging.info('Docs: '+str(docs))
-
-# Add an expandable box for options
-with st.expander("Options"):
-    use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True)
-    json_file=st.text_input('Jsonl file',data_folder+'ams_data.jsonl')
-    clear_database = st.checkbox('Clear existing database?')
-    chunk_method= st.selectbox('Chunk method', ['tiktoken_recursive'], index=0)
-    if sb['query_model']=='Openai' or 'ChromaDB':
-        # OpenAI will time out if the batch size is too large
-        batch_size=st.number_input('Batch size for upsert', min_value=1, step=1, value=100)
-    else:
-        batch_size=None
-    if chunk_method=='tiktoken_recursive':
-        chunk_size=st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500)
-        chunk_overlap=st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0)
-    else:
-        raise NotImplementedError
-
-# Add a button to run the function
-if st.button('Chunk docs to jsonl file'):
-    start_time = time.time()  # Start the timer
-    data_import.chunk_docs(docs,
-                           file=json_file,
-                           chunk_method=chunk_method,
-                           chunk_size=chunk_size,
-                           chunk_overlap=chunk_overlap,
-                           use_json=False)
-    end_time = time.time()  # Stop the timer
-    elapsed_time = end_time - start_time
-    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-if st.button('Load docs into vector database'):
-    start_time = time.time()  # Start the timer
-    data_import.load_docs(sb['index_type'],
-                          docs,
-                          query_model=query_model,
-                          index_name=sb['index_name'],
-                          chunk_size=chunk_size,
-                          chunk_overlap=chunk_overlap,
-                          use_json=use_json,
-                          clear=clear_database,
-                          file=json_file,
-                          batch_size=batch_size,
-                          local_db_path=sb['keys']['LOCAL_DB_PATH'])
-    end_time = time.time()  # Stop the timer
-    elapsed_time = end_time - start_time
-    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-# Add a button to delete the index
-if st.button('Delete existing index'):
-    start_time = time.time()  # Start the timer
-    data_import.delete_index(sb['index_type'],
-                             sb['index_name'],
-                             local_db_path=sb['keys']['LOCAL_DB_PATH'])
-    end_time = time.time()  # Stop the timer
-    elapsed_time = end_time - start_time
-    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
scripts/pages/3_Visualize_Data.py
DELETED
@@ -1,123 +0,0 @@
-import setup
-
-import time
-import logging
-from datetime import datetime
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from ragxplorer import RAGxplorer
-
-import streamlit as st
-
-# Set up the page, enable logging, read environment variables
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_3_visualize_data.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Visualize Data',
-    layout='wide'
-)
-st.title('Visualize Data')
-
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      vector_databases=True,
-                      embeddings=True,
-                      index_name=True,
-                      secret_keys=True)
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# Set up session state variables
-if 'client' not in st.session_state:
-    st.session_state.client = None
-
-# Populate the main screen
-logging.info(f'index_type test, {sb["index_type"]}')
-
-if sb["index_type"]=='RAGatouille':
-    raise Exception('Only index type ChromaDB is supported for this function.')
-elif sb["index_type"]=='Pinecone':
-    raise Exception('Only index type ChromaDB is supported for this function.')
-elif sb['query_model']=='Openai' or 'Voyage':
-    logging.info('Set embeddings model for queries.')
-    if sb['query_model']=='Openai':
-        query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-    elif sb['query_model']=='Voyage':
-        query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
-logging.info('Query model set: '+str(query_model))
-
-st.info('You must have created a database using Document Upload in ChromaDB for this to work.')
-
-# Add an expandable with description of what's going on.
-with st.expander("Under the hood",expanded=True):
-    st.markdown('''
-    Uses modified version of https://github.com/gabrielchua/RAGxplorer/tree/main?tab=readme-ov-file to connect to existing database created.
-    Modified version here: https://github.com/dsmueller3760/RAGxplorer/tree/load_db
-    Assumes that chroma databases are located in local_db_path variable.
-    Query size in database: Take a random sample of this size from the database to visualize.
-    ''')
-
-with st.expander("Create visualization data",expanded=True):
-    # Add a button to run the function
-    limit_size = st.checkbox('Limit size of data visualization?', value=True)
-    if limit_size:
-        vector_qty=st.number_input('Query size in database', min_value=1, step=10, value=50)
-    else:
-        vector_qty=None
-    export_df = st.checkbox('Export visualization data?', value=True)
-    if export_df:
-        current_time = datetime.now().strftime("%Y.%m.%d.%H.%M")
-        if limit_size:
-            df_export_path = st.text_input('Export file', f'../data/AMS/ams_data-400-0-{vector_qty}.json')
-        else:
-            df_export_path=st.text_input('Export file', f'../data/AMS/ams_data-400-0-all.json')
-    if st.button('Create visualization data'):
-        start_time = time.time()  # Start the timer
-
-        st.session_state.client = RAGxplorer(embedding_model=sb['embedding_name'])
-        st.session_state.client.load_db(path_to_db=sb['keys']['LOCAL_DB_PATH']+'/chromadb/',
-                                        index_name=sb['index_name'],
-                                        df_export_path=df_export_path,
-                                        vector_qty=vector_qty,
-                                        umap_params={'n_neighbors': 5,
-                                                     'n_components': 2,
-                                                     'random_state':42},
-                                        verbose=True)
-
-        end_time = time.time()  # Stop the timer
-        elapsed_time = end_time - start_time
-        st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-
-with st.expander("Visualize data",expanded=True):
-    import_data = st.checkbox('Import visualization data?', value=True)
-    if import_data:
-        import_file = st.file_uploader("Import file", type="json")
-        if import_file is None:
-            # Use a default file
-            import_file_path=st.text_input('Import file',df_export_path)
-        else:
-            # Use the uploaded file
-            import_file_path=st.text_input('Import file',f'../data/AMS/{import_file.name}')
-    else:
-        import_file_path=None
-
-    query = st.text_input('Query', 'What are examples of lubricants which should be avoided for space mechanism applications?')
-
-    if st.button('Visualize data'):
-        start_time = time.time()  # Start the timer
-
-        if st.session_state.client is None:
-            st.session_state.client = RAGxplorer(embedding_model=sb['embedding_name'])
-
-        fig = st.session_state.client.visualize_query(query,
-                                                      path_to_db=sb['keys']['LOCAL_DB_PATH']+'/chromadb/',
-                                                      viz_data_df_path=import_file_path,
-                                                      verbose=True)
-        st.plotly_chart(fig,use_container_width=True)
-
-        end_time = time.time()  # Stop the timer
-        elapsed_time = end_time - start_time
scripts/pages/4_Clean_and_Question.py
DELETED
@@ -1,86 +0,0 @@
-import setup
-import data_import
-
-import time
-import logging
-import json
-from datetime import datetime
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from langchain_openai import OpenAI, ChatOpenAI
-from langchain_community.llms import HuggingFaceHub
-
-import streamlit as st
-
-# Set up the page, enable logging
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_4_clean_and_question.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Clean and Question Data',
-    layout='wide'
-)
-st.title('Clean and Question Data')
-# TODO: add database status icons
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      llm=True,
-                      model_options=True,
-                      secret_keys=True)
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# This is janky but works (needs secrets to initialize properly)
-from ragxplorer import RAGxplorer
-
-# Set up session state variables
-if 'client' not in st.session_state:
-    st.session_state.client = None
-
-# Populate the main screen
-# Add an expandable with description of what's going on.
-with st.expander("Under the hood",expanded=True):
-    st.markdown('''
-
-    ''')
-
-chunked_file = st.text_input('Chunked raw text file', f'../data/AMS/ams_data-400-0.jsonl')
-
-with st.expander("Process Chunked Data",expanded=True):
-    clean_data = st.checkbox('Clean data?', value=True)
-    tag_data = st.checkbox('Tag data?', value=True)
-    question_data = st.checkbox('Generate questions from data?', value=True)
-    if sb['model_options']['output_level'] == 'Concise':
-        out_token = 50
-    else:
-        out_token = 516
-
-    # Define LLM
-    if sb['llm_source']=='OpenAI':
-        llm = ChatOpenAI(model_name=sb['llm_model'],
-                         temperature=sb['model_options']['temperature'],
-                         openai_api_key=secrets['OPENAI_API_KEY'],
-                         max_tokens=out_token)
-    elif sb['llm_source']=='Hugging Face':
-        llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                             model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-
-    if clean_data or tag_data or question_data:
-        param_cleaning=None
-    if clean_data:
-        n_tags=None
-    if question_data:
-        n_questions=None
-
-    if st.button('Process chunked data'):
-        start_time = time.time()  # Start the timer
-
-        data_import.process_chunk(chunked_file,llm,
-                                  clean_data=False,tag_data=False,question_data=False)
-
-        end_time = time.time()  # Stop the timer
-        elapsed_time = end_time - start_time
-        st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
scripts/prompts.py
DELETED
@@ -1,12 +0,0 @@
-from langchain import hub
-from langchain.prompts.prompt import PromptTemplate
-
-# Prompts on the hub: https://smith.langchain.com/hub/my-prompts?organizationId=45eb8917-7353-4296-978d-bb461fc45c65
-CONDENSE_QUESTION_PROMPT = hub.pull("dmueller/ams-chatbot-qa-condense-history")
-QA_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval")
-QA_WSOURCES_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval-wsources")
-QA_GENERATE_PROMPT=hub.pull("dmueller/generate_qa_prompt")
-
-# Prompts defined here only
-DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
-TEST_QUERY_PROMPT='What are examples of adhesives to use when potting motors for launch vehicle or spacecraft mechanisms?'
scripts/queries.py
DELETED
@@ -1,278 +0,0 @@
-import os
-import logging
-import re
-
-from dotenv import load_dotenv, find_dotenv
-
-import openai
-import pinecone
-import chromadb
-
-from langchain_community.vectorstores import Pinecone
-from langchain_community.vectorstores import Chroma
-
-from langchain.memory import ConversationBufferMemory
-
-from operator import itemgetter
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnableLambda, RunnablePassthrough
-from langchain.schema import format_document
-from langchain_core.messages import get_buffer_string
-
-from prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT, DEFAULT_DOCUMENT_PROMPT, TEST_QUERY_PROMPT
-
-# Set secrets from environment file
-OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')
-VOYAGE_API_KEY=os.getenv('VOYAGE_API_KEY')
-PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
-HUGGINGFACEHUB_API_TOKEN=os.getenv('HUGGINGFACEHUB_API_TOKEN')
-
-# Class and functions
-class QA_Model:
-    def __init__(self,
-                 index_type,
-                 index_name,
-                 query_model,
-                 llm,
-                 k=6,
-                 search_type='similarity',
-                 fetch_k=50,
-                 temperature=0,
-                 chain_type='stuff',
-                 filter_arg=False,
-                 local_db_path='../db'):
-
-        self.index_type=index_type
-        self.index_name=index_name
-        self.query_model=query_model
-        self.llm=llm
-        self.k=k
-        self.search_type=search_type
-        self.fetch_k=fetch_k
-        self.temperature=temperature
-        self.chain_type=chain_type
-        self.filter_arg=filter_arg
-        self.sources=[]
-
-        load_dotenv(find_dotenv(),override=True)
-
-        # Define retriever search parameters
-        search_kwargs = _process_retriever_args(self.filter_arg,
-                                                self.sources,
-                                                self.search_type,
-                                                self.k,
-                                                self.fetch_k)
-
-        # Read in from the vector database
-        if index_type=='Pinecone':
-            pinecone.init(
-                api_key=PINECONE_API_KEY
-            )
-            logging.info('Chat pinecone index name: '+str(index_name))
-            logging.info('Chat query model: '+str(query_model))
-            index = pinecone.Index(index_name)
-            self.vectorstore = Pinecone(index,query_model,'page_content')
-            logging.info('Chat vectorstore: '+str(self.vectorstore))
-
-            # Test query
-            try:
-                test_query = self.vectorstore.similarity_search(TEST_QUERY_PROMPT)
-            except:
-                raise Exception("Pinecone vector database is not configured properly. Test query failed. Likely the index does not exist.")
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Pinecone vector database is not configured properly. Test query failed.")
-            else:
-                logging.info('Test query succeeded!')
-
-            self.retriever=self.vectorstore.as_retriever(search_type=search_type,
-                                                         search_kwargs=search_kwargs)
-            logging.info('Chat retriever: '+str(self.retriever))
-        elif index_type=='ChromaDB':
-            logging.info('Chat chroma index name: '+str(index_name))
-            logging.info('Chat query model: '+str(query_model))
-            persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-            self.vectorstore = Chroma(client=persistent_client,
-                                      collection_name=index_name,
-                                      embedding_function=query_model)
-            logging.info('Chat vectorstore: '+str(self.vectorstore))
-
-            # Test query
-            try:
-                test_query = self.vectorstore.similarity_search(TEST_QUERY_PROMPT)
-            except:
-                raise Exception("Chroma vector database is not configured properly. Test query failed. Likely the index does not exist.")
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-            else:
-                logging.info('Test query succeeded!')
-
-            self.retriever=self.vectorstore.as_retriever(search_type=search_type,
-                                                         search_kwargs=search_kwargs)
-            logging.info('Chat retriever: '+str(self.retriever))
-        elif index_type=='RAGatouille':
-            # Easy because the index is picked up directly.
-            self.vectorstore=query_model
-            logging.info('Chat query model:'+str(query_model))
-
-            # Test query
-            try:
-                test_query = self.vectorstore.search(TEST_QUERY_PROMPT)
-            except:
-                raise Exception("RAGatouille vector database is not configured properly.")
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-            else:
-                logging.info('Test query succeeded!')
-
-            self.retriever=self.vectorstore.as_langchain_retriever()
-            logging.info('Chat retriever: '+str(self.retriever))
-
-        # Intialize memory
-        self.memory = ConversationBufferMemory(
-            return_messages=True, output_key='answer', input_key='question')
-        logging.info('Memory: '+str(self.memory))
-
-        # Assemble main chain
-        self.conversational_qa_chain=_define_qa_chain(self.llm,
-                                                      self.retriever,
-                                                      self.memory,
-                                                      self.search_type,
-                                                      search_kwargs)
-    def query_docs(self,query):
-        self.memory.load_memory_variables({})
-        logging.info('Memory content before qa result: '+str(self.memory))
-
-        logging.info('Query: '+str(query))
-        self.result = self.conversational_qa_chain.invoke({'question': query})
-        logging.info('QA result: '+str(self.result))
-
-        if self.index_type!='RAGatouille':
-            self.sources = '\n'.join(str(data.metadata) for data in self.result['references'])
-            self.result['answer'].content += '\nSources: \n'+self.sources
-            logging.info('Sources: '+str(self.sources))
-            logging.info('Response with sources: '+str(self.result['answer'].content))
-        else:
-            # RAGatouille doesn't have metadata, need to extract from context first.
-            extracted_metadata = []
-            pattern = r'\{([^}]*)\}(?=[^{}]*$)'   # Regular expression pattern to match the last curly braces
-
-            for ref in self.result['references']:
-                match = re.search(pattern, ref.page_content)
-                if match:
-                    extracted_metadata.append("{"+match.group(1)+"}")
-            self.sources = '\n'.join(extracted_metadata)
-            self.result['answer'].content += '\nSources: \n'+self.sources
-            logging.info('Sources: '+str(self.sources))
-            logging.info('Response with sources: '+str(self.result['answer'].content))
170 |
-
|
171 |
-
self.memory.save_context({'question': query}, {'answer': self.result['answer'].content})
|
172 |
-
logging.info('Memory content after qa result: '+str(self.memory))
|
173 |
-
|
174 |
-
def update_model(self,
|
175 |
-
llm,
|
176 |
-
k=6,
|
177 |
-
search_type='similarity',
|
178 |
-
fetch_k=50,
|
179 |
-
filter_arg=False):
|
180 |
-
|
181 |
-
self.llm=llm
|
182 |
-
self.k=k
|
183 |
-
self.search_type=search_type
|
184 |
-
self.fetch_k=fetch_k
|
185 |
-
self.filter_arg=filter_arg
|
186 |
-
|
187 |
-
# Define retriever search parameters
|
188 |
-
search_kwargs = _process_retriever_args(self.filter_arg,
|
189 |
-
self.sources,
|
190 |
-
self.search_type,
|
191 |
-
self.k,
|
192 |
-
self.fetch_k)
|
193 |
-
# Update conversational retrieval chain
|
194 |
-
self.conversational_qa_chain=_define_qa_chain(self.llm,
|
195 |
-
self.retriever,
|
196 |
-
self.memory,
|
197 |
-
self.search_type,
|
198 |
-
search_kwargs)
|
199 |
-
logging.info('Updated qa chain: '+str(self.conversational_qa_chain))
|
200 |
-
|
201 |
-
# Internal functions
|
202 |
-
def _combine_documents(docs,
|
203 |
-
document_prompt=DEFAULT_DOCUMENT_PROMPT,
|
204 |
-
document_separator='\n\n'):
|
205 |
-
'''
|
206 |
-
Combine a list of documents into a single string.
|
207 |
-
'''
|
208 |
-
# TODO: this would be where stuff, map reduce, etc. would go
|
209 |
-
doc_strings = [format_document(doc, document_prompt) for doc in docs]
|
210 |
-
return document_separator.join(doc_strings)
|
211 |
-
def _define_qa_chain(llm,
|
212 |
-
retriever,
|
213 |
-
memory,
|
214 |
-
search_type,
|
215 |
-
search_kwargs):
|
216 |
-
'''
|
217 |
-
Define the conversational QA chain.
|
218 |
-
'''
|
219 |
-
# This adds a 'memory' key to the input object
|
220 |
-
loaded_memory = RunnablePassthrough.assign(
|
221 |
-
chat_history=RunnableLambda(memory.load_memory_variables)
|
222 |
-
| itemgetter('history'))
|
223 |
-
logging.info('Loaded memory: '+str(loaded_memory))
|
224 |
-
|
225 |
-
# Assemble main chain
|
226 |
-
standalone_question = {
|
227 |
-
'standalone_question': {
|
228 |
-
'question': lambda x: x['question'],
|
229 |
-
'chat_history': lambda x: get_buffer_string(x['chat_history'])}
|
230 |
-
| CONDENSE_QUESTION_PROMPT
|
231 |
-
| llm
|
232 |
-
| StrOutputParser()}
|
233 |
-
logging.info('Condense inputs as a standalong question: '+str(standalone_question))
|
234 |
-
retrieved_documents = {
|
235 |
-
'source_documents': itemgetter('standalone_question')
|
236 |
-
| retriever,
|
237 |
-
'question': lambda x: x['standalone_question']}
|
238 |
-
logging.info('Retrieved documents: '+str(retrieved_documents))
|
239 |
-
# Now we construct the inputs for the final prompt
|
240 |
-
final_inputs = {
|
241 |
-
'context': lambda x: _combine_documents(x['source_documents']),
|
242 |
-
'question': itemgetter('question')}
|
243 |
-
logging.info('Combined documents: '+str(final_inputs))
|
244 |
-
# And finally, we do the part that returns the answers
|
245 |
-
answer = {
|
246 |
-
'answer': final_inputs
|
247 |
-
| QA_PROMPT
|
248 |
-
| llm,
|
249 |
-
'references': itemgetter('source_documents')}
|
250 |
-
conversational_qa_chain = loaded_memory | standalone_question | retrieved_documents | answer
|
251 |
-
logging.info('Conversational QA chain: '+str(conversational_qa_chain))
|
252 |
-
return conversational_qa_chain
|
253 |
-
def _process_retriever_args(filter_arg,
|
254 |
-
sources,
|
255 |
-
search_type,
|
256 |
-
k,
|
257 |
-
fetch_k):
|
258 |
-
'''
|
259 |
-
Process arguments for retriever.
|
260 |
-
'''
|
261 |
-
# Implement filter
|
262 |
-
if filter_arg:
|
263 |
-
filter_list = list(set(item['source'] for item in sources[-1]))
|
264 |
-
filter_items=[]
|
265 |
-
for item in filter_list:
|
266 |
-
filter_item={'source': item}
|
267 |
-
filter_items.append(filter_item)
|
268 |
-
filter={'$or':filter_items}
|
269 |
-
else:
|
270 |
-
filter=None
|
271 |
-
|
272 |
-
# Impement filtering and number of documents to return
|
273 |
-
if search_type=='mmr':
|
274 |
-
search_kwargs={'k':k,'fetch_k':fetch_k,'filter':filter} # See as_retriever docs for parameters
|
275 |
-
else:
|
276 |
-
search_kwargs={'k':k,'filter':filter} # See as_retriever docs for parameters
|
277 |
-
|
278 |
-
return search_kwargs
|
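The deleted scripts/queries.py above assembled an LCEL pipeline (standalone-question condensing, retrieval, then answer synthesis with appended sources) around whichever vector store was selected. For orientation, a minimal sketch of how QA_Model was driven from the app pages; the embedding wrapper, LLM wrapper, and index name below are illustrative assumptions, not taken from this commit:

    # Sketch only: assumes a ChromaDB collection named 'ams-openai' already exists
    # under ../db/chromadb and that API keys are loaded from .env.
    from langchain_community.embeddings import OpenAIEmbeddings  # assumed embedding choice
    from langchain_community.chat_models import ChatOpenAI       # assumed LLM choice
    from queries import QA_Model

    qa = QA_Model(index_type='ChromaDB',
                  index_name='ams-openai',        # hypothetical index name
                  query_model=OpenAIEmbeddings(),
                  llm=ChatOpenAI(temperature=0))
    qa.query_docs('What failure modes affect deployable boom mechanisms?')
    print(qa.result['answer'].content)            # answer text plus appended 'Sources:' block

Repeated calls to query_docs reuse the ConversationBufferMemory, so follow-up questions are condensed against the running chat history before retrieval.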
scripts/setup.py
DELETED
@@ -1,168 +0,0 @@
-import os
-import logging
-import json
-
-import openai
-
-import streamlit as st
-
-# Set up the page, enable logging
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-
-def load_sidebar(config_file,
-                 index_data_file,
-                 vector_databases=False,
-                 embeddings=False,
-                 rag_type=False,
-                 index_name=False,
-                 llm=False,
-                 model_options=False,
-                 secret_keys=False):
-    """
-    Sets up the sidebar based on toggled options. Returns variables with options.
-    """
-    sb_out={}
-    with open(config_file, 'r') as f:
-        config = json.load(f)
-        databases = {db['name']: db for db in config['databases']}
-        llms = {m['name']: m for m in config['llms']}
-        logging.info('Loaded: '+config_file)
-    with open(index_data_file, 'r') as f:
-        index_data = json.load(f)
-        logging.info('Loaded: '+index_data_file)
-
-    if vector_databases:
-        # Vector databases
-        st.sidebar.title('Vector database')
-        sb_out['index_type']=st.sidebar.selectbox('Index type', list(databases.keys()), index=1)
-        logging.info('Index type: '+sb_out['index_type'])
-
-    if embeddings:
-        # Embeddings
-        st.sidebar.title('Embeddings')
-        if sb_out['index_type']=='RAGatouille':  # Default to selecting hugging face model for RAGatouille, otherwise select alternates
-            sb_out['query_model']=st.sidebar.selectbox('Hugging face rag models', databases[sb_out['index_type']]['hf_rag_models'], index=0)
-        else:
-            sb_out['query_model']=st.sidebar.selectbox('Embedding models', databases[sb_out['index_type']]['embedding_models'], index=0)
-
-        if sb_out['query_model']=='Openai':
-            sb_out['embedding_name']='text-embedding-ada-002'
-        elif sb_out['query_model']=='Voyage':
-            sb_out['embedding_name']='voyage-02'
-        logging.info('Query type: '+sb_out['query_model'])
-        if 'embedding_name' in sb_out:
-            logging.info('Embedding name: '+sb_out['embedding_name'])
-    if rag_type:
-        if sb_out['index_type']!='RAGatouille':  # RAGatouille doesn't have a rag_type
-            # RAG Type
-            st.sidebar.title('RAG Type')
-            sb_out['rag_type']=st.sidebar.selectbox('RAG type', config['rag_types'], index=0)
-            sb_out['smart_agent']=st.sidebar.checkbox('Smart agent?')
-            logging.info('RAG type: '+sb_out['rag_type'])
-            logging.info('Smart agent: '+str(sb_out['smart_agent']))
-    if index_name:
-        # Index Name
-        st.sidebar.title('Index Name')
-        sb_out['index_name']=index_data[sb_out['index_type']][sb_out['query_model']]
-        st.sidebar.markdown('Index name: '+sb_out['index_name'])
-        logging.info('Index name: '+sb_out['index_name'])
-    if llm:
-        # LLM
-        st.sidebar.title('LLM')
-        sb_out['llm_source']=st.sidebar.selectbox('LLM model', list(llms.keys()), index=0)
-        logging.info('LLM source: '+sb_out['llm_source'])
-        if sb_out['llm_source']=='OpenAI':
-            sb_out['llm_model']=st.sidebar.selectbox('OpenAI model', llms[sb_out['llm_source']]['models'], index=0)
-        if sb_out['llm_source']=='Hugging Face':
-            sb_out['llm_model']=st.sidebar.selectbox('Hugging Face model', llms[sb_out['llm_source']]['models'], index=0)
-    if model_options:
-        # Add input fields in the sidebar
-        st.sidebar.title('LLM Options')
-        temperature = st.sidebar.slider('Temperature', min_value=0.0, max_value=2.0, value=0.0, step=0.1)
-        output_level = st.sidebar.selectbox('Level of Output', ['Concise', 'Detailed'], index=1)
-
-        if 'index_type' in sb_out:
-            st.sidebar.title('Retrieval Options')
-            k = st.sidebar.number_input('Number of items per prompt', min_value=1, step=1, value=4)
-            if sb_out['index_type']!='RAGatouille':
-                search_type = st.sidebar.selectbox('Search Type', ['similarity', 'mmr'], index=0)
-            else:
-                search_type = None  # RAGatouille manages retrieval internally
-            sb_out['model_options']={'output_level':output_level,
-                                     'k':k,
-                                     'search_type':search_type,
-                                     'temperature':temperature}
-        else:
-            sb_out['model_options']={'output_level':output_level,
-                                     'temperature':temperature}
-        logging.info('Model options: '+str(sb_out['model_options']))
-    if secret_keys:
-        # Add a section for secret keys
-        st.sidebar.title('Secret keys')
-        st.sidebar.markdown('If .env file is in directory, will use that first.')
-        sb_out['keys']={}
-        if 'llm_source' in sb_out and sb_out['llm_source'] == 'OpenAI':
-            sb_out['keys']['OPENAI_API_KEY'] = st.sidebar.text_input('OpenAI API Key', type='password')
-        elif 'query_model' in sb_out and sb_out['query_model'] == 'Openai':
-            sb_out['keys']['OPENAI_API_KEY'] = st.sidebar.text_input('OpenAI API Key', type='password')
-        if 'llm_source' in sb_out and sb_out['llm_source']=='Hugging Face':
-            sb_out['keys']['HUGGINGFACEHUB_API_TOKEN'] = st.sidebar.text_input('Hugging Face API Key', type='password')
-        if 'query_model' in sb_out and sb_out['query_model']=='Voyage':
-            sb_out['keys']['VOYAGE_API_KEY'] = st.sidebar.text_input('Voyage API Key', type='password')
-        if 'index_type' in sb_out and sb_out['index_type']=='Pinecone':
-            sb_out['keys']['PINECONE_API_KEY']=st.sidebar.text_input('Pinecone API Key',type='password')
-        if os.getenv('LOCAL_DB_PATH') is None:
-            sb_out['keys']['LOCAL_DB_PATH'] = st.sidebar.text_input('Local Database Path','/data',help='Path to local database (e.g. chroma)')
-            os.environ['LOCAL_DB_PATH'] = sb_out['keys']['LOCAL_DB_PATH']
-        else:
-            sb_out['keys']['LOCAL_DB_PATH'] = os.getenv('LOCAL_DB_PATH')
-            st.sidebar.markdown('Local Database Path: '+sb_out['keys']['LOCAL_DB_PATH'],help='From .env file.')
-
-    return sb_out
-
-def set_secrets(sb):
-    """
-    Sets secrets from environment file, or from sidebar if not available.
-    """
-    secrets={}
-
-    secrets['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
-    logging.info('OpenAI API Key: '+str(secrets['OPENAI_API_KEY']))
-    if not secrets['OPENAI_API_KEY'] and 'keys' in sb and 'OPENAI_API_KEY' in sb['keys']:
-        logging.info('Setting OpenAI API Key from sidebar...')
-        secrets['OPENAI_API_KEY'] = sb['keys']['OPENAI_API_KEY']
-        os.environ['OPENAI_API_KEY'] = secrets['OPENAI_API_KEY']
-        logging.info('OpenAI API Key: '+str(os.environ['OPENAI_API_KEY']))
-        if os.environ['OPENAI_API_KEY']=='':
-            raise Exception('OpenAI API Key is required.')
-    openai.api_key = secrets['OPENAI_API_KEY']
-
-    secrets['VOYAGE_API_KEY'] = os.getenv('VOYAGE_API_KEY')
-    logging.info('Voyage API Key: '+str(secrets['VOYAGE_API_KEY']))
-    if not secrets['VOYAGE_API_KEY'] and 'keys' in sb and 'VOYAGE_API_KEY' in sb['keys']:
-        logging.info('Setting Voyage API Key from sidebar...')
-        secrets['VOYAGE_API_KEY'] = sb['keys']['VOYAGE_API_KEY']
-        os.environ['VOYAGE_API_KEY'] = secrets['VOYAGE_API_KEY']
-        logging.info('Voyage API Key: '+str(os.environ['VOYAGE_API_KEY']))
-        if os.environ['VOYAGE_API_KEY']=='':
-            raise Exception('Voyage API Key is required.')
-
-    secrets['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
-    logging.info('Pinecone API Key: '+str(secrets['PINECONE_API_KEY']))
-    if not secrets['PINECONE_API_KEY'] and 'keys' in sb and 'PINECONE_API_KEY' in sb['keys']:
-        logging.info('Setting Pinecone API Key from sidebar...')
-        secrets['PINECONE_API_KEY'] = sb['keys']['PINECONE_API_KEY']
-        os.environ['PINECONE_API_KEY'] = secrets['PINECONE_API_KEY']
-        logging.info('Pinecone API Key: '+str(os.environ['PINECONE_API_KEY']))
-        if os.environ['PINECONE_API_KEY']=='':
-            raise Exception('Pinecone API Key is required.')
-
-    secrets['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    logging.info('Hugging Face API Key: '+str(secrets['HUGGINGFACEHUB_API_TOKEN']))
-    if not secrets['HUGGINGFACEHUB_API_TOKEN'] and 'keys' in sb and 'HUGGINGFACEHUB_API_TOKEN' in sb['keys']:
-        logging.info('Setting Hugging Face API Key from sidebar...')
-        secrets['HUGGINGFACEHUB_API_TOKEN'] = sb['keys']['HUGGINGFACEHUB_API_TOKEN']
-        os.environ['HUGGINGFACEHUB_API_TOKEN'] = secrets['HUGGINGFACEHUB_API_TOKEN']
-        logging.info('Hugging Face API Key: '+str(os.environ['HUGGINGFACEHUB_API_TOKEN']))
-        if os.environ['HUGGINGFACEHUB_API_TOKEN']=='':
-            raise Exception('Hugging Face API Key is required.')
-    return secrets
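For reference, a minimal sketch of how the deleted load_sidebar and set_secrets helpers were typically called from a Streamlit page. The config paths point at the config/config.json and config/index_data.json files removed in this same commit; the exact toggles and relative paths are assumptions based on the function signatures:

    # Sketch only: paths assumed relative to a page under scripts/pages/.
    from setup import load_sidebar, set_secrets

    sb = load_sidebar('../../config/config.json',
                      '../../config/index_data.json',
                      vector_databases=True,
                      embeddings=True,
                      index_name=True,
                      llm=True,
                      model_options=True,
                      secret_keys=True)
    secrets = set_secrets(sb)  # falls back to sidebar inputs when .env keys are missing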