Removed committed files, added git copies, modified poetry
- Dockerfile +11 -3
- config/config.json +0 -44
- config/index_data.json +0 -13
- data/AMS/AMS_1996.pdf +0 -3
- data/AMS/AMS_1997.pdf +0 -3
- data/AMS/AMS_1998.pdf +0 -3
- data/AMS/AMS_1999.pdf +0 -3
- data/AMS/AMS_2000.pdf +0 -3
- data/AMS/AMS_2001.pdf +0 -3
- data/AMS/AMS_2002.pdf +0 -3
- data/AMS/AMS_2004.pdf +0 -3
- data/AMS/AMS_2006.pdf +0 -3
- data/AMS/AMS_2008.pdf +0 -3
- data/AMS/AMS_2010.pdf +0 -3
- data/AMS/AMS_2012.pdf +0 -3
- data/AMS/AMS_2014.pdf +0 -3
- data/AMS/AMS_2016.pdf +0 -3
- data/AMS/AMS_2018.pdf +0 -3
- data/AMS/AMS_2020.pdf +0 -3
- data/AMS/AMS_2022.pdf +0 -3
- data/AMS/README.txt +0 -18
- data/AMS/ams_data-400-0-50.json +0 -0
- data/AMS/ams_data-400-0-all.json +0 -3
- data/AMS/ams_data-400-0.jsonl +0 -3
- data/AMS/ams_data-5000-0.jsonl +0 -3
- poetry.lock +0 -0
- pyproject.toml +3 -3
- scripts/Start.py +0 -41
- scripts/data_import.py +0 -278
- scripts/pages/1_Chatbot_AMS_Modular.py +0 -160
- scripts/pages/2_Document_Upload.py +0 -112
- scripts/pages/3_Visualize_Data.py +0 -123
- scripts/pages/4_Clean_and_Question.py +0 -86
- scripts/prompts.py +0 -12
- scripts/queries.py +0 -278
- scripts/setup.py +0 -168
Dockerfile
CHANGED
@@ -6,6 +6,11 @@ FROM python:3.11.5-bookworm
 RUN useradd -m -u 1000 user
 USER user
 
+# Clone aerospace-chatbot github repository
+RUN apt-get update && apt-get install -y git
+WORKDIR /app
+RUN git clone -b rag_study https://github.com/dan-s-mueller/aerospace_chatbot.git .
+
 # Set home to the user's home directory
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH
@@ -18,7 +23,7 @@ WORKDIR $HOME
 RUN pip3 install poetry==1.7.1
 
 # Copy poetry files
-COPY --chown=user pyproject.toml poetry.lock* $HOME
+COPY --chown=user /app/aerospace_chatbot/pyproject.toml /app/aerospace_chatbot/poetry.lock* $HOME
 
 # Disable virtual environments creation by Poetry
 # as the Docker container itself is an isolated environment
@@ -34,7 +39,9 @@ ENV PATH="$HOME/.venv/bin:$PATH"
 RUN poetry install --no-dev
 
 # Copy the rest of your application code
-COPY --chown=user
+COPY --chown=user /app/aerospace_chatbot/src $HOME/src
+COPY --chown=user /app/aerospace_chatbot/data $HOME/data
+COPY --chown=user /app/aerospace_chatbot/config $HOME/config
 
 # Expose the port Streamlit runs on
 EXPOSE 8501
@@ -43,10 +50,11 @@ EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
 # Update working directory to be consistent with where Start.py is
-WORKDIR $HOME/
+WORKDIR $HOME/src
 
 # An ENTRYPOINT allows you to configure a container that will run as an executable. Here, it also contains the entire streamlit run command for your app, so you don’t have to call it from the command line
 ENTRYPOINT ["streamlit", "run", "Start.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
+# To run remotely
 # docker run -it -p 7860:7860 --platform=linux/amd64 \
 # registry.hf.space/ai-aerospace-aerospace-chatbots:latest
config/config.json
DELETED
@@ -1,44 +0,0 @@
-{
-    "databases": [
-        {
-            "name": "Pinecone",
-            "embedding_models": ["Openai", "Voyage"]
-        },
-        {
-            "name": "ChromaDB",
-            "embedding_models": ["Openai"]
-        },
-        {
-            "name": "RAGatouille",
-            "hf_rag_models": [
-                "colbert-ir/colbertv2.0"
-            ]
-        }
-    ],
-    "llms": [
-        {
-            "name": "OpenAI",
-            "models": [
-                "gpt-3.5-turbo-1106",
-                "gpt-3.5-turbo-instruct",
-                "gpt-4",
-                "gpt-4-32k",
-                "gpt-4-1106-preview"
-            ]
-        },
-        {
-            "name": "Hugging Face",
-            "models": [
-                "mistralai/Mixtral-8x7B-Instruct-v0.1",
-                "ai-aerospace/autotrain-ams_v0.1_100_Mistral-7B-Instruct-v0.1",
-                "meta-llama/Llama-2-7b-chat-hf"
-            ]
-        }
-    ],
-    "rag_types": [
-        "Standard",
-        "Parent-Child",
-        "Hypothetical Questions",
-        "Summaries"
-    ]
-}
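Note: these two deleted config files fed the sidebar dropdowns in the scripts below (via setup.load_sidebar). A minimal sketch of consuming a file with this shape, using only the standard library; pick_database is a hypothetical helper, not part of the repo's setup.py:

import json

def pick_database(config_path: str, name: str) -> dict:
    # Load the settings file and return the entry for one database backend.
    with open(config_path) as f:
        config = json.load(f)
    for db in config['databases']:
        if db['name'] == name:
            return db
    raise KeyError(f'No database named {name} in {config_path}')

db = pick_database('config/config.json', 'Pinecone')
print(db.get('embedding_models', []))  # ['Openai', 'Voyage']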
config/index_data.json
DELETED
@@ -1,13 +0,0 @@
-{
-    "Pinecone": {
-        "Openai": "pinecone-openai-ams",
-        "Voyage": "pinecone-voyage-ams"
-    },
-    "ChromaDB": {
-        "Openai": "chromadb-openai-ams",
-        "Voyage": "chromadb-voyage-ams"
-    },
-    "RAGatouille": {
-        "colbert-ir/colbertv2.0": "RAGatouille-colbertv2.0-ams"
-    }
-}
data/AMS/AMS_1996.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3626fd4a0769b8a73a12ee79a1bec7c264c541a5bf90df6f6c13c1ff00011b24
-size 152158068

data/AMS/AMS_1997.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:34442bbc794415ea8d778ebd57e1dd368e20c5e6f65aff35fa008af54dbb900a
-size 22719877

data/AMS/AMS_1998.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1523ca03cd1254b81cd0cb285182b7ac40208cba7932972ca00e0942e43f3539
-size 122280718

data/AMS/AMS_1999.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1c631364761565d749e6bafb0ab1e84611e773ccdb640ab08f6b32b1fcc49e1e
-size 27850919

data/AMS/AMS_2000.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ddf89c5cd9ddbd225e77198b19274535d4f003fdc20b5823239f51ad48230549
-size 24061146

data/AMS/AMS_2001.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c63b2bba5a892759a7298097ee2388f353cc974285a73bfd8635d48af9f7d945
-size 23264984

data/AMS/AMS_2002.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1b8b60c30ea9843face46e021a80bd1072901596b8e0f98a63601b31ecac2076
-size 41615570

data/AMS/AMS_2004.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:986a7f046ba336d35d9d0db974931940543d612dad2c9bb6d5976d778777b659
-size 28914300

data/AMS/AMS_2006.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af4fb8e67c1ebf7b51fddd947d531d68ab05ff187fe915528811676ae0083d55
-size 61039456

data/AMS/AMS_2008.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3d74dcd8ef68ae324f9246a35e1ccf538c4fd676d8b1ae733191c8ad6a055c90
-size 31961158

data/AMS/AMS_2010.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:30d9ad0b75d0d41c75926dd97361f1548b79920df61d8d7486978d4b69a00ef6
-size 30161812

data/AMS/AMS_2012.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e26a981f74c9d0c3526ad5152c83ad9fabde8f197f69cb24a0fd1d4004c1f026
-size 31088140

data/AMS/AMS_2014.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:73dea6c8c45d0103404e3e3bd764e6efcd0f5bf5f45d505ce98e6c07528d9322
-size 35199422

data/AMS/AMS_2016.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f7d8a0e558abd59b94abcbe013f263755f3c525eaf73702662293a3d8b5e2ec5
-size 35244294

data/AMS/AMS_2018.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7b929f0c6d71116e23d4f52011e82eda07280aabb177300e37419ca38b047c60
-size 30251124

data/AMS/AMS_2020.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb6aaaa2cb700bc7d460a1f222756e6a795b629780087a477acd9713982fc0b9
-size 45426669

data/AMS/AMS_2022.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ccc90819f501fca9445d415c1ceca8d3991300f8e08724cf7043f1a103aa4231
-size 17636761
data/AMS/README.txt
DELETED
@@ -1,18 +0,0 @@
-Documents are not uploaded to git. The list of documents which were uploaded to pinecone database AMS:
-AMS_1996, https://ntrs.nasa.gov/citations/19960025595
-AMS_1997, https://ntrs.nasa.gov/citations/19970021613
-AMS_1998, https://ntrs.nasa.gov/citations/19980193156
-AMS_1999, https://ntrs.nasa.gov/citations/19990053852
-AMS_2000, https://ntrs.nasa.gov/citations/20000048380
-AMS_2001, https://ntrs.nasa.gov/citations/20010071164
-AMS_2002, https://ntrs.nasa.gov/citations/20020050182
-AMS_2004, https://ntrs.nasa.gov/citations/20040084272
-AMS_2006, https://ntrs.nasa.gov/citations/20060028221
-AMS_2008, https://ntrs.nasa.gov/citations/20080023060
-AMS_2010, https://ntrs.nasa.gov/citations/20100021914
-AMS_2012, https://ntrs.nasa.gov/citations/20130008824
-AMS_2014, https://ntrs.nasa.gov/citations/20140008875
-AMS_2016, https://ntrs.nasa.gov/citations/20160004038
-AMS_2018, https://ntrs.nasa.gov/citations/20180002828
-AMS_2020, https://ntrs.nasa.gov/citations/20205009766
-AMS_2022, https://ntrs.nasa.gov/citations/20220006415
data/AMS/ams_data-400-0-50.json
DELETED
(The diff for this file is too large to render.)
data/AMS/ams_data-400-0-all.json
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:923f4efbb6bcfa932cad87520177cb65bcf4b3df7fbc7446285df7ef070fa3eb
-size 36094453

data/AMS/ams_data-400-0.jsonl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ef248f60645d1def4d3624351c90cbb5d91554d0a8bfd35615514f4a71a20159
-size 18183603

data/AMS/ams_data-5000-0.jsonl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0472930c89ad2c13f997789b070049c99640c6ddcd114cc635110409854435b5
-size 17283048
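Note: each deleted PDF and data file above is a Git LFS pointer (version/oid/size), not the document itself; only the three-line pointer ever lived in the repository. A minimal sketch of checking a locally downloaded copy against a pointer's recorded sha256 and byte size, using only the standard library:

import hashlib
import os

def verify_lfs_object(path: str, expected_sha256: str, expected_size: int) -> bool:
    # Compare the on-disk file against the oid and size from a git-lfs pointer.
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(1 << 20), b''):
            h.update(block)
    return h.hexdigest() == expected_sha256

# Example with the AMS_2022.pdf pointer values above:
print(verify_lfs_object('data/AMS/AMS_2022.pdf',
                        'ccc90819f501fca9445d415c1ceca8d3991300f8e08724cf7043f1a103aa4231',
                        17636761))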
poetry.lock
CHANGED
(The diff for this file is too large to render.)
pyproject.toml
CHANGED
@@ -5,12 +5,11 @@ description = ""
 authors = ["dsmueller <[email protected]>"]
 
 [tool.poetry.dependencies]
-python = "
+python = ">=3.11,<3.13"
 python-dotenv = "^1.0.0"
 ipykernel = "^6.28.0"
 ipywidgets = "^8.1.1"
 langchainhub = "^0.1.14"
-pinecone-client = "^2.2.4"
 tiktoken = "^0.5.2"
 watchdog = "^3.0.0"
 chromadb = "^0.4.22"
@@ -25,8 +24,9 @@ langchain-openai = "^0.0.2.post1"
 sentence-transformers = "^2.2.2"
 ragatouille = "^0.0.4b2"
 nbformat = "^5.9.2"
-ragxplorer = {git = "https://github.com/dsmueller3760/RAGxplorer.git", rev = "load_db"}
 pydantic = "^2.6.0"
+RAGxplorer = { git = "https://github.com/dan-s-mueller/RAGxplorer.git", branch = "load_options" }
+pinecone-client = "^3.0.2"
 
 
 [build-system]
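Note: pinecone-client ^2.2.4 to ^3.0.2 is a breaking upgrade. The module-level pinecone.init(...) / pinecone.Index(...) pattern used in the deleted scripts below was removed in the 3.x client in favor of a client object. A minimal sketch of the 3.x equivalent (the index name is illustrative, taken from the deleted index_data.json above):

import os
from pinecone import Pinecone

# 2.x style, as in the deleted scripts below:
#   pinecone.init(api_key=...); index = pinecone.Index('pinecone-openai-ams')
# 3.x style: construct a client object instead of calling a module-level init.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index('pinecone-openai-ams')
print(index.describe_index_stats())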
scripts/Start.py
DELETED
@@ -1,41 +0,0 @@
-import streamlit as st
-import os
-
-# Set up page
-st.set_page_config(
-    page_title="Aerospace Chatbot: AMS",
-)
-st.title("Aerospace Chatbot Homepage")
-st.markdown("Code base: https://github.com/dsmueller3760/aerospace_chatbot/tree/rag_study")
-st.markdown('---')
-st.markdown("""
-This space contains chatbots and tools for exploring data in the aerospace mechanisms symposia, using all available papers published since 2000.
-""")
-st.subheader("Running Locally")
-'''
-It is recommended to run this streamlit app locally for improved performance. The hosted hugging face version is for proof of concept.
-You must have poetry installed locally to manage depdenencies. To run locally, clone the repository and run the following commands.
-
-poetry config virtualenvs.in-project true
-poetry install
-source .venv/bin/activate
-cd ./scripts
-streamlit run Start.py
-'''
-
-st.subheader("Aerospace Mechanisms Symposia (AMS)")
-'''
-This chatbot will look up from all Aerospace Mechanism Symposia in the following location: https://github.com/dsmueller3760/aerospace_chatbot/tree/main/data/AMS
-* Available models: https://platform.openai.com/docs/models
-* Model parameters: https://platform.openai.com/docs/api-reference/chat/create
-* Pinecone: https://docs.pinecone.io/docs/projects#api-keys
-* OpenAI API: https://platform.openai.com/api-keys
-'''
-
-st.subheader("API Key Links")
-'''
-* OpenAI: https://platform.openai.com/api-keys
-* Pinecone: https://www.pinecone.io
-* Hugging Face: https://huggingface.co/settings/tokens
-* Voyage: https://dash.voyageai.com/api-keys
-'''
scripts/data_import.py
DELETED
@@ -1,278 +0,0 @@
-import os
-import re
-import logging
-import shutil
-import string
-
-import pinecone
-import chromadb
-
-import json, jsonlines
-from tqdm import tqdm
-
-from langchain_community.vectorstores import Pinecone
-from langchain_community.vectorstores import Chroma
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_core.documents import Document as lancghain_Document
-
-from ragatouille import RAGPretrainedModel
-
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-
-# Set secrets from environment file
-OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')
-VOYAGE_API_KEY=os.getenv('VOYAGE_API_KEY')
-PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
-HUGGINGFACEHUB_API_TOKEN=os.getenv('HUGGINGFACEHUB_API_TOKEN')
-
-def chunk_docs(docs,
-               chunk_method='tiktoken_recursive',
-               file=None,
-               chunk_size=500,
-               chunk_overlap=0,
-               use_json=False):
-    docs_out=[]
-    if file:
-        logging.info('Jsonl file to be used: '+file)
-    if use_json and os.path.exists(file):
-        logging.info('Jsonl file found, using this instead of parsing docs.')
-        with open(file, "r") as file_in:
-            file_data = [json.loads(line) for line in file_in]
-        # Process the file data and put it into the same format as docs_out
-        for line in file_data:
-            doc_temp = lancghain_Document(page_content=line['page_content'],
-                                          source=line['metadata']['source'],
-                                          page=line['metadata']['page'],
-                                          metadata=line['metadata'])
-            if has_meaningful_content(doc_temp):
-                docs_out.append(doc_temp)
-        logging.info('Parsed: '+file)
-        logging.info('Number of entries: '+str(len(docs_out)))
-        logging.info('Sample entries:')
-        logging.info(str(docs_out[0]))
-        logging.info(str(docs_out[-1]))
-    else:
-        logging.info('No jsonl found. Reading and parsing docs.')
-        logging.info('Chunk size (tokens): '+str(chunk_size))
-        logging.info('Chunk overlap (tokens): '+str(chunk_overlap))
-        for doc in tqdm(docs,desc='Reading and parsing docs'):
-            logging.info('Parsing: '+doc)
-            loader = PyPDFLoader(doc)
-            data = loader.load_and_split()
-
-            if chunk_method=='tiktoken_recursive':
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-            else:
-                raise NotImplementedError
-            pages = text_splitter.split_documents(data)
-
-            # Tidy up text by removing unnecessary characters
-            for page in pages:
-                page.metadata['source']=os.path.basename(page.metadata['source'])   # Strip path
-                page.metadata['page']=int(page.metadata['page'])+1   # Pages are 0 based, update
-                page.page_content=re.sub(r"(\w+)-\n(\w+)", r"\1\2", page.page_content)   # Merge hyphenated words
-                page.page_content = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", page.page_content.strip())  # Fix newlines in the middle of sentences
-                page.page_content = re.sub(r"\n\s*\n", "\n\n", page.page_content)   # Remove multiple newlines
-                # Add metadata to the end of the page content, some RAG models don't have metadata.
-                page.page_content += str(page.metadata)
-                doc_temp=lancghain_Document(page_content=page.page_content,
-                                            source=page.metadata['source'],
-                                            page=page.metadata['page'],
-                                            metadata=page.metadata)
-                if has_meaningful_content(page):
-                    docs_out.append(doc_temp)
-            logging.info('Parsed: '+doc)
-        logging.info('Sample entries:')
-        logging.info(str(docs_out[0]))
-        logging.info(str(docs_out[-1]))
-        if file:
-            # Write to a jsonl file, save it.
-            logging.info('Writing to jsonl file: '+file)
-            with jsonlines.open(file, mode='w') as writer:
-                for doc in docs_out:
-                    writer.write(doc.dict())
-            logging.info('Written: '+file)
-    return docs_out
-def load_docs(index_type,
-              docs,
-              query_model,
-              index_name=None,
-              chunk_method='tiktoken_recursive',
-              chunk_size=500,
-              chunk_overlap=0,
-              clear=False,
-              use_json=False,
-              file=None,
-              batch_size=50,
-              local_db_path='../db'):
-    """
-    Loads PDF documents. If index_name is blank, it will return a list of the data (texts). If it is a name of a pinecone storage, it will return the vector_store.
-    """
-    # Chunk docs
-    docs_out=chunk_docs(docs,
-                        chunk_method=chunk_method,
-                        file=file,
-                        chunk_size=chunk_size,
-                        chunk_overlap=chunk_overlap,
-                        use_json=use_json)
-    # Initialize client
-    if index_name:
-        if index_type=="Pinecone":
-            # Import and initialize Pinecone client
-            pinecone.init(
-                api_key=PINECONE_API_KEY
-            )
-            # Find the existing index, clear for new start
-            if clear:
-                try:
-                    pinecone.describe_index(index_name)
-                except:
-                    raise Exception(f"Cannot clear index {index_name} because it does not exist.")
-                index=pinecone.Index(index_name)
-                index.delete(delete_all=True) # Clear the index first, then upload
-                logging.info('Cleared database '+index_name)
-            # Upsert docs
-            try:
-                pinecone.describe_index(index_name)
-            except:
-                logging.info(f"Index {index_name} does not exist. Creating new index.")
-                logging.info('Size of embedding used: '+str(embedding_size(query_model)))  # TODO: set this to be backed out of the embedding size
-                pinecone.create_index(index_name,dimension=embedding_size(query_model))
-                logging.info(f"Index {index_name} created. Adding {len(docs_out)} entries to index.")
-                pass
-            else:
-                logging.info(f"Index {index_name} exists. Adding {len(docs_out)} entries to index.")
-            index = pinecone.Index(index_name)
-            vectorstore = Pinecone(index, query_model, "page_content") # Set the vector store to calculate embeddings on page_content
-            vectorstore = batch_upsert(index_type,
-                                       vectorstore,
-                                       docs_out,
-                                       batch_size=batch_size)
-        elif index_type=="ChromaDB":
-            # Upsert docs. Defaults to putting this in the local_db_path directory
-            logging.info(f"Creating new index {index_name}.")
-            persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-            vectorstore = Chroma(client=persistent_client,
-                                 collection_name=index_name,
-                                 embedding_function=query_model)
-            logging.info(f"Index {index_name} created. Adding {len(docs_out)} entries to index.")
-            vectorstore = batch_upsert(index_type,
-                                       vectorstore,
-                                       docs_out,
-                                       batch_size=batch_size)
-            logging.info("Documents upserted to f{index_name}.")
-            # Test query
-            test_query = vectorstore.similarity_search('What are examples of aerosapce adhesives to avoid?')
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-        elif index_type=="RAGatouille":
-            logging.info(f'Setting up RAGatouille model {query_model}')
-            vectorstore = RAGPretrainedModel.from_pretrained(query_model)
-            logging.info('RAGatouille model set: '+str(vectorstore))
-
-            # Create an index from the vectorstore.
-            docs_out_colbert = [doc.page_content for doc in docs_out]
-            if chunk_size>500:
-                raise ValueError("RAGatouille cannot handle chunks larger than 500 tokens. Reduce token count.")
-            vectorstore.index(
-                collection=docs_out_colbert,
-                index_name=index_name,
-                max_document_length=chunk_size,
-                overwrite_index=True,
-                split_documents=True,
-            )
-            logging.info(f"Index created: {vectorstore}")
-
-            # Move the directory to the db folder
-            logging.info(f"Moving RAGatouille index to {local_db_path}")
-            ragatouille_path = os.path.join(local_db_path, '.ragatouille')
-            if os.path.exists(ragatouille_path):
-                shutil.rmtree(ragatouille_path)
-                logging.info(f"RAGatouille index deleted from {ragatouille_path}")
-            shutil.move('./.ragatouille', local_db_path)
-            logging.info(f"RAGatouille index created in {local_db_path}:"+str(vectorstore))
-
-    # Return vectorstore or docs
-    if index_name:
-        return vectorstore
-    else:
-        return docs_out
-def delete_index(index_type,index_name,
-                 local_db_path='../db'):
-    """
-    Deletes an existing Pinecone index with the given index_name.
-    """
-    if index_type=="Pinecone":
-        # Import and initialize Pinecone client
-        pinecone.init(
-            api_key=PINECONE_API_KEY
-        )
-        try:
-            pinecone.describe_index(index_name)
-            logging.info(f"Index {index_name} exists.")
-        except:
-            raise Exception(f"Index {index_name} does not exist, cannot delete.")
-        else:
-            pinecone.delete_index(index_name)
-            logging.info(f"Index {index_name} deleted.")
-    elif index_type=="ChromaDB":
-        # Delete existing collection
-        logging.info(f"Deleting index {index_name}.")
-        persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-        persistent_client.delete_collection(name=index_name)
-        logging.info("Index deleted.")
-    elif index_type=="RAGatouille":
-        raise NotImplementedError
-def batch_upsert(index_type,vectorstore,docs_out,batch_size=50):
-    # Batch insert the chunks into the vector store
-    for i in range(0, len(docs_out), batch_size):
-        chunk_batch = docs_out[i:i + batch_size]
-        if index_type=="Pinecone":
-            vectorstore.add_documents(chunk_batch)
-        elif index_type=="ChromaDB":
-            vectorstore.add_documents(chunk_batch)  # Happens to be same for chroma/pinecone, leaving if statement just in case
-    return vectorstore
-def has_meaningful_content(page):
-    """
-    Test whether the page has more than 30% words and is more than 5 words.
-    """
-    text=page.page_content
-    num_words = len(text.split())
-    alphanumeric_pct = sum(c.isalnum() for c in text) / len(text)
-    if num_words < 5 or alphanumeric_pct < 0.3:
-        return False
-    else:
-        return True
-def embedding_size(embedding_model):
-    """
-    Returns the embedding size of the model.
-    """
-    if isinstance(embedding_model,OpenAIEmbeddings):
-        return 1536 # https://platform.openai.com/docs/models/embeddings, test-embedding-ada-002
-    elif isinstance(embedding_model,VoyageEmbeddings):
-        return 1024 # https://docs.voyageai.com/embeddings/, voyage-02
-    else:
-        raise NotImplementedError
-def process_chunk(json_file,llm,
-                  clean_data=False,tag_data=False,question_data=False):
-    docs_out=[]
-    with open(json_file, "r") as file_in:
-        file_data = [json.loads(line) for line in file_in]
-    # Process the file data and put it into the same format as docs_out
-    for line in file_data:
-        doc_temp = lancghain_Document(page_content=line['page_content'],
-                                      source=line['metadata']['source'],
-                                      page=line['metadata']['page'],
-                                      metadata=line['metadata'])
-        docs_out.append(doc_temp)
-    # clean data
-    # tag data
-    # question data
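Note: a minimal usage sketch of how the deleted data_import.py entry points fit together, mirroring the calls made by 2_Document_Upload.py below; the paths, index name (from the deleted index_data.json), and embedding model name are illustrative, and OPENAI_API_KEY must be set in the environment:

import glob
from langchain_openai import OpenAIEmbeddings
import data_import  # the deleted module above

# Chunk every AMS PDF to a jsonl file, then upsert into a local ChromaDB index.
docs = glob.glob('../data/AMS/*.pdf')
query_model = OpenAIEmbeddings(model='text-embedding-ada-002')
data_import.load_docs('ChromaDB',
                      docs,
                      query_model=query_model,
                      index_name='chromadb-openai-ams',
                      chunk_size=500,
                      chunk_overlap=0,
                      file='../data/AMS/ams_data.jsonl',
                      local_db_path='../db')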
scripts/pages/1_Chatbot_AMS_Modular.py
DELETED
@@ -1,160 +0,0 @@
-import queries, setup
-
-import os
-import time
-import logging
-import json
-
-import pinecone
-import openai
-
-from langchain_community.vectorstores import Pinecone
-from langchain_community.vectorstores import Chroma
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from langchain_openai import OpenAI, ChatOpenAI
-from langchain_community.llms import HuggingFaceHub
-
-from ragatouille import RAGPretrainedModel
-
-import streamlit as st
-
-# Set up the page, enable logging, read environment variables
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_1_chatbot_ams_modular.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Aerospace Chatbot: Modular',
-    layout='wide'
-)
-st.title('Aerospace Mechanisms Chatbot')
-with st.expander('''What's under the hood?'''):
-    st.markdown('''
-    This chatbot will look up from all Aerospace Mechanism Symposia in the following location: https://github.com/dsmueller3760/aerospace_chatbot/tree/main/data/AMS
-    Example questions:
-    * What are examples of latch failures which have occurred due to improper fitup?
-    * What are examples of lubricants which should be avoided for space mechanism applications?
-    ''')
-filter_toggle=st.checkbox('Filter response with last received sources?')
-
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      vector_databases=True,
-                      embeddings=True,
-                      rag_type=True,
-                      index_name=True,
-                      llm=True,
-                      model_options=True,
-                      secret_keys=True)
-
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# Set up chat history
-if 'qa_model_obj' not in st.session_state:
-    st.session_state.qa_model_obj = []
-if 'message_id' not in st.session_state:
-    st.session_state.message_id = 0
-if 'messages' not in st.session_state:
-    st.session_state.messages = []
-for message in st.session_state.messages:
-    with st.chat_message(message['role']):
-        st.markdown(message['content'])
-
-# Define chat
-if prompt := st.chat_input('Prompt here'):
-    # User prompt
-    st.session_state.messages.append({'role': 'user', 'content': prompt})
-    with st.chat_message('user'):
-        st.markdown(prompt)
-    # Assistant response
-    with st.chat_message('assistant'):
-        message_placeholder = st.empty()
-
-        with st.status('Generating response...') as status:
-            t_start=time.time()
-
-            st.session_state.message_id += 1
-            st.write('Starting reponse generation for message: '+str(st.session_state.message_id))
-            logging.info('Starting reponse generation for message: '+str(st.session_state.message_id))
-
-            # Process some items
-            if sb['model_options']['output_level'] == 'Concise':
-                out_token = 50
-            else:
-                out_token = 516
-            logging.info('Output tokens: '+str(out_token))
-
-            if st.session_state.message_id==1:
-                # Define embeddings
-                if sb['query_model']=='Openai':
-                    query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-                elif sb['query_model']=='Voyage':
-                    query_model=VoyageEmbeddings(model=sb['embedding_name'],voyage_api_key=secrets['VOYAGE_API_KEY'])
-                elif sb['index_type']=='RAGatouille':
-                    query_model=RAGPretrainedModel.from_index(sb['keys']['LOCAL_DB_PATH']+'/.ragatouille/colbert/indexes/'+sb['index_name'])
-                logging.info('Query model set: '+str(query_model))
-
-                # Define LLM
-                if sb['llm_source']=='OpenAI':
-                    llm = ChatOpenAI(model_name=sb['llm_model'],
-                                     temperature=sb['model_options']['temperature'],
-                                     openai_api_key=secrets['OPENAI_API_KEY'],
-                                     max_tokens=out_token)
-                elif sb['llm_source']=='Hugging Face':
-                    llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                                         model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-                logging.info('LLM model set: '+str(llm))
-
-                # Initialize QA model object
-                if 'search_type' in sb['model_options']:
-                    search_type=sb['model_options']['search_type']
-                else:
-                    search_type=None
-                st.session_state.qa_model_obj=queries.QA_Model(sb['index_type'],
-                                                               sb['index_name'],
-                                                               query_model,
-                                                               llm,
-                                                               k=sb['model_options']['k'],
-                                                               search_type=search_type,
-                                                               filter_arg=False,
-                                                               local_db_path=sb['keys']['LOCAL_DB_PATH'])
-                logging.info('QA model object set: '+str(st.session_state.qa_model_obj))
-            if st.session_state.message_id>1:
-                logging.info('Updating model with sidebar settings...')
-                # Update LLM
-                if sb['llm_source']=='OpenAI':
-                    llm = ChatOpenAI(model_name=sb['llm_model'],
-                                     temperature=sb['model_options']['temperature'],
-                                     openai_api_key=secrets['OPENAI_API_KEY'],
-                                     max_tokens=out_token)
-                elif sb['llm_source']=='Hugging Face':
-                    llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                                         model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-                logging.info('LLM model set: '+str(llm))
-
-                st.session_state.qa_model_obj.update_model(llm,
-                                                           k=sb['model_options']['k'],
-                                                           search_type=sb['model_options']['search_type'],
-                                                           filter_arg=filter_toggle)
-                logging.info('QA model object updated: '+str(st.session_state.qa_model_obj))
-
-            st.write('Searching vector database, generating prompt...')
-            logging.info('Searching vector database, generating prompt...')
-            st.session_state.qa_model_obj.query_docs(prompt)
-            ai_response=st.session_state.qa_model_obj.result['answer'].content
-            message_placeholder.markdown(ai_response)
-            t_delta=time.time() - t_start
-            status.update(label='Prompt generated in '+"{:10.3f}".format(t_delta)+' seconds', state='complete', expanded=False)
-
-    st.session_state.messages.append({'role': 'assistant', 'content': ai_response})
-    logging.info(f'Messaging complete for {st.session_state.message_id}.')
-
-# Add reset button
-if st.button('Restart session'):
-    st.session_state.qa_model_obj = []
-    st.session_state.message_id = 0
-    st.session_state.messages = []
scripts/pages/2_Document_Upload.py
DELETED
@@ -1,112 +0,0 @@
-import data_import, setup
-
-import os
-import time
-import logging
-import glob
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from ragatouille import RAGPretrainedModel
-
-import streamlit as st
-
-# Set up the page, enable logging, read environment variables
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_2_document_upload.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Upload PDFs',
-    layout='wide'
-)
-st.title('Upload PDFs')
-
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      vector_databases=True,
-                      embeddings=True,
-                      index_name=True,
-                      secret_keys=True)
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# Populate the main screen
-logging.info(f'index_type test, {sb["index_type"]}')
-
-if sb["index_type"]=='RAGatouille':
-    logging.info('Set hugging face model for queries.')
-    query_model=sb['query_model']
-elif sb['query_model']=='Openai' or 'Voyage':
-    logging.info('Set embeddings model for queries.')
-    if sb['query_model']=='Openai':
-        query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-    elif sb['query_model']=='Voyage':
-        query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
-logging.info('Query model set: '+str(query_model))
-
-# Find docs
-index_name_md=st.markdown('Enter a directory relative to the current directory, or an absolute path.')
-data_folder = st.text_input('Enter a directory','../data/AMS/')
-if not os.path.isdir(data_folder):
-    st.error('The entered directory does not exist')
-docs = glob.glob(data_folder+'*.pdf')   # Only get the PDFs in the directory
-st.markdown('PDFs found: '+str(docs))
-st.markdown('Number of PDFs found: ' + str(len(docs)))
-logging.info('Docs: '+str(docs))
-
-# Add an expandable box for options
-with st.expander("Options"):
-    use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True)
-    json_file=st.text_input('Jsonl file',data_folder+'ams_data.jsonl')
-    clear_database = st.checkbox('Clear existing database?')
-    chunk_method= st.selectbox('Chunk method', ['tiktoken_recursive'], index=0)
-    if sb['query_model']=='Openai' or 'ChromaDB':
-        # OpenAI will time out if the batch size is too large
-        batch_size=st.number_input('Batch size for upsert', min_value=1, step=1, value=100)
-    else:
-        batch_size=None
-    if chunk_method=='tiktoken_recursive':
-        chunk_size=st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500)
-        chunk_overlap=st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0)
-    else:
-        raise NotImplementedError
-
-# Add a button to run the function
-if st.button('Chunk docs to jsonl file'):
-    start_time = time.time()  # Start the timer
-    data_import.chunk_docs(docs,
-                           file=json_file,
-                           chunk_method=chunk_method,
-                           chunk_size=chunk_size,
-                           chunk_overlap=chunk_overlap,
-                           use_json=False)
-    end_time = time.time()  # Stop the timer
-    elapsed_time = end_time - start_time
-    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-if st.button('Load docs into vector database'):
-    start_time = time.time()  # Start the timer
-    data_import.load_docs(sb['index_type'],
-                          docs,
-                          query_model=query_model,
-                          index_name=sb['index_name'],
-                          chunk_size=chunk_size,
-                          chunk_overlap=chunk_overlap,
-                          use_json=use_json,
-                          clear=clear_database,
-                          file=json_file,
-                          batch_size=batch_size,
-                          local_db_path=sb['keys']['LOCAL_DB_PATH'])
-    end_time = time.time()  # Stop the timer
-    elapsed_time = end_time - start_time
-    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-# Add a button to delete the index
-if st.button('Delete existing index'):
-    start_time = time.time()  # Start the timer
-    data_import.delete_index(sb['index_type'],
-                             sb['index_name'],
-                             local_db_path=sb['keys']['LOCAL_DB_PATH'])
-    end_time = time.time()  # Stop the timer
-    elapsed_time = end_time - start_time
-    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
scripts/pages/3_Visualize_Data.py
DELETED
@@ -1,123 +0,0 @@
-import setup
-
-import time
-import logging
-from datetime import datetime
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from ragxplorer import RAGxplorer
-
-import streamlit as st
-
-# Set up the page, enable logging, read environment variables
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_3_visualize_data.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Visualize Data',
-    layout='wide'
-)
-st.title('Visualize Data')
-
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      vector_databases=True,
-                      embeddings=True,
-                      index_name=True,
-                      secret_keys=True)
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# Set up session state variables
-if 'client' not in st.session_state:
-    st.session_state.client = None
-
-# Populate the main screen
-logging.info(f'index_type test, {sb["index_type"]}')
-
-if sb["index_type"]=='RAGatouille':
-    raise Exception('Only index type ChromaDB is supported for this function.')
-elif sb["index_type"]=='Pinecone':
-    raise Exception('Only index type ChromaDB is supported for this function.')
-elif sb['query_model']=='Openai' or 'Voyage':
-    logging.info('Set embeddings model for queries.')
-    if sb['query_model']=='Openai':
-        query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-    elif sb['query_model']=='Voyage':
-        query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
-logging.info('Query model set: '+str(query_model))
-
-st.info('You must have created a database using Document Upload in ChromaDB for this to work.')
-
-# Add an expandable with description of what's going on.
-with st.expander("Under the hood",expanded=True):
-    st.markdown('''
-    Uses modified version of https://github.com/gabrielchua/RAGxplorer/tree/main?tab=readme-ov-file to connect to existing database created.
-    Modified version here: https://github.com/dsmueller3760/RAGxplorer/tree/load_db
-    Assumes that chroma databases are located in local_db_path variable.
-    Query size in database: Take a random sample of this size from the database to visualize.
-    ''')
-
-with st.expander("Create visualization data",expanded=True):
-    # Add a button to run the function
-    limit_size = st.checkbox('Limit size of data visualization?', value=True)
-    if limit_size:
-        vector_qty=st.number_input('Query size in database', min_value=1, step=10, value=50)
-    else:
-        vector_qty=None
-    export_df = st.checkbox('Export visualization data?', value=True)
-    if export_df:
-        current_time = datetime.now().strftime("%Y.%m.%d.%H.%M")
-        if limit_size:
-            df_export_path = st.text_input('Export file', f'../data/AMS/ams_data-400-0-{vector_qty}.json')
-        else:
-            df_export_path=st.text_input('Export file', f'../data/AMS/ams_data-400-0-all.json')
-    if st.button('Create visualization data'):
-        start_time = time.time()  # Start the timer
-
-        st.session_state.client = RAGxplorer(embedding_model=sb['embedding_name'])
-        st.session_state.client.load_db(path_to_db=sb['keys']['LOCAL_DB_PATH']+'/chromadb/',
-                                        index_name=sb['index_name'],
-                                        df_export_path=df_export_path,
-                                        vector_qty=vector_qty,
-                                        umap_params={'n_neighbors': 5,
-                                                     'n_components': 2,
-                                                     'random_state':42},
-                                        verbose=True)
-
-        end_time = time.time()  # Stop the timer
-        elapsed_time = end_time - start_time
-        st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-
-with st.expander("Visualize data",expanded=True):
-    import_data = st.checkbox('Import visualization data?', value=True)
-    if import_data:
-        import_file = st.file_uploader("Import file", type="json")
-        if import_file is None:
-            # Use a default file
-            import_file_path=st.text_input('Import file',df_export_path)
-        else:
-            # Use the uploaded file
-            import_file_path=st.text_input('Import file',f'../data/AMS/{import_file.name}')
-    else:
-        import_file_path=None
-
-    query = st.text_input('Query', 'What are examples of lubricants which should be avoided for space mechanism applications?')
-
-    if st.button('Visualize data'):
-        start_time = time.time()  # Start the timer
-
-        if st.session_state.client is None:
-            st.session_state.client = RAGxplorer(embedding_model=sb['embedding_name'])
-
-        fig = st.session_state.client.visualize_query(query,
-                                                      path_to_db=sb['keys']['LOCAL_DB_PATH']+'/chromadb/',
-                                                      viz_data_df_path=import_file_path,
-                                                      verbose=True)
-        st.plotly_chart(fig,use_container_width=True)
-
-        end_time = time.time()  # Stop the timer
-        elapsed_time = end_time - start_time
scripts/pages/4_Clean_and_Question.py
DELETED
@@ -1,86 +0,0 @@
-import setup
-import data_import
-
-import time
-import logging
-import json
-from datetime import datetime
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import VoyageEmbeddings
-
-from langchain_openai import OpenAI, ChatOpenAI
-from langchain_community.llms import HuggingFaceHub
-
-import streamlit as st
-
-# Set up the page, enable logging
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-logging.basicConfig(filename='app_4_clean_and_question.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
-# Set the page title
-st.set_page_config(
-    page_title='Clean and Question Data',
-    layout='wide'
-)
-st.title('Clean and Question Data')
-# TODO: add database status icons
-sb=setup.load_sidebar(config_file='../config/config.json',
-                      index_data_file='../config/index_data.json',
-                      llm=True,
-                      model_options=True,
-                      secret_keys=True)
-secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar
-
-# This is janky but works (needs secrets to initialize properly)
-from ragxplorer import RAGxplorer
-
-# Set up session state variables
-if 'client' not in st.session_state:
-    st.session_state.client = None
-
-# Populate the main screen
-# Add an expandable with description of what's going on.
-with st.expander("Under the hood",expanded=True):
-    st.markdown('''
-
-    ''')
-
-chunked_file = st.text_input('Chunked raw text file', f'../data/AMS/ams_data-400-0.jsonl')
-
-with st.expander("Process Chunked Data",expanded=True):
-    clean_data = st.checkbox('Clean data?', value=True)
-    tag_data = st.checkbox('Tag data?', value=True)
-    question_data = st.checkbox('Generate questions from data?', value=True)
-    if sb['model_options']['output_level'] == 'Concise':
-        out_token = 50
-    else:
-        out_token = 516
-
-    # Define LLM
-    if sb['llm_source']=='OpenAI':
-        llm = ChatOpenAI(model_name=sb['llm_model'],
-                         temperature=sb['model_options']['temperature'],
-                         openai_api_key=secrets['OPENAI_API_KEY'],
-                         max_tokens=out_token)
-    elif sb['llm_source']=='Hugging Face':
-        llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                             model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-
-    if clean_data or tag_data or question_data:
-        param_cleaning=None
-    if clean_data:
-        n_tags=None
-    if question_data:
-        n_questions=None
-
-    if st.button('Process chunked data'):
-        start_time = time.time()  # Start the timer
-
-        data_import.process_chunk(chunked_file,llm,
-                                  clean_data=False,tag_data=False,question_data=False)
-
-        end_time = time.time()  # Stop the timer
-        elapsed_time = end_time - start_time
-        st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
scripts/prompts.py
DELETED
@@ -1,12 +0,0 @@
-from langchain import hub
-from langchain.prompts.prompt import PromptTemplate
-
-# Prompts on the hub: https://smith.langchain.com/hub/my-prompts?organizationId=45eb8917-7353-4296-978d-bb461fc45c65
-CONDENSE_QUESTION_PROMPT = hub.pull("dmueller/ams-chatbot-qa-condense-history")
-QA_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval")
-QA_WSOURCES_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval-wsources")
-QA_GENERATE_PROMPT=hub.pull("dmueller/generate_qa_prompt")
-
-# Prompts defined here only
-DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
-TEST_QUERY_PROMPT='What are examples of adhesives to use when potting motors for launch vehicle or spacecraft mechanisms?'
scripts/queries.py
DELETED
@@ -1,278 +0,0 @@
-import os
-import logging
-import re
-
-from dotenv import load_dotenv, find_dotenv
-
-import openai
-import pinecone
-import chromadb
-
-from langchain_community.vectorstores import Pinecone
-from langchain_community.vectorstores import Chroma
-
-from langchain.memory import ConversationBufferMemory
-
-from operator import itemgetter
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnableLambda, RunnablePassthrough
-from langchain.schema import format_document
-from langchain_core.messages import get_buffer_string
-
-from prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT, DEFAULT_DOCUMENT_PROMPT, TEST_QUERY_PROMPT
-
-# Set secrets from environment file
-OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')
-VOYAGE_API_KEY=os.getenv('VOYAGE_API_KEY')
-PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
-HUGGINGFACEHUB_API_TOKEN=os.getenv('HUGGINGFACEHUB_API_TOKEN')
-
-# Class and functions
-class QA_Model:
-    def __init__(self,
-                 index_type,
-                 index_name,
-                 query_model,
-                 llm,
-                 k=6,
-                 search_type='similarity',
-                 fetch_k=50,
-                 temperature=0,
-                 chain_type='stuff',
-                 filter_arg=False,
-                 local_db_path='../db'):
-
-        self.index_type=index_type
-        self.index_name=index_name
-        self.query_model=query_model
-        self.llm=llm
-        self.k=k
-        self.search_type=search_type
-        self.fetch_k=fetch_k
-        self.temperature=temperature
-        self.chain_type=chain_type
-        self.filter_arg=filter_arg
-        self.sources=[]
-
-        load_dotenv(find_dotenv(),override=True)
-
-        # Define retriever search parameters
-        search_kwargs = _process_retriever_args(self.filter_arg,
-                                                self.sources,
-                                                self.search_type,
-                                                self.k,
-                                                self.fetch_k)
-
-        # Read in from the vector database
-        if index_type=='Pinecone':
-            pinecone.init(
-                api_key=PINECONE_API_KEY
-            )
-            logging.info('Chat pinecone index name: '+str(index_name))
-            logging.info('Chat query model: '+str(query_model))
-            index = pinecone.Index(index_name)
-            self.vectorstore = Pinecone(index,query_model,'page_content')
-            logging.info('Chat vectorstore: '+str(self.vectorstore))
-
-            # Test query
-            try:
-                test_query = self.vectorstore.similarity_search(TEST_QUERY_PROMPT)
-            except:
-                raise Exception("Pinecone vector database is not configured properly. Test query failed. Likely the index does not exist.")
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Pinecone vector database is not configured properly. Test query failed.")
-            else:
-                logging.info('Test query succeeded!')
-
-            self.retriever=self.vectorstore.as_retriever(search_type=search_type,
-                                                         search_kwargs=search_kwargs)
-            logging.info('Chat retriever: '+str(self.retriever))
-        elif index_type=='ChromaDB':
-            logging.info('Chat chroma index name: '+str(index_name))
-            logging.info('Chat query model: '+str(query_model))
-            persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-            self.vectorstore = Chroma(client=persistent_client,
-                                      collection_name=index_name,
-                                      embedding_function=query_model)
-            logging.info('Chat vectorstore: '+str(self.vectorstore))
-
-            # Test query
-            try:
-                test_query = self.vectorstore.similarity_search(TEST_QUERY_PROMPT)
-            except:
-                raise Exception("Chroma vector database is not configured properly. Test query failed. Likely the index does not exist.")
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-            else:
-                logging.info('Test query succeeded!')
-
-            self.retriever=self.vectorstore.as_retriever(search_type=search_type,
-                                                         search_kwargs=search_kwargs)
-            logging.info('Chat retriever: '+str(self.retriever))
-        elif index_type=='RAGatouille':
-            # Easy because the index is picked up directly.
-            self.vectorstore=query_model
-            logging.info('Chat query model:'+str(query_model))
-
-            # Test query
-            try:
-                test_query = self.vectorstore.search(TEST_QUERY_PROMPT)
-            except:
-                raise Exception("RAGatouille vector database is not configured properly.")
-            logging.info('Test query: '+str(test_query))
-            if not test_query:
-                raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-            else:
-                logging.info('Test query succeeded!')
-
-            self.retriever=self.vectorstore.as_langchain_retriever()
-            logging.info('Chat retriever: '+str(self.retriever))
-
-        # Intialize memory
-        self.memory = ConversationBufferMemory(
-            return_messages=True, output_key='answer', input_key='question')
-        logging.info('Memory: '+str(self.memory))
-
-        # Assemble main chain
-        self.conversational_qa_chain=_define_qa_chain(self.llm,
-                                                      self.retriever,
-                                                      self.memory,
-                                                      self.search_type,
-                                                      search_kwargs)
-    def query_docs(self,query):
-        self.memory.load_memory_variables({})
-        logging.info('Memory content before qa result: '+str(self.memory))
-
-        logging.info('Query: '+str(query))
-        self.result = self.conversational_qa_chain.invoke({'question': query})
-        logging.info('QA result: '+str(self.result))
-
-        if self.index_type!='RAGatouille':
-            self.sources = '\n'.join(str(data.metadata) for data in self.result['references'])
-            self.result['answer'].content += '\nSources: \n'+self.sources
-            logging.info('Sources: '+str(self.sources))
-            logging.info('Response with sources: '+str(self.result['answer'].content))
-        else:
-            # RAGatouille doesn't have metadata, need to extract from context first.
-            extracted_metadata = []
-            pattern = r'\{([^}]*)\}(?=[^{}]*$)'   # Regular expression pattern to match the last curly braces
-
-            for ref in self.result['references']:
-                match = re.search(pattern, ref.page_content)
-                if match:
-                    extracted_metadata.append("{"+match.group(1)+"}")
-            self.sources = '\n'.join(extracted_metadata)
-            self.result['answer'].content += '\nSources: \n'+self.sources
-            logging.info('Sources: '+str(self.sources))
-            logging.info('Response with sources: '+str(self.result['answer'].content))
170 |
-
|
171 |
-
self.memory.save_context({'question': query}, {'answer': self.result['answer'].content})
|
172 |
-
logging.info('Memory content after qa result: '+str(self.memory))
|
173 |
-
|
174 |
-
def update_model(self,
|
175 |
-
llm,
|
176 |
-
k=6,
|
177 |
-
search_type='similarity',
|
178 |
-
fetch_k=50,
|
179 |
-
filter_arg=False):
|
180 |
-
|
181 |
-
self.llm=llm
|
182 |
-
self.k=k
|
183 |
-
self.search_type=search_type
|
184 |
-
self.fetch_k=fetch_k
|
185 |
-
self.filter_arg=filter_arg
|
186 |
-
|
187 |
-
# Define retriever search parameters
|
188 |
-
search_kwargs = _process_retriever_args(self.filter_arg,
|
189 |
-
self.sources,
|
190 |
-
self.search_type,
|
191 |
-
self.k,
|
192 |
-
self.fetch_k)
|
193 |
-
# Update conversational retrieval chain
|
194 |
-
self.conversational_qa_chain=_define_qa_chain(self.llm,
|
195 |
-
self.retriever,
|
196 |
-
self.memory,
|
197 |
-
self.search_type,
|
198 |
-
search_kwargs)
|
199 |
-
logging.info('Updated qa chain: '+str(self.conversational_qa_chain))
|
200 |
-
|
201 |
-
# Internal functions
|
202 |
-
def _combine_documents(docs,
|
203 |
-
document_prompt=DEFAULT_DOCUMENT_PROMPT,
|
204 |
-
document_separator='\n\n'):
|
205 |
-
'''
|
206 |
-
Combine a list of documents into a single string.
|
207 |
-
'''
|
208 |
-
# TODO: this would be where stuff, map reduce, etc. would go
|
209 |
-
doc_strings = [format_document(doc, document_prompt) for doc in docs]
|
210 |
-
return document_separator.join(doc_strings)
|
211 |
-
def _define_qa_chain(llm,
|
212 |
-
retriever,
|
213 |
-
memory,
|
214 |
-
search_type,
|
215 |
-
search_kwargs):
|
216 |
-
'''
|
217 |
-
Define the conversational QA chain.
|
218 |
-
'''
|
219 |
-
# This adds a 'memory' key to the input object
|
220 |
-
loaded_memory = RunnablePassthrough.assign(
|
221 |
-
chat_history=RunnableLambda(memory.load_memory_variables)
|
222 |
-
| itemgetter('history'))
|
223 |
-
logging.info('Loaded memory: '+str(loaded_memory))
|
224 |
-
|
225 |
-
# Assemble main chain
|
226 |
-
standalone_question = {
|
227 |
-
'standalone_question': {
|
228 |
-
'question': lambda x: x['question'],
|
229 |
-
'chat_history': lambda x: get_buffer_string(x['chat_history'])}
|
230 |
-
| CONDENSE_QUESTION_PROMPT
|
231 |
-
| llm
|
232 |
-
| StrOutputParser()}
|
233 |
-
logging.info('Condense inputs as a standalong question: '+str(standalone_question))
|
234 |
-
retrieved_documents = {
|
235 |
-
'source_documents': itemgetter('standalone_question')
|
236 |
-
| retriever,
|
237 |
-
'question': lambda x: x['standalone_question']}
|
238 |
-
logging.info('Retrieved documents: '+str(retrieved_documents))
|
239 |
-
# Now we construct the inputs for the final prompt
|
240 |
-
final_inputs = {
|
241 |
-
'context': lambda x: _combine_documents(x['source_documents']),
|
242 |
-
'question': itemgetter('question')}
|
243 |
-
logging.info('Combined documents: '+str(final_inputs))
|
244 |
-
# And finally, we do the part that returns the answers
|
245 |
-
answer = {
|
246 |
-
'answer': final_inputs
|
247 |
-
| QA_PROMPT
|
248 |
-
| llm,
|
249 |
-
'references': itemgetter('source_documents')}
|
250 |
-
conversational_qa_chain = loaded_memory | standalone_question | retrieved_documents | answer
|
251 |
-
logging.info('Conversational QA chain: '+str(conversational_qa_chain))
|
252 |
-
return conversational_qa_chain
|
253 |
-
def _process_retriever_args(filter_arg,
|
254 |
-
sources,
|
255 |
-
search_type,
|
256 |
-
k,
|
257 |
-
fetch_k):
|
258 |
-
'''
|
259 |
-
Process arguments for retriever.
|
260 |
-
'''
|
261 |
-
# Implement filter
|
262 |
-
if filter_arg:
|
263 |
-
filter_list = list(set(item['source'] for item in sources[-1]))
|
264 |
-
filter_items=[]
|
265 |
-
for item in filter_list:
|
266 |
-
filter_item={'source': item}
|
267 |
-
filter_items.append(filter_item)
|
268 |
-
filter={'$or':filter_items}
|
269 |
-
else:
|
270 |
-
filter=None
|
271 |
-
|
272 |
-
# Impement filtering and number of documents to return
|
273 |
-
if search_type=='mmr':
|
274 |
-
search_kwargs={'k':k,'fetch_k':fetch_k,'filter':filter} # See as_retriever docs for parameters
|
275 |
-
else:
|
276 |
-
search_kwargs={'k':k,'filter':filter} # See as_retriever docs for parameters
|
277 |
-
|
278 |
-
return search_kwargs
|
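The deleted scripts/queries.py above assembled an LCEL pipeline (standalone-question condensing, retrieval, then answer synthesis with appended sources) around whichever vector store was selected. For orientation, a minimal sketch of how QA_Model was driven from the app pages; the embedding wrapper, LLM wrapper, and index name below are illustrative assumptions, not taken from this commit:

    # Sketch only: assumes a ChromaDB collection named 'ams-openai' already exists
    # under ../db/chromadb and that API keys are loaded from .env.
    from langchain_community.embeddings import OpenAIEmbeddings  # assumed embedding choice
    from langchain_community.chat_models import ChatOpenAI       # assumed LLM choice
    from queries import QA_Model

    qa = QA_Model(index_type='ChromaDB',
                  index_name='ams-openai',        # hypothetical index name
                  query_model=OpenAIEmbeddings(),
                  llm=ChatOpenAI(temperature=0))
    qa.query_docs('What failure modes affect deployable boom mechanisms?')
    print(qa.result['answer'].content)            # answer text plus appended 'Sources:' block

Repeated calls to query_docs reuse the ConversationBufferMemory, so follow-up questions are condensed against the running chat history before retrieval.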
scripts/setup.py
DELETED
@@ -1,168 +0,0 @@
-import os
-import logging
-import json
-
-import openai
-
-import streamlit as st
-
-# Set up the page, enable logging
-from dotenv import load_dotenv,find_dotenv
-load_dotenv(find_dotenv(),override=True)
-
-def load_sidebar(config_file,
-                 index_data_file,
-                 vector_databases=False,
-                 embeddings=False,
-                 rag_type=False,
-                 index_name=False,
-                 llm=False,
-                 model_options=False,
-                 secret_keys=False):
-    """
-    Sets up the sidebar based on toggled options. Returns variables with options.
-    """
-    sb_out={}
-    with open(config_file, 'r') as f:
-        config = json.load(f)
-        databases = {db['name']: db for db in config['databases']}
-        llms = {m['name']: m for m in config['llms']}
-        logging.info('Loaded: '+config_file)
-    with open(index_data_file, 'r') as f:
-        index_data = json.load(f)
-        logging.info('Loaded: '+index_data_file)
-
-    if vector_databases:
-        # Vector databases
-        st.sidebar.title('Vector database')
-        sb_out['index_type']=st.sidebar.selectbox('Index type', list(databases.keys()), index=1)
-        logging.info('Index type: '+sb_out['index_type'])
-
-    if embeddings:
-        # Embeddings
-        st.sidebar.title('Embeddings')
-        if sb_out['index_type']=='RAGatouille':  # Default to selecting hugging face model for RAGatouille, otherwise select alternates
-            sb_out['query_model']=st.sidebar.selectbox('Hugging face rag models', databases[sb_out['index_type']]['hf_rag_models'], index=0)
-        else:
-            sb_out['query_model']=st.sidebar.selectbox('Embedding models', databases[sb_out['index_type']]['embedding_models'], index=0)
-
-        if sb_out['query_model']=='Openai':
-            sb_out['embedding_name']='text-embedding-ada-002'
-        elif sb_out['query_model']=='Voyage':
-            sb_out['embedding_name']='voyage-02'
-        logging.info('Query type: '+sb_out['query_model'])
-        if 'embedding_name' in sb_out:
-            logging.info('Embedding name: '+sb_out['embedding_name'])
-    if rag_type:
-        if sb_out['index_type']!='RAGatouille':  # RAGatouille doesn't have a rag_type
-            # RAG Type
-            st.sidebar.title('RAG Type')
-            sb_out['rag_type']=st.sidebar.selectbox('RAG type', config['rag_types'], index=0)
-            sb_out['smart_agent']=st.sidebar.checkbox('Smart agent?')
-            logging.info('RAG type: '+sb_out['rag_type'])
-            logging.info('Smart agent: '+str(sb_out['smart_agent']))
-    if index_name:
-        # Index Name
-        st.sidebar.title('Index Name')
-        sb_out['index_name']=index_data[sb_out['index_type']][sb_out['query_model']]
-        st.sidebar.markdown('Index name: '+sb_out['index_name'])
-        logging.info('Index name: '+sb_out['index_name'])
-    if llm:
-        # LLM
-        st.sidebar.title('LLM')
-        sb_out['llm_source']=st.sidebar.selectbox('LLM model', list(llms.keys()), index=0)
-        logging.info('LLM source: '+sb_out['llm_source'])
-        if sb_out['llm_source']=='OpenAI':
-            sb_out['llm_model']=st.sidebar.selectbox('OpenAI model', llms[sb_out['llm_source']]['models'], index=0)
-        if sb_out['llm_source']=='Hugging Face':
-            sb_out['llm_model']=st.sidebar.selectbox('Hugging Face model', llms[sb_out['llm_source']]['models'], index=0)
-    if model_options:
-        # Add input fields in the sidebar
-        st.sidebar.title('LLM Options')
-        temperature = st.sidebar.slider('Temperature', min_value=0.0, max_value=2.0, value=0.0, step=0.1)
-        output_level = st.sidebar.selectbox('Level of Output', ['Concise', 'Detailed'], index=1)
-
-        if 'index_type' in sb_out:
-            st.sidebar.title('Retrieval Options')
-            k = st.sidebar.number_input('Number of items per prompt', min_value=1, step=1, value=4)
-            if sb_out['index_type']!='RAGatouille':
-                search_type = st.sidebar.selectbox('Search Type', ['similarity', 'mmr'], index=0)
-            else:
-                search_type = None  # RAGatouille manages retrieval internally
-            sb_out['model_options']={'output_level':output_level,
-                                     'k':k,
-                                     'search_type':search_type,
-                                     'temperature':temperature}
-        else:
-            sb_out['model_options']={'output_level':output_level,
-                                     'temperature':temperature}
-        logging.info('Model options: '+str(sb_out['model_options']))
-    if secret_keys:
-        # Add a section for secret keys
-        st.sidebar.title('Secret keys')
-        st.sidebar.markdown('If .env file is in directory, will use that first.')
-        sb_out['keys']={}
-        if 'llm_source' in sb_out and sb_out['llm_source'] == 'OpenAI':
-            sb_out['keys']['OPENAI_API_KEY'] = st.sidebar.text_input('OpenAI API Key', type='password')
-        elif 'query_model' in sb_out and sb_out['query_model'] == 'Openai':
-            sb_out['keys']['OPENAI_API_KEY'] = st.sidebar.text_input('OpenAI API Key', type='password')
-        if 'llm_source' in sb_out and sb_out['llm_source']=='Hugging Face':
-            sb_out['keys']['HUGGINGFACEHUB_API_TOKEN'] = st.sidebar.text_input('Hugging Face API Key', type='password')
-        if 'query_model' in sb_out and sb_out['query_model']=='Voyage':
-            sb_out['keys']['VOYAGE_API_KEY'] = st.sidebar.text_input('Voyage API Key', type='password')
-        if 'index_type' in sb_out and sb_out['index_type']=='Pinecone':
-            sb_out['keys']['PINECONE_API_KEY']=st.sidebar.text_input('Pinecone API Key',type='password')
-        if os.getenv('LOCAL_DB_PATH') is None:
-            sb_out['keys']['LOCAL_DB_PATH'] = st.sidebar.text_input('Local Database Path','/data',help='Path to local database (e.g. chroma)')
-            os.environ['LOCAL_DB_PATH'] = sb_out['keys']['LOCAL_DB_PATH']
-        else:
-            sb_out['keys']['LOCAL_DB_PATH'] = os.getenv('LOCAL_DB_PATH')
-            st.sidebar.markdown('Local Database Path: '+sb_out['keys']['LOCAL_DB_PATH'],help='From .env file.')
-
-    return sb_out
-
-def set_secrets(sb):
-    """
-    Sets secrets from environment file, or from sidebar if not available.
-    """
-    secrets={}
-
-    secrets['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
-    logging.info('OpenAI API Key: '+str(secrets['OPENAI_API_KEY']))
-    if not secrets['OPENAI_API_KEY'] and 'keys' in sb and 'OPENAI_API_KEY' in sb['keys']:
-        logging.info('Setting OpenAI API Key from sidebar...')
-        secrets['OPENAI_API_KEY'] = sb['keys']['OPENAI_API_KEY']
-        os.environ['OPENAI_API_KEY'] = secrets['OPENAI_API_KEY']
-        logging.info('OpenAI API Key: '+str(os.environ['OPENAI_API_KEY']))
-        if os.environ['OPENAI_API_KEY']=='':
-            raise Exception('OpenAI API Key is required.')
-    openai.api_key = secrets['OPENAI_API_KEY']
-
-    secrets['VOYAGE_API_KEY'] = os.getenv('VOYAGE_API_KEY')
-    logging.info('Voyage API Key: '+str(secrets['VOYAGE_API_KEY']))
-    if not secrets['VOYAGE_API_KEY'] and 'keys' in sb and 'VOYAGE_API_KEY' in sb['keys']:
-        logging.info('Setting Voyage API Key from sidebar...')
-        secrets['VOYAGE_API_KEY'] = sb['keys']['VOYAGE_API_KEY']
-        os.environ['VOYAGE_API_KEY'] = secrets['VOYAGE_API_KEY']
-        logging.info('Voyage API Key: '+str(os.environ['VOYAGE_API_KEY']))
-        if os.environ['VOYAGE_API_KEY']=='':
-            raise Exception('Voyage API Key is required.')
-
-    secrets['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
-    logging.info('Pinecone API Key: '+str(secrets['PINECONE_API_KEY']))
-    if not secrets['PINECONE_API_KEY'] and 'keys' in sb and 'PINECONE_API_KEY' in sb['keys']:
-        logging.info('Setting Pinecone API Key from sidebar...')
-        secrets['PINECONE_API_KEY'] = sb['keys']['PINECONE_API_KEY']
-        os.environ['PINECONE_API_KEY'] = secrets['PINECONE_API_KEY']
-        logging.info('Pinecone API Key: '+str(os.environ['PINECONE_API_KEY']))
-        if os.environ['PINECONE_API_KEY']=='':
-            raise Exception('Pinecone API Key is required.')
-
-    secrets['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    logging.info('Hugging Face API Key: '+str(secrets['HUGGINGFACEHUB_API_TOKEN']))
-    if not secrets['HUGGINGFACEHUB_API_TOKEN'] and 'keys' in sb and 'HUGGINGFACEHUB_API_TOKEN' in sb['keys']:
-        logging.info('Setting Hugging Face API Key from sidebar...')
-        secrets['HUGGINGFACEHUB_API_TOKEN'] = sb['keys']['HUGGINGFACEHUB_API_TOKEN']
-        os.environ['HUGGINGFACEHUB_API_TOKEN'] = secrets['HUGGINGFACEHUB_API_TOKEN']
-        logging.info('Hugging Face API Key: '+str(os.environ['HUGGINGFACEHUB_API_TOKEN']))
-        if os.environ['HUGGINGFACEHUB_API_TOKEN']=='':
-            raise Exception('Hugging Face API Key is required.')
-    return secrets
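For reference, a minimal sketch of how the deleted load_sidebar and set_secrets helpers were typically called from a Streamlit page. The config paths point at the config/config.json and config/index_data.json files removed in this same commit; the exact toggles and relative paths are assumptions based on the function signatures:

    # Sketch only: paths assumed relative to a page under scripts/pages/.
    from setup import load_sidebar, set_secrets

    sb = load_sidebar('../../config/config.json',
                      '../../config/index_data.json',
                      vector_databases=True,
                      embeddings=True,
                      index_name=True,
                      llm=True,
                      model_options=True,
                      secret_keys=True)
    secrets = set_secrets(sb)  # falls back to sidebar inputs when .env keys are missing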