Spaces:
Runtime error
Runtime error
thisisishara
committed on
Commit
•
0fac726
1
Parent(s):
7ec1ff7
init commit
Browse files- .env.template +11 -0
- .gitignore +163 -0
- .streamlit/config.toml +6 -0
- Dockerfile +13 -0
- LICENSE +21 -0
- app.py +294 -0
- backup/docker/kb_openai_ishara.faiss +0 -0
- backup/docker/kb_openai_ishara.pkl +3 -0
- backup/windows/kb_openai_ishara.faiss +0 -0
- backup/windows/kb_openai_ishara.pkl +3 -0
- build_knowledgebase.py +94 -0
- chat.py +92 -0
- knowledgebase.py +203 -0
- knowledgebases/.gitkeep +0 -0
- knowledgebases/kb_openai_ishara.faiss +0 -0
- knowledgebases/kb_openai_ishara.pkl +3 -0
- requirements.txt +10 -0
- utils/__init__.py +0 -0
- utils/constants.py +65 -0
- utils/llm.py +42 -0
.env.template
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ["hf", "openai"]
|
2 |
+
ASSISTANT_TYPE=openai
|
3 |
+
EMBEDDING_TYPE=hf
|
4 |
+
|
5 |
+
# if openai
|
6 |
+
OPENAI_API_KEY=sk-xxxxx
|
7 |
+
OPENAI_KNOWLEDGEBASE=kb_openai
|
8 |
+
|
9 |
+
# if hf
|
10 |
+
HUGGINGFACEHUB_API_TOKEN=hf_xxxxx
|
11 |
+
HF_KNOWLEDGEBASE=kb_hf
|
.gitignore
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
.idea/
|
161 |
+
|
162 |
+
|
163 |
+
# App-specific
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
primaryColor="#2c7b2c"
|
3 |
+
backgroundColor="#171e1a"
|
4 |
+
secondaryBackgroundColor="#111811"
|
5 |
+
textColor="#cfd8dc"
|
6 |
+
font="sans serif"
|
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10.9
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
COPY requirements.txt .
|
6 |
+
|
7 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
8 |
+
|
9 |
+
COPY . .
|
10 |
+
|
11 |
+
EXPOSE 8501
|
12 |
+
|
13 |
+
CMD ["streamlit", "run", "app.py", "--server.port", "8501"]
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Ishara Dissanayake
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
app.py
ADDED
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
|
5 |
+
import streamlit as st
|
6 |
+
from streamlit.logger import get_logger
|
7 |
+
|
8 |
+
from knowledgebase import Knowledgebase
|
9 |
+
from utils.constants import (
|
10 |
+
AssistantType,
|
11 |
+
OPENAI_KNOWLEDGEBASE_KEY,
|
12 |
+
HUGGINGFACEHUB_API_TOKEN_KEY,
|
13 |
+
HF_KNOWLEDGEBASE_KEY,
|
14 |
+
SOURCES_TAG,
|
15 |
+
ANSWER_TAG,
|
16 |
+
NONE_TAG,
|
17 |
+
EMPTY_TAG,
|
18 |
+
MESSAGE_HISTORY_TAG,
|
19 |
+
TEXT_TAG,
|
20 |
+
USER_TAG,
|
21 |
+
ASSISTANT_TAG,
|
22 |
+
FROM_TAG,
|
23 |
+
IN_PROGRESS_TAG,
|
24 |
+
QUERY_INPUT_TAG,
|
25 |
+
VALID_TOKEN_TAG,
|
26 |
+
StNotificationType,
|
27 |
+
API_KEY_TAG,
|
28 |
+
ASSISTANT_TYPE_TAG,
|
29 |
+
ASSISTANT_AVATAR,
|
30 |
+
USER_AVATAR,
|
31 |
+
EmbeddingType,
|
32 |
+
APIKeyType,
|
33 |
+
)
|
34 |
+
from utils.llm import validate_api_token
|
35 |
+
|
36 |
+
# initialize a logger
|
37 |
+
logger = get_logger(__name__)
|
38 |
+
|
39 |
+
|
40 |
+
def retrieve_answer(query: str):
    """Query the knowledgebase for *query* and format the reply for the chat UI.

    Reads the verified credentials and knowledgebase name from the Streamlit
    session state, runs the retrieval chain, strips any trailing "SOURCES:"
    stub from the answer, and appends sources/cost details.  On any failure a
    generic apology message is returned instead of raising.
    """
    try:
        session = st.session_state
        knowledgebase = Knowledgebase(
            assistant_type=session.selected_assistant_type,
            # the app UI always serves embeddings via the HF hub
            embedding_type=EmbeddingType.HUGGINGFACE,
            assistant_api_key=session.verified_api_key,
            embedding_api_key=session.embedding_api_key,
            knowledgebase_name=session.knowledgebase_name,
        )
        answer, metadata = knowledgebase.query_knowledgebase(query=query)
        metadata = metadata or "$0.00"

        # drop a dangling "SOURCES:" label the LLM sometimes leaves at the end
        final_answer = re.sub(
            r"\bSOURCES:[\n\s]*$", "", str(answer[ANSWER_TAG]).strip()
        ).strip()
        logger.info(f"final answer: {final_answer}")

        if answer.get(SOURCES_TAG, None) in [None, NONE_TAG, EMPTY_TAG]:
            return f"{final_answer}\n\nCost:\n`{metadata}`"
        return f"{final_answer}\n\nSources:\n{answer[SOURCES_TAG]}\n\nCost (USD):\n`{metadata}`"
    except Exception as e:
        logger.exception(f"Invalid API key. {e}")
        return (
            f"Could not retrieve the answer. This could be due to "
            f"various reasons such as Invalid API Tokens or hitting "
            f"the Rate limit enforced by LLM vendors."
        )
|
75 |
+
|
76 |
+
|
77 |
+
def show_chat_ui():
    """Render the shared chat surface: input box, progress state and history."""
    history = st.session_state.get(MESSAGE_HISTORY_TAG, None)

    # one-time caveat when chatting against the HF assistant
    if st.session_state.selected_assistant_type == AssistantType.HUGGINGFACE and not history:
        show_notification_banner_ui(
            notification_type=StNotificationType.WARNING,
            notification="🤗🤏🏽 HuggingFace assistant is not always guaranteed "
            "to return a valid response and often exceeds the "
            "maximum token limit. Use the OpenAI assistant for "
            "more reliable responses.",
        )

    if not history:
        st.subheader("Let's start chatting, shall we?")

    # disable the input box while a previous query is still in flight
    query = st.chat_input(
        "Ask me about ShoutOUT AI stuff",
        key=QUERY_INPUT_TAG,
        disabled=st.session_state.get(IN_PROGRESS_TAG, False),
    )

    if query:
        st.session_state.in_progress = True
        transcript = st.session_state.get(MESSAGE_HISTORY_TAG, [])
        transcript.append({TEXT_TAG: query, FROM_TAG: USER_TAG})
        st.session_state.message_history = transcript
        reply = retrieve_answer(query=query)
        transcript.append({TEXT_TAG: reply, FROM_TAG: ASSISTANT_TAG})
        st.session_state.message_history = transcript
        st.session_state.in_progress = False

    # replay the whole conversation with per-role avatars
    if st.session_state.get(MESSAGE_HISTORY_TAG, None):
        for entry in st.session_state.message_history:
            sender = entry.get(FROM_TAG)
            if sender == USER_TAG:
                with st.chat_message(USER_TAG, avatar=USER_AVATAR):
                    st.write(entry.get(TEXT_TAG))
            if sender == ASSISTANT_TAG:
                with st.chat_message(ASSISTANT_TAG, avatar=ASSISTANT_AVATAR):
                    st.write(entry.get(TEXT_TAG))
120 |
+
|
121 |
+
|
122 |
+
def show_hf_chat_ui():
    """Sidebar notice + token check + chat surface for the HF assistant."""
    st.sidebar.info(
        "🤗 You are using the Hugging Face Hub models for the QA task and "
        "performance might not be as good as proprietary LLMs."
    )

    verify_token()
    token_state = st.session_state.get(VALID_TOKEN_TAG, None)
    if token_state is None:
        # validation produced no verdict yet — halt this Streamlit rerun
        st.stop()
    if not token_state:
        st.sidebar.error("❌ Failed to get connected to the HuggingFace Hub")
        show_notification_banner_ui(
            notification_type=StNotificationType.INFO,
            notification="Failed to get connected to the HuggingFace Hub",
        )
        st.stop()

    st.sidebar.success(f"✅ Connected to the HF Hub")
    show_chat_ui()
|
142 |
+
|
143 |
+
|
144 |
+
def show_openai_chat_ui():
    """Collect and validate the user's OpenAI key, then show the chat surface.

    The key is taken from a password-type sidebar input; validation results
    are read back from ``st.session_state[VALID_TOKEN_TAG]`` as written by
    ``verify_token``.  The Streamlit rerun is stopped until a valid key is
    provided.
    """
    st.sidebar.info(
        "🚀 To get started, enter your OpenAI API key. Once that's done, "
        # fixed typo: was "you can ask start asking questions"
        "you can start asking questions. Oh! one more thing, we take "
        "security seriously and we are NOT storing the API keys in any manner, "
        "so you're safe. Just revoke it after usage to make sure nothing "
        "unexpected happens."
    )
    if st.sidebar.text_input(
        "Enter the OpenAI API Key",
        key=API_KEY_TAG,
        label_visibility="hidden",
        placeholder="OpenAI API Key",
        type="password",
    ):
        verify_token()

    validated_token = st.session_state.get(VALID_TOKEN_TAG, None)
    if validated_token is None:
        # no key entered yet — prompt and halt this rerun
        st.sidebar.info("🗝️ Provide the API Key")
        st.stop()
    if not validated_token:
        st.sidebar.error("❌ API Key you provided is invalid")
        show_notification_banner_ui(
            notification_type=StNotificationType.INFO,
            notification="Please provide a valid OpenAI API Key",
        )
        st.stop()

    st.sidebar.success("✅ Token Validated!")
    show_chat_ui()
|
175 |
+
|
176 |
+
|
177 |
+
def show_notification_banner_ui(
    notification_type: StNotificationType, notification: str
):
    """Render *notification* as a Streamlit banner of the requested severity.

    Unknown notification types render nothing, matching the original
    if/elif chain's fall-through behavior.
    """
    renderers = {
        StNotificationType.INFO: st.info,
        StNotificationType.WARNING: st.warning,
        StNotificationType.ERROR: st.error,
    }
    render = renderers.get(notification_type)
    if render is not None:
        render(notification)
|
186 |
+
|
187 |
+
|
188 |
+
def verify_token():
    """Validate the assistant and embedding API keys for the current session.

    Reads the assistant key from the sidebar input (OpenAI) or from the
    environment (HF); the embedding key always comes from the HF hub token
    env var.  Results are written back to session state: ``valid_token``
    (bool), plus ``verified_api_key`` / ``embedding_api_key`` /
    ``knowledgebase_name`` on success, or ``token_err`` on failure.
    """
    from dotenv import load_dotenv

    load_dotenv()

    # embeddings are always backed by the HF hub, regardless of assistant type
    embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
    st_assistant_type = st.session_state.selected_assistant_type
    if st_assistant_type == AssistantType.OPENAI:
        # OpenAI assistants use the user-supplied key from the sidebar input
        assistant_api_key = st.session_state.get(API_KEY_TAG, None)
        assistant_api_key_type = APIKeyType.OPENAI
        knowledgebase_name = os.environ.get(OPENAI_KNOWLEDGEBASE_KEY, None)
    else:
        assistant_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
        assistant_api_key_type = APIKeyType.HUGGINGFACE
        knowledgebase_name = os.environ.get(HF_KNOWLEDGEBASE_KEY, None)

    # SECURITY FIX: the original logged the raw API key in plaintext;
    # log only whether a key is present.
    logger.info(
        f"API key present for the current st session: {assistant_api_key is not None}\n"
        f"The Knowledgebase for the current st session: {knowledgebase_name}"
    )

    assistant_valid, assistant_err = validate_api_token(
        api_key_type=assistant_api_key_type,
        api_key=assistant_api_key,
    )
    embedding_valid, embedding_err = validate_api_token(
        api_key_type=APIKeyType.HUGGINGFACE,
        api_key=embedding_api_key,
    )

    if assistant_valid and embedding_valid:
        st.session_state.valid_token = True
        st.session_state.verified_api_key = assistant_api_key
        st.session_state.embedding_api_key = embedding_api_key
        st.session_state.knowledgebase_name = knowledgebase_name
    elif not assistant_valid and not embedding_valid:
        st.session_state.valid_token = False
        st.session_state.token_err = f"{assistant_err}\n{embedding_err}"
    elif not assistant_valid:
        st.session_state.valid_token = False
        st.session_state.token_err = assistant_err
    else:
        # only the embedding key failed validation.  (The original had a
        # final "unknown error" else-branch, which was unreachable — all
        # four valid/invalid combinations are covered above.)
        st.session_state.valid_token = False
        st.session_state.token_err = embedding_err
|
237 |
+
|
238 |
+
|
239 |
+
def app():
    """Top-level Streamlit page: sidebar assistant picker plus chat area."""
    # sidebar
    st.sidebar.image(
        "https://thisisishara.com/res/images/favicon/android-chrome-192x192.png",
        width=80,
    )
    if st.sidebar.selectbox(
        "Assistant Type",
        ["OpenAI", "Hugging Face"],
        key=ASSISTANT_TYPE_TAG,
        placeholder="Select Assistant Type",
    ):
        picked_openai = (
            str(st.session_state.assistant_type).lower() == AssistantType.OPENAI.value
        )
        st.session_state.selected_assistant_type = (
            AssistantType.OPENAI if picked_openai else AssistantType.HUGGINGFACE
        )
        # switching assistants invalidates any previously verified credentials
        st.session_state.valid_token = None
        st.session_state.verified_api_key = None
        st.session_state.knowledgebase_name = None

        # NOTE(review): writes the selected enum to the page — looks like a
        # leftover debug statement; confirm before removing.
        st.write(st.session_state.selected_assistant_type)

    # main section
    st.header("LLM Website QA Demo")
    st.caption("⚡ Powered by :blue[LangChain], :green[OpenAI] & :green[Hugging Face]")

    assistant_type = st.session_state.selected_assistant_type
    if assistant_type == AssistantType.OPENAI:
        show_openai_chat_ui()
    elif assistant_type == AssistantType.HUGGINGFACE:
        show_hf_chat_ui()
    else:
        show_notification_banner_ui(
            notification_type=StNotificationType.INFO,
            notification="Please select an assistant type to get started!",
        )
|
275 |
+
|
276 |
+
|
277 |
+
if __name__ == "__main__":
    # page chrome must be configured before any other Streamlit call
    st.set_page_config(
        page_title="Website QA powered by LangChain & LLMs",
        page_icon="https://thisisishara.com/res/images/favicon/android-chrome-192x192.png",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    # custom CSS: recolor the top decoration bar (menu/footer hiding is
    # commented out inside the style block itself)
    custom_css = """
    <style>
    # #MainMenu {visibility: hidden;}
    # footer {visibility: hidden;}
    [data-testid="stDecoration"] {background: linear-gradient(to right, #9EE51A, #208BBC) !important;}
    </style>
    """
    st.markdown(custom_css, unsafe_allow_html=True)

    # run the app
    app()
|
backup/docker/kb_openai_ishara.faiss
ADDED
Binary file (218 kB). View file
|
|
backup/docker/kb_openai_ishara.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:540dfd6d7bda272777ee04edb931074a548217f1abbd76e772d6a36dea44c5bc
|
3 |
+
size 40432
|
backup/windows/kb_openai_ishara.faiss
ADDED
Binary file (218 kB). View file
|
|
backup/windows/kb_openai_ishara.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16937df0644608b4c1a5d3c6dcfa3cbd12b8afd3dac9d35d5654ac5d727ffefb
|
3 |
+
size 40432
|
build_knowledgebase.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Build the FAISS knowledgebase from a fixed set of personal-site URLs.

Assistant/embedding types and the target index name come from the
environment (see .env.template); the embedding API token is validated
before any crawling happens.
"""
import logging
import os
import sys

from dotenv import load_dotenv

from knowledgebase import create_knowledgebase
from utils.constants import (
    ASSISTANT_TYPE_KEY,
    AssistantType,
    OPENAI_API_TOKEN_KEY,
    HUGGINGFACEHUB_API_TOKEN_KEY,
    OPENAI_KNOWLEDGEBASE_KEY,
    HF_KNOWLEDGEBASE_KEY,
    ENV_FILE,
    EMBEDDING_TYPE_KEY,
    EmbeddingType,
    APIKeyType,
)
from utils.llm import validate_api_token

logger = logging.getLogger(__name__)

# load the .env
load_dotenv(dotenv_path=os.path.join(os.getcwd(), ENV_FILE))

# Pages crawled into the knowledgebase.  The original script duplicated an
# identical list in both embedding branches; collapsed into one constant.
CRAWL_URLS = [
    "https://thisisishara.com/",
    "https://github.com/thisisishara",
    "https://github.com/thisisishara?tab=repositories",
    "https://www.hackerrank.com/thisisishara?hr_r=1",
    "https://www.npmjs.com/~thisisishara",
    "https://pypi.org/user/thisisishara/",
    "https://www.linkedin.com/in/isharadissanayake/",
]


if __name__ == "__main__":
    # initialize the knowledgebase
    logger.info("⚡ Initializing the URLs...")

    # determine assistant/embedding types from the environment (HF default)
    assistant_type = os.getenv(ASSISTANT_TYPE_KEY, AssistantType.HUGGINGFACE.value)
    embedding_type = os.getenv(EMBEDDING_TYPE_KEY, EmbeddingType.HUGGINGFACE.value)

    if assistant_type == AssistantType.OPENAI.value:
        assistant_type = AssistantType.OPENAI
        knowledgebase_name = os.environ.get(OPENAI_KNOWLEDGEBASE_KEY, None)

        if embedding_type == EmbeddingType.OPENAI.value:
            embedding_type = EmbeddingType.OPENAI
            embedding_api_key = os.getenv(OPENAI_API_TOKEN_KEY, None)
            embedding_api_key_type = APIKeyType.OPENAI
        else:
            embedding_type = EmbeddingType.HUGGINGFACE
            embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
            embedding_api_key_type = APIKeyType.HUGGINGFACE
    else:
        # HF assistants always pair with HF embeddings
        assistant_type = AssistantType.HUGGINGFACE
        knowledgebase_name = os.environ.get(HF_KNOWLEDGEBASE_KEY, None)
        embedding_type = EmbeddingType.HUGGINGFACE
        embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
        embedding_api_key_type = APIKeyType.HUGGINGFACE

    urls = CRAWL_URLS

    logger.info("🗝️ Validating the embedding API token...")
    embedding_valid, embedding_err = validate_api_token(
        api_key_type=embedding_api_key_type, api_key=embedding_api_key
    )
    if not embedding_valid:
        logger.error(embedding_err)
        sys.exit(1)

    create_knowledgebase(
        urls=urls,
        assistant_type=assistant_type,
        embedding_type=embedding_type,
        embedding_api_key=embedding_api_key,
        knowledgebase_name=knowledgebase_name,
    )

    logger.info("✅ Knowledgebase created")
|
chat.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""CLI entry point: ask the knowledgebase a single question from the shell."""
import argparse
import logging
import os
import sys

from dotenv import load_dotenv

from knowledgebase import Knowledgebase
from utils.constants import (
    ENV_FILE,
    ASSISTANT_TYPE_KEY,
    AssistantType,
    OPENAI_API_TOKEN_KEY,
    OPENAI_KNOWLEDGEBASE_KEY,
    HUGGINGFACEHUB_API_TOKEN_KEY,
    HF_KNOWLEDGEBASE_KEY,
    QUERY_TAG,
    ANSWER_TAG,
    SOURCES_TAG,
    EMBEDDING_TYPE_KEY,
    APIKeyType,
    EmbeddingType,
)
from utils.llm import validate_api_token

# load the .env
load_dotenv(dotenv_path=os.path.join(os.getcwd(), ENV_FILE))

logger = logging.getLogger(__name__)


if __name__ == "__main__":
    # resolve assistant/embedding configuration from the environment
    env_assistant = os.getenv(ASSISTANT_TYPE_KEY, AssistantType.HUGGINGFACE.value)
    env_embedding = os.getenv(EMBEDDING_TYPE_KEY, EmbeddingType.HUGGINGFACE.value)

    if env_assistant == AssistantType.OPENAI.value:
        assistant_type = AssistantType.OPENAI
        assistant_api_key = os.environ.get(OPENAI_API_TOKEN_KEY, None)
        assistant_api_key_type = APIKeyType.OPENAI
        knowledgebase_name = os.environ.get(OPENAI_KNOWLEDGEBASE_KEY, None)

        if env_embedding == EmbeddingType.OPENAI.value:
            # OpenAI embeddings reuse the assistant key
            embedding_type = EmbeddingType.OPENAI
            embedding_api_key = assistant_api_key
            embedding_api_key_type = APIKeyType.OPENAI
        else:
            embedding_type = EmbeddingType.HUGGINGFACE
            embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
            embedding_api_key_type = APIKeyType.HUGGINGFACE
    else:
        # HF assistants always pair with HF embeddings and share the token
        assistant_type = AssistantType.HUGGINGFACE
        assistant_api_key = os.environ.get(HUGGINGFACEHUB_API_TOKEN_KEY, None)
        assistant_api_key_type = APIKeyType.HUGGINGFACE
        knowledgebase_name = os.environ.get(HF_KNOWLEDGEBASE_KEY, None)
        embedding_type = EmbeddingType.HUGGINGFACE
        embedding_api_key = assistant_api_key
        embedding_api_key_type = APIKeyType.HUGGINGFACE

    logger.info("🗝️ Validating the API tokens...")
    assistant_valid, assistant_err = validate_api_token(
        api_key_type=assistant_api_key_type, api_key=assistant_api_key
    )
    if not assistant_valid:
        logger.error(assistant_err)
        sys.exit(1)

    embedding_valid, embedding_err = validate_api_token(
        api_key_type=embedding_api_key_type, api_key=embedding_api_key
    )
    if not embedding_valid:
        logger.error(embedding_err)
        sys.exit(1)

    # a single positional argument: the question to ask
    parser = argparse.ArgumentParser(description="LLM Website QA - CLI")
    parser.add_argument(
        QUERY_TAG, type=str, help="Question to be asked from the assistant"
    )
    question = parser.parse_args().query

    knowledgebase = Knowledgebase(
        assistant_type=assistant_type,
        embedding_type=embedding_type,
        assistant_api_key=assistant_api_key,
        embedding_api_key=embedding_api_key,
        knowledgebase_name=knowledgebase_name,
    )
    result, metadata = knowledgebase.query_knowledgebase(query=question)

    print(f"\nAnswer: \n{str(result.get(ANSWER_TAG, '').strip())}")
    print(f"\nSources: \n{str(result.get(SOURCES_TAG, '').strip())}")
    print(f"\nCost: \n{metadata}")
|
knowledgebase.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
from langchain.callbacks import get_openai_callback
|
4 |
+
from langchain.chains import RetrievalQAWithSourcesChain
|
5 |
+
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
6 |
+
from langchain.llms import OpenAIChat, HuggingFaceHub
|
7 |
+
from langchain.text_splitter import CharacterTextSplitter
|
8 |
+
from langchain.vectorstores import FAISS
|
9 |
+
from streamlit.logger import get_logger
|
10 |
+
|
11 |
+
from utils.constants import (
|
12 |
+
KNOWLEDGEBASE_DIR,
|
13 |
+
AssistantType,
|
14 |
+
BS_HTML_PARSER,
|
15 |
+
TEXT_TAG,
|
16 |
+
SOURCE_TAG,
|
17 |
+
ANSWER_TAG,
|
18 |
+
QUESTION_TAG,
|
19 |
+
HF_TEXT_GENERATION_REPO_ID,
|
20 |
+
EmbeddingType,
|
21 |
+
TOTAL_TOKENS_TAG,
|
22 |
+
PROMPT_TOKENS_TAG,
|
23 |
+
COMPLETION_TOKENS_TAG,
|
24 |
+
TOTAL_COST_TAG,
|
25 |
+
OPENAI_CHAT_COMPLETIONS_MODEL,
|
26 |
+
)
|
27 |
+
|
28 |
+
logger = get_logger(__name__)
|
29 |
+
|
30 |
+
|
31 |
+
def extract_text_from(url_: str):
    """Fetch *url_* and return its visible text, one non-empty line per row."""
    page_html = requests.get(url_).text
    soup = BeautifulSoup(page_html, features=BS_HTML_PARSER)

    # strip whitespace per line and drop blank lines
    stripped_lines = (raw.strip() for raw in soup.get_text().splitlines())
    return "\n".join(line for line in stripped_lines if line)
|
38 |
+
|
39 |
+
|
40 |
+
def create_knowledgebase(
    urls: list,
    assistant_type: AssistantType,
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
) -> None:
    """Scrape *urls*, chunk the text, embed it, and persist a FAISS index.

    The index is written under ``KNOWLEDGEBASE_DIR`` as *knowledgebase_name*
    and can later be reloaded with `load_vectorstore`.

    Args:
        urls: Web pages to scrape for knowledgebase content.
        assistant_type: The assistant backend; a `hf` assistant always uses
            HuggingFace embeddings regardless of *embedding_type*.
        embedding_type: Preferred embedding backend (only honored when the
            assistant is OpenAI-based).
        embedding_api_key: API key/token for the chosen embedding backend.
        knowledgebase_name: Index name used when saving to disk.
    """
    # Download and clean every page up front, remembering each source URL so
    # answers can later cite where the text came from.
    pages: list[dict] = [
        {TEXT_TAG: extract_text_from(url_=url), SOURCE_TAG: url} for url in urls
    ]

    chunk_size = 500
    chunk_overlap = 30
    if assistant_type == AssistantType.OPENAI:
        if embedding_type == EmbeddingType.HUGGINGFACE:
            embeddings = HuggingFaceHubEmbeddings(
                huggingfacehub_api_token=embedding_api_key
            )
            logger.info("Using `hf` embeddings")
        else:
            embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
            logger.info("Using `openai` embeddings")
    else:
        # A HuggingFace assistant forces HF embeddings, ignoring
        # *embedding_type*.
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        logger.info(
            "Since the assistant type is set to `hf`, `hf` embeddings are used by default."
        )

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
    )

    docs, metadata = [], []
    for page in pages:
        splits = text_splitter.split_text(page[TEXT_TAG])
        docs.extend(splits)
        # One metadata entry per chunk, all pointing back to the same URL.
        metadata.extend([{SOURCE_TAG: page[SOURCE_TAG]}] * len(splits))
        # Logger instead of bare print(), for consistency with this module.
        logger.info(f"Split {page[SOURCE_TAG]} into {len(splits)} chunks")

    vectorstore = FAISS.from_texts(texts=docs, embedding=embeddings, metadatas=metadata)
    vectorstore.save_local(folder_path=KNOWLEDGEBASE_DIR, index_name=knowledgebase_name)
|
86 |
+
|
87 |
+
|
88 |
+
def load_vectorstore(
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
):
    """Load a previously saved FAISS index from ``KNOWLEDGEBASE_DIR``.

    The embedding backend must match the one used when the index was built
    (see `create_knowledgebase`), otherwise similarity search results will
    be meaningless.

    Args:
        embedding_type: Embedding backend to instantiate.
        embedding_api_key: API key/token for that backend.
        knowledgebase_name: Index name the store was saved under.

    Returns:
        The loaded FAISS vectorstore.
    """
    if embedding_type == EmbeddingType.OPENAI:
        embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
        logger.info("Using `openai` embeddings")
    else:
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        # Fixed log message: the previous one claimed this branch was
        # selected by the *assistant* type, but the condition above tests
        # the *embedding* type.
        logger.info("Using `hf` embeddings")

    store = FAISS.load_local(
        folder_path=KNOWLEDGEBASE_DIR,
        embeddings=embeddings,
        index_name=knowledgebase_name,
    )
    return store
|
109 |
+
|
110 |
+
|
111 |
+
def construct_query_response(result: dict) -> dict:
    """Wrap a raw chain *result* under the standard answer key."""
    response = {ANSWER_TAG: result}
    return response
|
113 |
+
|
114 |
+
|
115 |
+
class Knowledgebase:
    """Question-answering facade over a locally persisted FAISS vectorstore.

    Builds a `RetrievalQAWithSourcesChain` per query, backed by either an
    OpenAI chat model or a HuggingFace Hub model depending on
    `assistant_type`.
    """

    def __init__(
        self,
        assistant_type: AssistantType,
        embedding_type: EmbeddingType,
        assistant_api_key: str,
        embedding_api_key: str,
        knowledgebase_name: str,
    ):
        # Provider credentials are kept per-instance so each query can
        # authenticate against the selected backends.
        self.assistant_type = assistant_type
        self.embedding_type = embedding_type
        self.assistant_api_key = assistant_api_key
        self.embedding_api_key = embedding_api_key
        # FAISS index previously written by `create_knowledgebase`.
        self.knowledgebase = load_vectorstore(
            embedding_type=embedding_type,
            embedding_api_key=embedding_api_key,
            knowledgebase_name=knowledgebase_name,
        )

    def query_knowledgebase(self, query: str) -> tuple[dict, dict]:
        """Answer *query* via retrieval-augmented QA over the vectorstore.

        Returns:
            A ``(result, metadata)`` tuple: ``result`` is the chain's output
            dict (answer plus sources), ``metadata`` holds token/cost
            counters from the OpenAI callback. On any exception the error
            text is returned under ``ANSWER_TAG`` with empty metadata
            instead of raising.
        """
        try:
            # Only the last 4 characters of each key are logged so
            # credentials never appear in plain text in the logs.
            logger.info(
                f"The assistant API key for the current session: ***{self.assistant_api_key[-4:]}"
            )
            logger.info(
                f"The embedding API key for the current session: ***{self.embedding_api_key[-4:]}"
            )

            query = query.strip()
            if not query:
                # Empty/whitespace input: reply with a friendly nudge.
                return {
                    ANSWER_TAG: "Oh snap! did you hit send accidentally, because I can't see any questions 🤔",
                }, {}

            if self.assistant_type == AssistantType.OPENAI:
                llm = OpenAIChat(
                    model_name=OPENAI_CHAT_COMPLETIONS_MODEL,
                    temperature=0,
                    verbose=True,
                    openai_api_key=self.assistant_api_key,
                )
                # # this is deprecated
                # chain = VectorDBQAWithSourcesChain.from_llm(
                #     llm=llm,
                #     vectorstore=self.knowledgebase,
                #     max_tokens_limit=2048,
                #     k=2,
                #     reduce_k_below_max_tokens=True,
                # )
                # "stuff" chain: all retrieved chunks are concatenated into
                # a single prompt.
                chain = RetrievalQAWithSourcesChain.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=self.knowledgebase.as_retriever(),
                    reduce_k_below_max_tokens=True,
                    chain_type_kwargs={"verbose": True},
                )
            else:
                llm = HuggingFaceHub(
                    repo_id=HF_TEXT_GENERATION_REPO_ID,
                    model_kwargs={"temperature": 0.5, "max_length": 64},
                    huggingfacehub_api_token=self.assistant_api_key,
                    verbose=True,
                )
                # "refine" chain with a lower token cap — presumably to fit
                # the smaller context window of the HF model; confirm.
                chain = RetrievalQAWithSourcesChain.from_chain_type(
                    llm=llm,
                    chain_type="refine",
                    retriever=self.knowledgebase.as_retriever(),
                    max_tokens_limit=1024,
                    reduce_k_below_max_tokens=True,
                    chain_type_kwargs={"verbose": True},
                )

            # NOTE(review): the OpenAI callback wraps both backends; for the
            # HF branch the counters presumably stay at zero — confirm
            # before relying on this metadata.
            with get_openai_callback() as cb:
                result = chain({QUESTION_TAG: query})
                print(f"Total Tokens: {cb.total_tokens}")
                print(f"Prompt Tokens: {cb.prompt_tokens}")
                print(f"Completion Tokens: {cb.completion_tokens}")
                print(f"Total Cost (USD): ${cb.total_cost}")

                metadata = {
                    TOTAL_TOKENS_TAG: cb.total_tokens,
                    PROMPT_TOKENS_TAG: cb.prompt_tokens,
                    COMPLETION_TOKENS_TAG: cb.completion_tokens,
                    TOTAL_COST_TAG: cb.total_cost,
                }
            return result, metadata
        except Exception as e:
            # Surface the failure to the caller as an answer payload rather
            # than propagating the exception to the UI layer.
            logger.error(f"{e.__class__.__name__}: {e}")
            return {ANSWER_TAG: f"{e.__class__.__name__}: {e}"}, {}
|
knowledgebases/.gitkeep
ADDED
File without changes
|
knowledgebases/kb_openai_ishara.faiss
ADDED
Binary file (218 kB). View file
|
|
knowledgebases/kb_openai_ishara.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:540dfd6d7bda272777ee04edb931074a548217f1abbd76e772d6a36dea44c5bc
|
3 |
+
size 40432
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
openai~=0.27.8
|
2 |
+
streamlit~=1.25.0
|
3 |
+
streamlit-chat~=0.1.1
|
4 |
+
langchain~=0.0.238
|
5 |
+
bs4==0.0.1
|
6 |
+
tiktoken==0.3.0
|
7 |
+
faiss-cpu==1.7.4
|
8 |
+
requests~=2.31.0
|
9 |
+
python-dotenv==1.0.0
|
10 |
+
huggingface-hub==0.16.4
|
utils/__init__.py
ADDED
File without changes
|
utils/constants.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from enum import Enum
|
2 |
+
|
3 |
+
# Directory where FAISS knowledgebase indexes are persisted.
KNOWLEDGEBASE_DIR = "knowledgebases"
# BeautifulSoup parser backend used when scraping pages.
BS_HTML_PARSER = "html.parser"
# OpenAI model identifiers.
OPENAI_COMPLETIONS_MODEL = "gpt-3.5-turbo"
OPENAI_CHAT_COMPLETIONS_MODEL = "gpt-3.5-turbo"
# Cheap model used only for validating OpenAI API keys.
OPENAI_TEST_MODEL = "text-ada-001"
ENV_FILE = ".env"
# HuggingFace Hub repo used for text generation; alternatives kept below.
HF_TEXT_GENERATION_REPO_ID = "google/flan-t5-xxl"
# HF_TEXT_GENERATION_REPO_ID = "OpenAssistant/falcon-40b-sft-mix-1226"
# HF_TEXT_GENERATION_REPO_ID = "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
# Minimal prompt sent when validating an API token.
TEST_PROMPT = "test"

# Environment variable names (see .env.template).
ASSISTANT_TYPE_KEY = "ASSISTANT_TYPE"
EMBEDDING_TYPE_KEY = "EMBEDDING_TYPE"
OPENAI_API_TOKEN_KEY = "OPENAI_API_KEY"
HUGGINGFACEHUB_API_TOKEN_KEY = "HUGGINGFACEHUB_API_TOKEN"
OPENAI_KNOWLEDGEBASE_KEY = "OPENAI_KNOWLEDGEBASE"
HF_KNOWLEDGEBASE_KEY = "HF_KNOWLEDGEBASE"

# Dictionary keys / session-state tags shared across modules.
TEXT_TAG = "text"
SOURCE_TAG = "source"
SOURCES_TAG = "sources"
ANSWER_TAG = "answer"
QUESTION_TAG = "question"
QUERY_TAG = "query"
NONE_TAG = "None"
EMPTY_TAG = ""
MESSAGE_HISTORY_TAG = "message_history"
USER_TAG = "user"
ASSISTANT_TAG = "assistant"
FROM_TAG = "from"
IN_PROGRESS_TAG = "in_progress"
QUERY_INPUT_TAG = "query_input"
VALID_TOKEN_TAG = "valid_token"
API_KEY_TAG = "api_key"
ASSISTANT_TYPE_TAG = "assistant_type"
TOTAL_TOKENS_TAG = "total_tokens"
PROMPT_TOKENS_TAG = "prompt_tokens"
COMPLETION_TOKENS_TAG = "completion_tokens"
TOTAL_COST_TAG = "total_cost"

# Avatar images shown in the Streamlit chat UI.
USER_AVATAR = "https://i.imgur.com/Rf63hWt.png"
ASSISTANT_AVATAR = "https://i.imgur.com/NQwsRn2.png"
|
45 |
+
|
46 |
+
|
47 |
+
class AssistantType(Enum):
    """Which LLM backend powers the assistant."""

    HUGGINGFACE = "hf"
    OPENAI = "openai"
|
50 |
+
|
51 |
+
|
52 |
+
class APIKeyType(Enum):
    """Provider a user-supplied API key belongs to."""

    HUGGINGFACE = "hf"
    OPENAI = "openai"
|
55 |
+
|
56 |
+
|
57 |
+
class EmbeddingType(Enum):
    """Which backend produces vector embeddings for the knowledgebase."""

    HUGGINGFACE = "hf"
    OPENAI = "openai"
|
60 |
+
|
61 |
+
|
62 |
+
class StNotificationType(Enum):
    """Severity levels for Streamlit UI notifications."""

    INFO = "info"
    WARNING = "warning"
    # NOTE(review): "err" is inconsistent with the other values
    # ("info"/"warning" are spelled out) — confirm consumers expect "err"
    # rather than "error" before changing it.
    ERROR = "err"
|
utils/llm.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import streamlit as st
|
3 |
+
from huggingface_hub import InferenceClient
|
4 |
+
from streamlit.logger import get_logger
|
5 |
+
|
6 |
+
from utils.constants import APIKeyType, TEST_PROMPT, OPENAI_TEST_MODEL
|
7 |
+
|
8 |
+
logger = get_logger(__name__)
|
9 |
+
|
10 |
+
|
11 |
+
@st.cache_data(show_spinner=False)
def validate_api_token(api_key_type: APIKeyType, api_key: str) -> tuple[bool, str]:
    """Check that *api_key* is usable for the provider *api_key_type*.

    Issues a minimal one-token request against the provider. Results are
    cached by Streamlit so repeated checks with the same inputs are free.

    Returns:
        ``(True, "")`` when the key works, otherwise ``(False, reason)``.
    """
    # Guard clauses for missing inputs — no network call needed.
    if not api_key_type:
        return False, "API key type is not mentioned"
    if not api_key:
        return False, "Invalid API key detected"

    try:
        if api_key_type == APIKeyType.OPENAI:
            openai.Completion.create(
                model=OPENAI_TEST_MODEL,
                prompt=TEST_PROMPT,
                api_key=api_key,
                max_tokens=1,
            )
            logger.info("OpenAI token validated")
        else:
            hf_client = InferenceClient(token=api_key)
            hf_client.text_generation(prompt=TEST_PROMPT, max_new_tokens=1)
            logger.info("HuggingFace token validated")
    except Exception as e:
        logger.error(f"{e.__class__.__name__}: {e}")
        return False, f"{e.__class__.__name__}: {e}"
    return True, ""
|