thisisishara committed on
Commit
0fac726
1 Parent(s): 7ec1ff7

init commit

Browse files
.env.template ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ["hf", "openai"]
2
+ ASSISTANT_TYPE=openai
3
+ EMBEDDING_TYPE=hf
4
+
5
+ # if openai
6
+ OPENAI_API_KEY=sk-xxxxx
7
+ OPENAI_KNOWLEDGEBASE=kb_openai
8
+
9
+ # if hf
10
+ HUGGINGFACEHUB_API_TOKEN=hf_xxxxx
11
+ HF_KNOWLEDGEBASE=kb_hf
.gitignore ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+
163
+ # App-specific
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [theme]
2
+ primaryColor="#2c7b2c"
3
+ backgroundColor="#171e1a"
4
+ secondaryBackgroundColor="#111811"
5
+ textColor="#cfd8dc"
6
+ font="sans serif"
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10.9
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ COPY . .
10
+
11
+ EXPOSE 8501
12
+
13
+ CMD ["streamlit", "run", "app.py", "--server.port", "8501"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Ishara Dissanayake
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+
5
+ import streamlit as st
6
+ from streamlit.logger import get_logger
7
+
8
+ from knowledgebase import Knowledgebase
9
+ from utils.constants import (
10
+ AssistantType,
11
+ OPENAI_KNOWLEDGEBASE_KEY,
12
+ HUGGINGFACEHUB_API_TOKEN_KEY,
13
+ HF_KNOWLEDGEBASE_KEY,
14
+ SOURCES_TAG,
15
+ ANSWER_TAG,
16
+ NONE_TAG,
17
+ EMPTY_TAG,
18
+ MESSAGE_HISTORY_TAG,
19
+ TEXT_TAG,
20
+ USER_TAG,
21
+ ASSISTANT_TAG,
22
+ FROM_TAG,
23
+ IN_PROGRESS_TAG,
24
+ QUERY_INPUT_TAG,
25
+ VALID_TOKEN_TAG,
26
+ StNotificationType,
27
+ API_KEY_TAG,
28
+ ASSISTANT_TYPE_TAG,
29
+ ASSISTANT_AVATAR,
30
+ USER_AVATAR,
31
+ EmbeddingType,
32
+ APIKeyType,
33
+ )
34
+ from utils.llm import validate_api_token
35
+
36
+ # initialize a logger
37
+ logger = get_logger(__name__)
38
+
39
+
40
+ def retrieve_answer(query: str):
41
+ try:
42
+ assistant_type = st.session_state.selected_assistant_type
43
+ embedding_type = EmbeddingType.HUGGINGFACE
44
+ assistant_api_key = st.session_state.verified_api_key
45
+ embedding_api_key = st.session_state.embedding_api_key
46
+ knowledgebase_name = st.session_state.knowledgebase_name
47
+
48
+ knowledgebase = Knowledgebase(
49
+ assistant_type=assistant_type,
50
+ embedding_type=embedding_type,
51
+ assistant_api_key=assistant_api_key,
52
+ embedding_api_key=embedding_api_key,
53
+ knowledgebase_name=knowledgebase_name,
54
+ )
55
+ answer, metadata = knowledgebase.query_knowledgebase(query=query)
56
+ if not metadata:
57
+ metadata = "$0.00"
58
+
59
+ final_answer = re.sub(
60
+ r"\bSOURCES:[\n\s]*$", "", str(answer[ANSWER_TAG]).strip()
61
+ ).strip()
62
+ logger.info(f"final answer: {final_answer}")
63
+
64
+ if answer.get(SOURCES_TAG, None) not in [None, NONE_TAG, EMPTY_TAG]:
65
+ return f"{final_answer}\n\nSources:\n{answer[SOURCES_TAG]}\n\nCost (USD):\n`{metadata}`"
66
+ else:
67
+ return f"{final_answer}\n\nCost:\n`{metadata}`"
68
+ except Exception as e:
69
+ logger.exception(f"Invalid API key. {e}")
70
+ return (
71
+ f"Could not retrieve the answer. This could be due to "
72
+ f"various reasons such as Invalid API Tokens or hitting "
73
+ f"the Rate limit enforced by LLM vendors."
74
+ )
75
+
76
+
77
+ def show_chat_ui():
78
+ if (
79
+ st.session_state.selected_assistant_type == AssistantType.HUGGINGFACE
80
+ and not st.session_state.get(MESSAGE_HISTORY_TAG, None)
81
+ ):
82
+ show_notification_banner_ui(
83
+ notification_type=StNotificationType.WARNING,
84
+ notification="🤗🤏🏽 HuggingFace assistant is not always guaranteed "
85
+ "to return a valid response and often exceeds the "
86
+ "maximum token limit. Use the OpenAI assistant for "
87
+ "more reliable responses.",
88
+ )
89
+
90
+ if not st.session_state.get(MESSAGE_HISTORY_TAG, None):
91
+ st.subheader("Let's start chatting, shall we?")
92
+
93
+ if st.session_state.get(IN_PROGRESS_TAG, False):
94
+ query = st.chat_input(
95
+ "Ask me about ShoutOUT AI stuff", key=QUERY_INPUT_TAG, disabled=True
96
+ )
97
+ else:
98
+ query = st.chat_input("Ask me about ShoutOUT AI stuff", key=QUERY_INPUT_TAG)
99
+
100
+ if query:
101
+ st.session_state.in_progress = True
102
+ current_messages = st.session_state.get(MESSAGE_HISTORY_TAG, [])
103
+ current_messages.append({TEXT_TAG: query, FROM_TAG: USER_TAG})
104
+ st.session_state.message_history = current_messages
105
+ answer = retrieve_answer(query=query)
106
+ current_messages.append({TEXT_TAG: answer, FROM_TAG: ASSISTANT_TAG})
107
+ st.session_state.message_history = current_messages
108
+ st.session_state.in_progress = False
109
+
110
+ if st.session_state.get(MESSAGE_HISTORY_TAG, None):
111
+ messages = st.session_state.message_history
112
+ for message in messages:
113
+ if message.get(FROM_TAG) == USER_TAG:
114
+ with st.chat_message(USER_TAG, avatar=USER_AVATAR):
115
+ st.write(message.get(TEXT_TAG))
116
+
117
+ if message.get(FROM_TAG) == ASSISTANT_TAG:
118
+ with st.chat_message(ASSISTANT_TAG, avatar=ASSISTANT_AVATAR):
119
+ st.write(message.get(TEXT_TAG))
120
+
121
+
122
+ def show_hf_chat_ui():
123
+ st.sidebar.info(
124
+ "🤗 You are using the Hugging Face Hub models for the QA task and "
125
+ "performance might not be as good as proprietary LLMs."
126
+ )
127
+
128
+ verify_token()
129
+ validated_token = st.session_state.get(VALID_TOKEN_TAG, None)
130
+ if validated_token is None:
131
+ st.stop()
132
+ if not validated_token:
133
+ st.sidebar.error("❌ Failed to get connected to the HuggingFace Hub")
134
+ show_notification_banner_ui(
135
+ notification_type=StNotificationType.INFO,
136
+ notification="Failed to get connected to the HuggingFace Hub",
137
+ )
138
+ st.stop()
139
+
140
+ st.sidebar.success(f"✅ Connected to the HF Hub")
141
+ show_chat_ui()
142
+
143
+
144
+ def show_openai_chat_ui():
145
+ st.sidebar.info(
146
+ "🚀 To get started, enter your OpenAI API key. Once that's done, "
147
+ "you can ask start asking questions. Oh! one more thing, we take "
148
+ "security seriously and we are NOT storing the API keys in any manner, "
149
+ "so you're safe. Just revoke it after usage to make sure nothing "
150
+ "unexpected happens."
151
+ )
152
+ if st.sidebar.text_input(
153
+ "Enter the OpenAI API Key",
154
+ key=API_KEY_TAG,
155
+ label_visibility="hidden",
156
+ placeholder="OpenAI API Key",
157
+ type="password",
158
+ ):
159
+ verify_token()
160
+
161
+ validated_token = st.session_state.get(VALID_TOKEN_TAG, None)
162
+ if validated_token is None:
163
+ st.sidebar.info(f"🗝️ Provide the API Key")
164
+ st.stop()
165
+ if not validated_token:
166
+ st.sidebar.error("❌ API Key you provided is invalid")
167
+ show_notification_banner_ui(
168
+ notification_type=StNotificationType.INFO,
169
+ notification="Please provide a valid OpenAI API Key",
170
+ )
171
+ st.stop()
172
+
173
+ st.sidebar.success(f"✅ Token Validated!")
174
+ show_chat_ui()
175
+
176
+
177
+ def show_notification_banner_ui(
178
+ notification_type: StNotificationType, notification: str
179
+ ):
180
+ if notification_type == StNotificationType.INFO:
181
+ st.info(notification)
182
+ elif notification_type == StNotificationType.WARNING:
183
+ st.warning(notification)
184
+ elif notification_type == StNotificationType.ERROR:
185
+ st.error(notification)
186
+
187
+
188
+ def verify_token():
189
+ from dotenv import load_dotenv
190
+
191
+ load_dotenv()
192
+
193
+ embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
194
+ st_assistant_type = st.session_state.selected_assistant_type
195
+ if st_assistant_type == AssistantType.OPENAI:
196
+ assistant_api_key = st.session_state.get(API_KEY_TAG, None)
197
+ assistant_api_key_type = APIKeyType.OPENAI
198
+ knowledgebase_name = os.environ.get(OPENAI_KNOWLEDGEBASE_KEY, None)
199
+ else:
200
+ assistant_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
201
+ assistant_api_key_type = APIKeyType.HUGGINGFACE
202
+ knowledgebase_name = os.environ.get(HF_KNOWLEDGEBASE_KEY, None)
203
+
204
+ logger.info(
205
+ f"The API key for the current st session: {assistant_api_key}\n"
206
+ f"The Knowledgebase for the current st session: {knowledgebase_name}"
207
+ )
208
+
209
+ assistant_valid, assistant_err = validate_api_token(
210
+ api_key_type=assistant_api_key_type,
211
+ api_key=assistant_api_key,
212
+ )
213
+ embedding_valid, embedding_err = validate_api_token(
214
+ api_key_type=APIKeyType.HUGGINGFACE,
215
+ api_key=embedding_api_key,
216
+ )
217
+
218
+ if assistant_valid and embedding_valid:
219
+ st.session_state.valid_token = True
220
+ st.session_state.verified_api_key = assistant_api_key
221
+ st.session_state.embedding_api_key = embedding_api_key
222
+ st.session_state.knowledgebase_name = knowledgebase_name
223
+ elif not assistant_valid and not embedding_valid:
224
+ st.session_state.valid_token = False
225
+ st.session_state.token_err = f"{assistant_err}\n{embedding_err}"
226
+ elif not assistant_valid:
227
+ st.session_state.valid_token = False
228
+ st.session_state.token_err = assistant_err
229
+ elif not embedding_valid:
230
+ st.session_state.valid_token = False
231
+ st.session_state.token_err = embedding_err
232
+ else:
233
+ st.session_state.valid_token = False
234
+ st.session_state.token_err = (
235
+ "An unknown error occurred while validating the API keys"
236
+ )
237
+
238
+
239
+ def app():
240
+ # sidebar
241
+ st.sidebar.image(
242
+ "https://thisisishara.com/res/images/favicon/android-chrome-192x192.png",
243
+ width=80,
244
+ )
245
+ if st.sidebar.selectbox(
246
+ "Assistant Type",
247
+ ["OpenAI", "Hugging Face"],
248
+ key=ASSISTANT_TYPE_TAG,
249
+ placeholder="Select Assistant Type",
250
+ ):
251
+ if str(st.session_state.assistant_type).lower() == AssistantType.OPENAI.value:
252
+ st.session_state.selected_assistant_type = AssistantType.OPENAI
253
+ else:
254
+ st.session_state.selected_assistant_type = AssistantType.HUGGINGFACE
255
+ st.session_state.valid_token = None
256
+ st.session_state.verified_api_key = None
257
+ st.session_state.knowledgebase_name = None
258
+
259
+ st.write(st.session_state.selected_assistant_type)
260
+
261
+ # main section
262
+ st.header("LLM Website QA Demo")
263
+ st.caption("⚡ Powered by :blue[LangChain], :green[OpenAI] & :green[Hugging Face]")
264
+
265
+ assistant_type = st.session_state.selected_assistant_type
266
+ if assistant_type == AssistantType.OPENAI:
267
+ show_openai_chat_ui()
268
+ elif assistant_type == AssistantType.HUGGINGFACE:
269
+ show_hf_chat_ui()
270
+ else:
271
+ show_notification_banner_ui(
272
+ notification_type=StNotificationType.INFO,
273
+ notification="Please select an assistant type to get started!",
274
+ )
275
+
276
+
277
+ if __name__ == "__main__":
278
+ st.set_page_config(
279
+ page_title="Website QA powered by LangChain & LLMs",
280
+ page_icon="https://thisisishara.com/res/images/favicon/android-chrome-192x192.png",
281
+ layout="wide",
282
+ initial_sidebar_state="expanded",
283
+ )
284
+ hide_streamlit_style = """
285
+ <style>
286
+ # #MainMenu {visibility: hidden;}
287
+ # footer {visibility: hidden;}
288
+ [data-testid="stDecoration"] {background: linear-gradient(to right, #9EE51A, #208BBC) !important;}
289
+ </style>
290
+ """
291
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
292
+
293
+ # run the app
294
+ app()
backup/docker/kb_openai_ishara.faiss ADDED
Binary file (218 kB). View file
 
backup/docker/kb_openai_ishara.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:540dfd6d7bda272777ee04edb931074a548217f1abbd76e772d6a36dea44c5bc
3
+ size 40432
backup/windows/kb_openai_ishara.faiss ADDED
Binary file (218 kB). View file
 
backup/windows/kb_openai_ishara.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16937df0644608b4c1a5d3c6dcfa3cbd12b8afd3dac9d35d5654ac5d727ffefb
3
+ size 40432
build_knowledgebase.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import sys
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ from knowledgebase import create_knowledgebase
8
+ from utils.constants import (
9
+ ASSISTANT_TYPE_KEY,
10
+ AssistantType,
11
+ OPENAI_API_TOKEN_KEY,
12
+ HUGGINGFACEHUB_API_TOKEN_KEY,
13
+ OPENAI_KNOWLEDGEBASE_KEY,
14
+ HF_KNOWLEDGEBASE_KEY,
15
+ ENV_FILE,
16
+ EMBEDDING_TYPE_KEY,
17
+ EmbeddingType,
18
+ APIKeyType,
19
+ )
20
+ from utils.llm import validate_api_token
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # load the .env
25
+ load_dotenv(dotenv_path=os.path.join(os.getcwd(), ENV_FILE))
26
+
27
+
28
+ if __name__ == "__main__":
29
+ # initialize the knowledgebase
30
+ logger.info("⚡ Initializing the URLs...")
31
+
32
+ # determine assistant type
33
+ assistant_type = os.getenv(ASSISTANT_TYPE_KEY, AssistantType.HUGGINGFACE.value)
34
+ embedding_type = os.getenv(EMBEDDING_TYPE_KEY, EmbeddingType.HUGGINGFACE.value)
35
+
36
+ if assistant_type == AssistantType.OPENAI.value:
37
+ assistant_type = AssistantType.OPENAI
38
+ knowledgebase_name = os.environ.get(OPENAI_KNOWLEDGEBASE_KEY, None)
39
+
40
+ if embedding_type == EmbeddingType.OPENAI.value:
41
+ embedding_type = EmbeddingType.OPENAI
42
+ embedding_api_key = os.getenv(OPENAI_API_TOKEN_KEY, None)
43
+ embedding_api_key_type = APIKeyType.OPENAI
44
+ else:
45
+ embedding_type = EmbeddingType.HUGGINGFACE
46
+ embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
47
+ embedding_api_key_type = APIKeyType.HUGGINGFACE
48
+
49
+ else:
50
+ assistant_type = AssistantType.HUGGINGFACE
51
+ knowledgebase_name = os.environ.get(HF_KNOWLEDGEBASE_KEY, None)
52
+ embedding_type = EmbeddingType.HUGGINGFACE
53
+ embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
54
+ embedding_api_key_type = APIKeyType.HUGGINGFACE
55
+
56
+ if embedding_type == EmbeddingType.OPENAI:
57
+ urls = [
58
+ "https://thisisishara.com/",
59
+ "https://github.com/thisisishara",
60
+ "https://github.com/thisisishara?tab=repositories",
61
+ "https://www.hackerrank.com/thisisishara?hr_r=1",
62
+ "https://www.npmjs.com/~thisisishara",
63
+ "https://pypi.org/user/thisisishara/",
64
+ "https://www.linkedin.com/in/isharadissanayake/",
65
+ ]
66
+
67
+ else:
68
+ urls = [
69
+ "https://thisisishara.com/",
70
+ "https://github.com/thisisishara",
71
+ "https://github.com/thisisishara?tab=repositories",
72
+ "https://www.hackerrank.com/thisisishara?hr_r=1",
73
+ "https://www.npmjs.com/~thisisishara",
74
+ "https://pypi.org/user/thisisishara/",
75
+ "https://www.linkedin.com/in/isharadissanayake/",
76
+ ]
77
+
78
+ logger.info("🗝️ Validating the embedding API token...")
79
+ embedding_valid, embedding_err = validate_api_token(
80
+ api_key_type=embedding_api_key_type, api_key=embedding_api_key
81
+ )
82
+ if not embedding_valid:
83
+ logger.error(embedding_err)
84
+ sys.exit(1)
85
+
86
+ create_knowledgebase(
87
+ urls=urls,
88
+ assistant_type=assistant_type,
89
+ embedding_type=embedding_type,
90
+ embedding_api_key=embedding_api_key,
91
+ knowledgebase_name=knowledgebase_name,
92
+ )
93
+
94
+ logger.info("✅ Knowledgebase created")
chat.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import sys
5
+
6
+ from dotenv import load_dotenv
7
+
8
+ from knowledgebase import Knowledgebase
9
+ from utils.constants import (
10
+ ENV_FILE,
11
+ ASSISTANT_TYPE_KEY,
12
+ AssistantType,
13
+ OPENAI_API_TOKEN_KEY,
14
+ OPENAI_KNOWLEDGEBASE_KEY,
15
+ HUGGINGFACEHUB_API_TOKEN_KEY,
16
+ HF_KNOWLEDGEBASE_KEY,
17
+ QUERY_TAG,
18
+ ANSWER_TAG,
19
+ SOURCES_TAG,
20
+ EMBEDDING_TYPE_KEY,
21
+ APIKeyType,
22
+ EmbeddingType,
23
+ )
24
+ from utils.llm import validate_api_token
25
+
26
+ # load the .env
27
+ load_dotenv(dotenv_path=os.path.join(os.getcwd(), ENV_FILE))
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ if __name__ == "__main__":
33
+ assistant_type = os.getenv(ASSISTANT_TYPE_KEY, AssistantType.HUGGINGFACE.value)
34
+ embedding_type = os.getenv(EMBEDDING_TYPE_KEY, EmbeddingType.HUGGINGFACE.value)
35
+
36
+ if assistant_type == AssistantType.OPENAI.value:
37
+ assistant_type = AssistantType.OPENAI
38
+ assistant_api_key = os.environ.get(OPENAI_API_TOKEN_KEY, None)
39
+ assistant_api_key_type = APIKeyType.OPENAI
40
+ knowledgebase_name = os.environ.get(OPENAI_KNOWLEDGEBASE_KEY, None)
41
+
42
+ if embedding_type == EmbeddingType.OPENAI.value:
43
+ embedding_type = EmbeddingType.OPENAI
44
+ embedding_api_key = assistant_api_key
45
+ embedding_api_key_type = APIKeyType.OPENAI
46
+ else:
47
+ embedding_type = EmbeddingType.HUGGINGFACE
48
+ embedding_api_key = os.getenv(HUGGINGFACEHUB_API_TOKEN_KEY, None)
49
+ embedding_api_key_type = APIKeyType.HUGGINGFACE
50
+ else:
51
+ assistant_type = AssistantType.HUGGINGFACE
52
+ assistant_api_key = os.environ.get(HUGGINGFACEHUB_API_TOKEN_KEY, None)
53
+ assistant_api_key_type = APIKeyType.HUGGINGFACE
54
+ knowledgebase_name = os.environ.get(HF_KNOWLEDGEBASE_KEY, None)
55
+ embedding_type = EmbeddingType.HUGGINGFACE
56
+ embedding_api_key = assistant_api_key
57
+ embedding_api_key_type = APIKeyType.HUGGINGFACE
58
+
59
+ logger.info("🗝️ Validating the API tokens...")
60
+ assistant_valid, assistant_err = validate_api_token(
61
+ api_key_type=assistant_api_key_type, api_key=assistant_api_key
62
+ )
63
+ if not assistant_valid:
64
+ logger.error(assistant_err)
65
+ sys.exit(1)
66
+
67
+ embedding_valid, embedding_err = validate_api_token(
68
+ api_key_type=embedding_api_key_type, api_key=embedding_api_key
69
+ )
70
+ if not embedding_valid:
71
+ logger.error(embedding_err)
72
+ sys.exit(1)
73
+
74
+ parser = argparse.ArgumentParser(description="LLM Website QA - CLI")
75
+ parser.add_argument(
76
+ QUERY_TAG, type=str, help="Question to be asked from the assistant"
77
+ )
78
+ args = parser.parse_args()
79
+ query = args.query
80
+
81
+ knowledgebase = Knowledgebase(
82
+ assistant_type=assistant_type,
83
+ embedding_type=embedding_type,
84
+ assistant_api_key=assistant_api_key,
85
+ embedding_api_key=embedding_api_key,
86
+ knowledgebase_name=knowledgebase_name,
87
+ )
88
+ result, metadata = knowledgebase.query_knowledgebase(query=query)
89
+
90
+ print(f"\nAnswer: \n{str(result.get(ANSWER_TAG, '').strip())}")
91
+ print(f"\nSources: \n{str(result.get(SOURCES_TAG, '').strip())}")
92
+ print(f"\nCost: \n{metadata}")
knowledgebase.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from langchain.callbacks import get_openai_callback
4
+ from langchain.chains import RetrievalQAWithSourcesChain
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
6
+ from langchain.llms import OpenAIChat, HuggingFaceHub
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.vectorstores import FAISS
9
+ from streamlit.logger import get_logger
10
+
11
+ from utils.constants import (
12
+ KNOWLEDGEBASE_DIR,
13
+ AssistantType,
14
+ BS_HTML_PARSER,
15
+ TEXT_TAG,
16
+ SOURCE_TAG,
17
+ ANSWER_TAG,
18
+ QUESTION_TAG,
19
+ HF_TEXT_GENERATION_REPO_ID,
20
+ EmbeddingType,
21
+ TOTAL_TOKENS_TAG,
22
+ PROMPT_TOKENS_TAG,
23
+ COMPLETION_TOKENS_TAG,
24
+ TOTAL_COST_TAG,
25
+ OPENAI_CHAT_COMPLETIONS_MODEL,
26
+ )
27
+
28
+ logger = get_logger(__name__)
29
+
30
+
31
+ def extract_text_from(url_: str):
32
+ html = requests.get(url_).text
33
+ soup = BeautifulSoup(html, features=BS_HTML_PARSER)
34
+ text = soup.get_text()
35
+
36
+ lines = (line.strip() for line in text.splitlines())
37
+ return "\n".join(line for line in lines if line)
38
+
39
+
40
+ def create_knowledgebase(
41
+ urls: list,
42
+ assistant_type: AssistantType,
43
+ embedding_type: EmbeddingType,
44
+ embedding_api_key: str,
45
+ knowledgebase_name: str,
46
+ ):
47
+ pages: list[dict] = []
48
+ for url in urls:
49
+ pages.append({TEXT_TAG: extract_text_from(url_=url), SOURCE_TAG: url})
50
+
51
+ chunk_size = 500
52
+ chunk_overlap = 30
53
+ if assistant_type == AssistantType.OPENAI:
54
+ # # override the default chunk configs
55
+ # chunk_size = 1500
56
+ # chunk_overlap = 200
57
+ if embedding_type == EmbeddingType.HUGGINGFACE:
58
+ embeddings = HuggingFaceHubEmbeddings(
59
+ huggingfacehub_api_token=embedding_api_key
60
+ )
61
+ logger.info(f"Using `hf` embeddings")
62
+ else:
63
+ embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
64
+ logger.info(f"Using `openai` embeddings")
65
+ else:
66
+ embeddings = HuggingFaceHubEmbeddings(
67
+ huggingfacehub_api_token=embedding_api_key
68
+ )
69
+ logger.info(
70
+ f"Since the assistant type is set to `hf`, `hf` embeddings are used by default."
71
+ )
72
+
73
+ text_splitter = CharacterTextSplitter(
74
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
75
+ )
76
+
77
+ docs, metadata = [], []
78
+ for page in pages:
79
+ splits = text_splitter.split_text(page[TEXT_TAG])
80
+ docs.extend(splits)
81
+ metadata.extend([{SOURCE_TAG: page[SOURCE_TAG]}] * len(splits))
82
+ print(f"Split {page[SOURCE_TAG]} into {len(splits)} chunks")
83
+
84
+ vectorstore = FAISS.from_texts(texts=docs, embedding=embeddings, metadatas=metadata)
85
+ vectorstore.save_local(folder_path=KNOWLEDGEBASE_DIR, index_name=knowledgebase_name)
86
+
87
+
88
+ def load_vectorstore(
89
+ embedding_type: EmbeddingType,
90
+ embedding_api_key: str,
91
+ knowledgebase_name: str,
92
+ ):
93
+ if embedding_type == EmbeddingType.OPENAI:
94
+ embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
95
+ else:
96
+ embeddings = HuggingFaceHubEmbeddings(
97
+ huggingfacehub_api_token=embedding_api_key
98
+ )
99
+ logger.info(
100
+ f"Since the assistant type is set to `hf`, `hf` embeddings are used by default."
101
+ )
102
+
103
+ store = FAISS.load_local(
104
+ folder_path=KNOWLEDGEBASE_DIR,
105
+ embeddings=embeddings,
106
+ index_name=knowledgebase_name,
107
+ )
108
+ return store
109
+
110
+
111
+ def construct_query_response(result: dict) -> dict:
112
+ return {ANSWER_TAG: result}
113
+
114
+
115
+ class Knowledgebase:
116
+ def __init__(
117
+ self,
118
+ assistant_type: AssistantType,
119
+ embedding_type: EmbeddingType,
120
+ assistant_api_key: str,
121
+ embedding_api_key: str,
122
+ knowledgebase_name: str,
123
+ ):
124
+ self.assistant_type = assistant_type
125
+ self.embedding_type = embedding_type
126
+ self.assistant_api_key = assistant_api_key
127
+ self.embedding_api_key = embedding_api_key
128
+ self.knowledgebase = load_vectorstore(
129
+ embedding_type=embedding_type,
130
+ embedding_api_key=embedding_api_key,
131
+ knowledgebase_name=knowledgebase_name,
132
+ )
133
+
134
+ def query_knowledgebase(self, query: str) -> tuple[dict, dict]:
135
+ try:
136
+ logger.info(
137
+ f"The assistant API key for the current session: ***{self.assistant_api_key[-4:]}"
138
+ )
139
+ logger.info(
140
+ f"The embedding API key for the current session: ***{self.embedding_api_key[-4:]}"
141
+ )
142
+
143
+ query = query.strip()
144
+ if not query:
145
+ return {
146
+ ANSWER_TAG: "Oh snap! did you hit send accidentally, because I can't see any questions 🤔",
147
+ }, {}
148
+
149
+ if self.assistant_type == AssistantType.OPENAI:
150
+ llm = OpenAIChat(
151
+ model_name=OPENAI_CHAT_COMPLETIONS_MODEL,
152
+ temperature=0,
153
+ verbose=True,
154
+ openai_api_key=self.assistant_api_key,
155
+ )
156
+ # # this is deprecated
157
+ # chain = VectorDBQAWithSourcesChain.from_llm(
158
+ # llm=llm,
159
+ # vectorstore=self.knowledgebase,
160
+ # max_tokens_limit=2048,
161
+ # k=2,
162
+ # reduce_k_below_max_tokens=True,
163
+ # )
164
+ chain = RetrievalQAWithSourcesChain.from_chain_type(
165
+ llm=llm,
166
+ chain_type="stuff",
167
+ retriever=self.knowledgebase.as_retriever(),
168
+ reduce_k_below_max_tokens=True,
169
+ chain_type_kwargs={"verbose": True},
170
+ )
171
+ else:
172
+ llm = HuggingFaceHub(
173
+ repo_id=HF_TEXT_GENERATION_REPO_ID,
174
+ model_kwargs={"temperature": 0.5, "max_length": 64},
175
+ huggingfacehub_api_token=self.assistant_api_key,
176
+ verbose=True,
177
+ )
178
+ chain = RetrievalQAWithSourcesChain.from_chain_type(
179
+ llm=llm,
180
+ chain_type="refine",
181
+ retriever=self.knowledgebase.as_retriever(),
182
+ max_tokens_limit=1024,
183
+ reduce_k_below_max_tokens=True,
184
+ chain_type_kwargs={"verbose": True},
185
+ )
186
+
187
+ with get_openai_callback() as cb:
188
+ result = chain({QUESTION_TAG: query})
189
+ print(f"Total Tokens: {cb.total_tokens}")
190
+ print(f"Prompt Tokens: {cb.prompt_tokens}")
191
+ print(f"Completion Tokens: {cb.completion_tokens}")
192
+ print(f"Total Cost (USD): ${cb.total_cost}")
193
+
194
+ metadata = {
195
+ TOTAL_TOKENS_TAG: cb.total_tokens,
196
+ PROMPT_TOKENS_TAG: cb.prompt_tokens,
197
+ COMPLETION_TOKENS_TAG: cb.completion_tokens,
198
+ TOTAL_COST_TAG: cb.total_cost,
199
+ }
200
+ return result, metadata
201
+ except Exception as e:
202
+ logger.error(f"{e.__class__.__name__}: {e}")
203
+ return {ANSWER_TAG: f"{e.__class__.__name__}: {e}"}, {}
knowledgebases/.gitkeep ADDED
File without changes
knowledgebases/kb_openai_ishara.faiss ADDED
Binary file (218 kB). View file
 
knowledgebases/kb_openai_ishara.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:540dfd6d7bda272777ee04edb931074a548217f1abbd76e772d6a36dea44c5bc
3
+ size 40432
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ openai~=0.27.8
2
+ streamlit~=1.25.0
3
+ streamlit-chat~=0.1.1
4
+ langchain~=0.0.238
5
+ bs4==0.0.1
6
+ tiktoken==0.3.0
7
+ faiss-cpu==1.7.4
8
+ requests~=2.31.0
9
+ python-dotenv==1.0.0
10
+ huggingface-hub==0.16.4
utils/__init__.py ADDED
File without changes
utils/constants.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+ KNOWLEDGEBASE_DIR = "knowledgebases"
4
+ BS_HTML_PARSER = "html.parser"
5
+ OPENAI_COMPLETIONS_MODEL = "gpt-3.5-turbo"
6
+ OPENAI_CHAT_COMPLETIONS_MODEL = "gpt-3.5-turbo"
7
+ OPENAI_TEST_MODEL = "text-ada-001"
8
+ ENV_FILE = ".env"
9
+ HF_TEXT_GENERATION_REPO_ID = "google/flan-t5-xxl"
10
+ # HF_TEXT_GENERATION_REPO_ID = "OpenAssistant/falcon-40b-sft-mix-1226"
11
+ # HF_TEXT_GENERATION_REPO_ID = "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
12
+ TEST_PROMPT = "test"
13
+
14
+ ASSISTANT_TYPE_KEY = "ASSISTANT_TYPE"
15
+ EMBEDDING_TYPE_KEY = "EMBEDDING_TYPE"
16
+ OPENAI_API_TOKEN_KEY = "OPENAI_API_KEY"
17
+ HUGGINGFACEHUB_API_TOKEN_KEY = "HUGGINGFACEHUB_API_TOKEN"
18
+ OPENAI_KNOWLEDGEBASE_KEY = "OPENAI_KNOWLEDGEBASE"
19
+ HF_KNOWLEDGEBASE_KEY = "HF_KNOWLEDGEBASE"
20
+
21
+ TEXT_TAG = "text"
22
+ SOURCE_TAG = "source"
23
+ SOURCES_TAG = "sources"
24
+ ANSWER_TAG = "answer"
25
+ QUESTION_TAG = "question"
26
+ QUERY_TAG = "query"
27
+ NONE_TAG = "None"
28
+ EMPTY_TAG = ""
29
+ MESSAGE_HISTORY_TAG = "message_history"
30
+ USER_TAG = "user"
31
+ ASSISTANT_TAG = "assistant"
32
+ FROM_TAG = "from"
33
+ IN_PROGRESS_TAG = "in_progress"
34
+ QUERY_INPUT_TAG = "query_input"
35
+ VALID_TOKEN_TAG = "valid_token"
36
+ API_KEY_TAG = "api_key"
37
+ ASSISTANT_TYPE_TAG = "assistant_type"
38
+ TOTAL_TOKENS_TAG = "total_tokens"
39
+ PROMPT_TOKENS_TAG = "prompt_tokens"
40
+ COMPLETION_TOKENS_TAG = "completion_tokens"
41
+ TOTAL_COST_TAG = "total_cost"
42
+
43
+ USER_AVATAR = "https://i.imgur.com/Rf63hWt.png"
44
+ ASSISTANT_AVATAR = "https://i.imgur.com/NQwsRn2.png"
45
+
46
+
47
+ class AssistantType(Enum):
48
+ HUGGINGFACE = "hf"
49
+ OPENAI = "openai"
50
+
51
+
52
+ class APIKeyType(Enum):
53
+ HUGGINGFACE = "hf"
54
+ OPENAI = "openai"
55
+
56
+
57
+ class EmbeddingType(Enum):
58
+ HUGGINGFACE = "hf"
59
+ OPENAI = "openai"
60
+
61
+
62
+ class StNotificationType(Enum):
63
+ INFO = "info"
64
+ WARNING = "warning"
65
+ ERROR = "err"
utils/llm.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import streamlit as st
3
+ from huggingface_hub import InferenceClient
4
+ from streamlit.logger import get_logger
5
+
6
+ from utils.constants import APIKeyType, TEST_PROMPT, OPENAI_TEST_MODEL
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
+ @st.cache_data(show_spinner=False)
12
+ def validate_api_token(api_key_type: APIKeyType, api_key: str) -> tuple[bool, str]:
13
+ if not api_key_type:
14
+ return (
15
+ False,
16
+ "API key type is not mentioned",
17
+ )
18
+
19
+ if not api_key:
20
+ return (
21
+ False,
22
+ "Invalid API key detected",
23
+ )
24
+
25
+ try:
26
+ if api_key_type == APIKeyType.OPENAI:
27
+ openai.Completion.create(
28
+ model=OPENAI_TEST_MODEL,
29
+ prompt=TEST_PROMPT,
30
+ api_key=api_key,
31
+ max_tokens=1,
32
+ )
33
+ logger.info("OpenAI token validated")
34
+ else:
35
+ client = InferenceClient(token=api_key)
36
+ client.text_generation(prompt=TEST_PROMPT, max_new_tokens=1)
37
+ logger.info("HuggingFace token validated")
38
+
39
+ except Exception as e:
40
+ logger.error(f"{e.__class__.__name__}: {e}")
41
+ return False, f"{e.__class__.__name__}: {e}"
42
+ return True, ""