Spaces:
Runtime error
Runtime error
File size: 7,162 Bytes
0fac726 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
import requests
from bs4 import BeautifulSoup
from langchain.callbacks import get_openai_callback
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
from langchain.llms import OpenAIChat, HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from streamlit.logger import get_logger
from utils.constants import (
KNOWLEDGEBASE_DIR,
AssistantType,
BS_HTML_PARSER,
TEXT_TAG,
SOURCE_TAG,
ANSWER_TAG,
QUESTION_TAG,
HF_TEXT_GENERATION_REPO_ID,
EmbeddingType,
TOTAL_TOKENS_TAG,
PROMPT_TOKENS_TAG,
COMPLETION_TOKENS_TAG,
TOTAL_COST_TAG,
OPENAI_CHAT_COMPLETIONS_MODEL,
)
logger = get_logger(__name__)
def extract_text_from(url_: str, timeout: float = 30.0) -> str:
    """Download a web page and return its text content without blank lines.

    The page is fetched with ``requests``, parsed with BeautifulSoup using
    the ``BS_HTML_PARSER`` backend, and reduced to its text. Every line is
    stripped of surrounding whitespace and empty lines are dropped.

    Args:
        url_: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The page text, one non-empty stripped line per row, joined by ``\\n``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request fails or times out.
    """
    # NOTE(review): the HTTP status is not checked, so the body of an error
    # page (e.g. a 404) is returned like any other page.
    html = requests.get(url_, timeout=timeout).text
    soup = BeautifulSoup(html, features=BS_HTML_PARSER)
    stripped_lines = (raw.strip() for raw in soup.get_text().splitlines())
    return "\n".join(line for line in stripped_lines if line)
def create_knowledgebase(
    urls: list,
    assistant_type: AssistantType,
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
    chunk_size: int = 500,
    chunk_overlap: int = 30,
):
    """Scrape the given pages, embed their text in chunks and save a FAISS index.

    Each URL is downloaded with ``extract_text_from``, split on newlines into
    chunks, embedded, and stored in a FAISS vector store that is written to
    ``KNOWLEDGEBASE_DIR`` under the index name ``knowledgebase_name``.

    Embedding selection:
        * assistant is OpenAI and ``embedding_type`` is HuggingFace -> HF
          hub embeddings;
        * assistant is OpenAI and any other ``embedding_type`` -> OpenAI
          embeddings;
        * any other assistant -> HF hub embeddings, ``embedding_type`` is
          ignored.

    Args:
        urls: Page addresses to index.
        assistant_type: Assistant backend the knowledge base is built for.
        embedding_type: Requested embedding backend (see selection above).
        embedding_api_key: API key/token handed to the chosen embedding class.
        knowledgebase_name: Index name used when saving the vector store.
        chunk_size: Chunk size passed to ``CharacterTextSplitter``. Defaults
            to the previously hard-coded 500.
        chunk_overlap: Chunk overlap passed to ``CharacterTextSplitter``.
            Defaults to the previously hard-coded 30. (A disabled override
            for the OpenAI assistant used 1500 / 200; pass those explicitly
            if wanted.)

    Returns:
        None. The result is the index saved on disk.
    """
    pages: list[dict] = [
        {TEXT_TAG: extract_text_from(url_=url), SOURCE_TAG: url} for url in urls
    ]

    if assistant_type == AssistantType.OPENAI:
        if embedding_type == EmbeddingType.HUGGINGFACE:
            embeddings = HuggingFaceHubEmbeddings(
                huggingfacehub_api_token=embedding_api_key
            )
            logger.info("Using `hf` embeddings")
        else:
            embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
            logger.info("Using `openai` embeddings")
    else:
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        logger.info(
            "Since the assistant type is set to `hf`, `hf` embeddings are used by default."
        )

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
    )

    docs, metadata = [], []
    for page in pages:
        splits = text_splitter.split_text(page[TEXT_TAG])
        docs.extend(splits)
        # One fresh dict per chunk: `[d] * n` would make every chunk of the
        # page share (and co-mutate) the same metadata object.
        metadata.extend({SOURCE_TAG: page[SOURCE_TAG]} for _ in splits)
        print(f"Split {page[SOURCE_TAG]} into {len(splits)} chunks")

    vectorstore = FAISS.from_texts(texts=docs, embedding=embeddings, metadatas=metadata)
    vectorstore.save_local(folder_path=KNOWLEDGEBASE_DIR, index_name=knowledgebase_name)
def load_vectorstore(
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
):
    """Load a saved FAISS index from ``KNOWLEDGEBASE_DIR``.

    Args:
        embedding_type: ``EmbeddingType.OPENAI`` selects OpenAI embeddings;
            any other value selects HuggingFace hub embeddings.
        embedding_api_key: API key/token handed to the chosen embedding class.
        knowledgebase_name: Index name the store was saved under.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    if embedding_type != EmbeddingType.OPENAI:
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        # NOTE(review): the message talks about the assistant type, but this
        # function only sees the embedding type -- confirm the wording.
        logger.info(
            f"Since the assistant type is set to `hf`, `hf` embeddings are used by default."
        )
    else:
        embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)

    return FAISS.load_local(
        folder_path=KNOWLEDGEBASE_DIR,
        embeddings=embeddings,
        index_name=knowledgebase_name,
    )
def construct_query_response(result: dict) -> dict:
    """Wrap ``result`` in a single-entry dict keyed by ``ANSWER_TAG``."""
    response = {}
    response[ANSWER_TAG] = result
    return response
class Knowledgebase:
    """Answers questions against a previously saved FAISS vector store.

    The store is loaded once at construction time; an LLM and a
    ``RetrievalQAWithSourcesChain`` are built anew for every query.
    """

    def __init__(
        self,
        assistant_type: AssistantType,
        embedding_type: EmbeddingType,
        assistant_api_key: str,
        embedding_api_key: str,
        knowledgebase_name: str,
    ):
        """Remember the session settings and load the named vector store.

        Args:
            assistant_type: LLM backend used to answer queries.
            embedding_type: Embedding backend used to load the vector store.
            assistant_api_key: API key/token for the LLM backend.
            embedding_api_key: API key/token for the embedding backend.
            knowledgebase_name: Index name passed to ``load_vectorstore``.
        """
        self.assistant_type, self.embedding_type = assistant_type, embedding_type
        self.assistant_api_key, self.embedding_api_key = (
            assistant_api_key,
            embedding_api_key,
        )
        self.knowledgebase = load_vectorstore(
            embedding_type=embedding_type,
            embedding_api_key=embedding_api_key,
            knowledgebase_name=knowledgebase_name,
        )

    def _build_chain(self):
        """Create the LLM and retrieval chain matching ``self.assistant_type``."""
        if self.assistant_type == AssistantType.OPENAI:
            openai_llm = OpenAIChat(
                model_name=OPENAI_CHAT_COMPLETIONS_MODEL,
                temperature=0,
                verbose=True,
                openai_api_key=self.assistant_api_key,
            )
            # VectorDBQAWithSourcesChain is deprecated; the retrieval-based
            # chain is used in its place.
            return RetrievalQAWithSourcesChain.from_chain_type(
                llm=openai_llm,
                chain_type="stuff",
                retriever=self.knowledgebase.as_retriever(),
                reduce_k_below_max_tokens=True,
                chain_type_kwargs={"verbose": True},
            )

        hub_llm = HuggingFaceHub(
            repo_id=HF_TEXT_GENERATION_REPO_ID,
            model_kwargs={"temperature": 0.5, "max_length": 64},
            huggingfacehub_api_token=self.assistant_api_key,
            verbose=True,
        )
        return RetrievalQAWithSourcesChain.from_chain_type(
            llm=hub_llm,
            chain_type="refine",
            retriever=self.knowledgebase.as_retriever(),
            max_tokens_limit=1024,
            reduce_k_below_max_tokens=True,
            chain_type_kwargs={"verbose": True},
        )

    def query_knowledgebase(self, query: str) -> tuple[dict, dict]:
        """Answer ``query`` from the knowledge base.

        Args:
            query: The user's question; surrounding whitespace is ignored.

        Returns:
            A ``(result, metadata)`` pair. ``result`` is the chain output, or
            a dict holding a message under ``ANSWER_TAG`` when the query is
            blank or an exception occurred. ``metadata`` carries the token
            and cost counters from the OpenAI callback, and is empty for the
            blank-query and error cases.
        """
        try:
            # Only the last four characters of each key are written to the log.
            assistant_key_tail = self.assistant_api_key[-4:]
            logger.info(
                f"The assistant API key for the current session: ***{assistant_key_tail}"
            )
            embedding_key_tail = self.embedding_api_key[-4:]
            logger.info(
                f"The embedding API key for the current session: ***{embedding_key_tail}"
            )

            question = query.strip()
            if not question:
                blank_reply = {
                    ANSWER_TAG: "Oh snap! did you hit send accidentally, because I can't see any questions 🤔",
                }
                return blank_reply, {}

            qa_chain = self._build_chain()

            with get_openai_callback() as usage:
                result = qa_chain({QUESTION_TAG: question})
                print(f"Total Tokens: {usage.total_tokens}")
                print(f"Prompt Tokens: {usage.prompt_tokens}")
                print(f"Completion Tokens: {usage.completion_tokens}")
                print(f"Total Cost (USD): ${usage.total_cost}")

            metadata = {
                TOTAL_TOKENS_TAG: usage.total_tokens,
                PROMPT_TOKENS_TAG: usage.prompt_tokens,
                COMPLETION_TOKENS_TAG: usage.completion_tokens,
                TOTAL_COST_TAG: usage.total_cost,
            }
            return result, metadata
        except Exception as e:
            # Failures are reported back as the answer text instead of raising.
            failure_text = f"{e.__class__.__name__}: {e}"
            logger.error(failure_text)
            return {ANSWER_TAG: failure_text}, {}
|