# Stray hosting-page text captured with this file ("Spaces" / "Runtime error");
# commented out so the module remains importable:
# Spaces:
# Runtime error
# Runtime error
import requests | |
from bs4 import BeautifulSoup | |
from langchain.callbacks import get_openai_callback | |
from langchain.chains import RetrievalQAWithSourcesChain | |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings | |
from langchain.llms import OpenAIChat, HuggingFaceHub | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.vectorstores import FAISS | |
from streamlit.logger import get_logger | |
from utils.constants import ( | |
KNOWLEDGEBASE_DIR, | |
AssistantType, | |
BS_HTML_PARSER, | |
TEXT_TAG, | |
SOURCE_TAG, | |
ANSWER_TAG, | |
QUESTION_TAG, | |
HF_TEXT_GENERATION_REPO_ID, | |
EmbeddingType, | |
TOTAL_TOKENS_TAG, | |
PROMPT_TOKENS_TAG, | |
COMPLETION_TOKENS_TAG, | |
TOTAL_COST_TAG, | |
OPENAI_CHAT_COMPLETIONS_MODEL, | |
) | |
logger = get_logger(__name__) | |
def extract_text_from(url_: str) -> str:
    """Fetch ``url_`` and return its visible text, one non-empty line per row.

    Args:
        url_: URL of the page to scrape.

    Returns:
        The page's text content with blank lines removed and each remaining
        line stripped of surrounding whitespace.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # A timeout prevents an unresponsive server from hanging the app forever;
    # raise_for_status keeps HTTP error pages out of the knowledgebase.
    response = requests.get(url_, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features=BS_HTML_PARSER)
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in lines if line)
def create_knowledgebase(
    urls: list,
    assistant_type: AssistantType,
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
):
    """Scrape each URL, chunk the text, embed it, and persist a FAISS index.

    Args:
        urls: Web pages to scrape into the knowledgebase.
        assistant_type: Which assistant backend the knowledgebase targets;
            for non-OpenAI assistants HuggingFace embeddings are forced.
        embedding_type: Preferred embedding provider (honored only when the
            assistant type is OpenAI).
        embedding_api_key: API key for the chosen embedding provider.
        knowledgebase_name: Index name used when saving under
            ``KNOWLEDGEBASE_DIR``.
    """
    pages: list[dict] = [
        {TEXT_TAG: extract_text_from(url_=url), SOURCE_TAG: url} for url in urls
    ]
    chunk_size = 500
    chunk_overlap = 30
    if assistant_type == AssistantType.OPENAI:
        if embedding_type == EmbeddingType.HUGGINGFACE:
            embeddings = HuggingFaceHubEmbeddings(
                huggingfacehub_api_token=embedding_api_key
            )
            logger.info("Using `hf` embeddings")
        else:
            embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
            logger.info("Using `openai` embeddings")
    else:
        # Non-OpenAI assistants always embed with HuggingFace, regardless of
        # the requested embedding type.
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        logger.info(
            "Since the assistant type is set to `hf`, `hf` embeddings are used by default."
        )
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
    )
    docs, metadata = [], []
    for page in pages:
        splits = text_splitter.split_text(page[TEXT_TAG])
        docs.extend(splits)
        # One metadata entry per chunk so every chunk can be traced to its URL.
        metadata.extend([{SOURCE_TAG: page[SOURCE_TAG]}] * len(splits))
        logger.info("Split %s into %d chunks", page[SOURCE_TAG], len(splits))
    vectorstore = FAISS.from_texts(texts=docs, embedding=embeddings, metadatas=metadata)
    vectorstore.save_local(folder_path=KNOWLEDGEBASE_DIR, index_name=knowledgebase_name)
def load_vectorstore(
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
):
    """Load a persisted FAISS index with embeddings matching ``embedding_type``.

    The embedding implementation must match the one used when the index was
    created, otherwise similarity search results will be meaningless.

    Args:
        embedding_type: Embedding provider the index was built with.
        embedding_api_key: API key for that provider.
        knowledgebase_name: Index name under ``KNOWLEDGEBASE_DIR``.

    Returns:
        The loaded FAISS vectorstore.
    """
    if embedding_type == EmbeddingType.OPENAI:
        embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
    else:
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        # Fixed: the previous message wrongly referred to the assistant type;
        # the branch here is driven by the embedding type.
        logger.info(
            "Since the embedding type is not `openai`, `hf` embeddings are used by default."
        )
    store = FAISS.load_local(
        folder_path=KNOWLEDGEBASE_DIR,
        embeddings=embeddings,
        index_name=knowledgebase_name,
    )
    return store
def construct_query_response(result: dict) -> dict:
    """Wrap a raw chain result in the standard answer envelope."""
    response = {ANSWER_TAG: result}
    return response
class Knowledgebase:
    """A queryable wrapper around a persisted FAISS knowledgebase.

    On construction the FAISS index named ``knowledgebase_name`` is loaded
    from disk; ``query_knowledgebase`` then runs a retrieval-QA-with-sources
    chain against it using either an OpenAI or HuggingFace LLM, depending on
    ``assistant_type``.
    """

    def __init__(
        self,
        assistant_type: AssistantType,
        embedding_type: EmbeddingType,
        assistant_api_key: str,
        embedding_api_key: str,
        knowledgebase_name: str,
    ):
        # Keys are kept on the instance so each query can build its LLM chain.
        self.assistant_type = assistant_type
        self.embedding_type = embedding_type
        self.assistant_api_key = assistant_api_key
        self.embedding_api_key = embedding_api_key
        # Loads the FAISS index eagerly; raises if the index files are missing.
        self.knowledgebase = load_vectorstore(
            embedding_type=embedding_type,
            embedding_api_key=embedding_api_key,
            knowledgebase_name=knowledgebase_name,
        )

    def query_knowledgebase(self, query: str) -> tuple[dict, dict]:
        """Answer ``query`` from the knowledgebase.

        Returns a ``(result, metadata)`` tuple: ``result`` is the chain output
        (or an ``{ANSWER_TAG: ...}`` dict for empty queries and errors), and
        ``metadata`` holds OpenAI token/cost accounting (empty on early return
        or failure). Exceptions are caught and reported in the answer text
        rather than re-raised.
        """
        try:
            # Only the last 4 characters of each key are logged.
            logger.info(
                f"The assistant API key for the current session: ***{self.assistant_api_key[-4:]}"
            )
            logger.info(
                f"The embedding API key for the current session: ***{self.embedding_api_key[-4:]}"
            )
            query = query.strip()
            if not query:
                # Empty question: respond immediately without invoking an LLM.
                return {
                    ANSWER_TAG: "Oh snap! did you hit send accidentally, because I can't see any questions 🤔",
                }, {}
            if self.assistant_type == AssistantType.OPENAI:
                llm = OpenAIChat(
                    model_name=OPENAI_CHAT_COMPLETIONS_MODEL,
                    temperature=0,
                    verbose=True,
                    openai_api_key=self.assistant_api_key,
                )
                # # this is deprecated
                # chain = VectorDBQAWithSourcesChain.from_llm(
                #     llm=llm,
                #     vectorstore=self.knowledgebase,
                #     max_tokens_limit=2048,
                #     k=2,
                #     reduce_k_below_max_tokens=True,
                # )
                # "stuff" chain: all retrieved chunks go into a single prompt.
                chain = RetrievalQAWithSourcesChain.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=self.knowledgebase.as_retriever(),
                    reduce_k_below_max_tokens=True,
                    chain_type_kwargs={"verbose": True},
                )
            else:
                llm = HuggingFaceHub(
                    repo_id=HF_TEXT_GENERATION_REPO_ID,
                    model_kwargs={"temperature": 0.5, "max_length": 64},
                    huggingfacehub_api_token=self.assistant_api_key,
                    verbose=True,
                )
                # "refine" chain with a tighter token budget for the smaller
                # HuggingFace model.
                chain = RetrievalQAWithSourcesChain.from_chain_type(
                    llm=llm,
                    chain_type="refine",
                    retriever=self.knowledgebase.as_retriever(),
                    max_tokens_limit=1024,
                    reduce_k_below_max_tokens=True,
                    chain_type_kwargs={"verbose": True},
                )
            # The callback records token usage/cost for OpenAI calls; for the
            # HuggingFace path these counters stay at zero.
            with get_openai_callback() as cb:
                result = chain({QUESTION_TAG: query})
                print(f"Total Tokens: {cb.total_tokens}")
                print(f"Prompt Tokens: {cb.prompt_tokens}")
                print(f"Completion Tokens: {cb.completion_tokens}")
                print(f"Total Cost (USD): ${cb.total_cost}")
                metadata = {
                    TOTAL_TOKENS_TAG: cb.total_tokens,
                    PROMPT_TOKENS_TAG: cb.prompt_tokens,
                    COMPLETION_TOKENS_TAG: cb.completion_tokens,
                    TOTAL_COST_TAG: cb.total_cost,
                }
                return result, metadata
        except Exception as e:
            # Boundary handler: surface the failure to the user as the answer
            # instead of crashing the Streamlit session.
            logger.error(f"{e.__class__.__name__}: {e}")
            return {ANSWER_TAG: f"{e.__class__.__name__}: {e}"}, {}