# GSA_AI_ASSISTANT / utils.py
# Hugging Face file-viewer residue: commit "Initial Push" (e1b512a) by Mishab, 8.07 kB.
import streamlit as st
from pypdf import PdfReader
import os
from pathlib import Path
from dotenv import load_dotenv
import pickle
import timeit
from PIL import Image
import zipfile
import datetime
import shutil
from collections import defaultdict
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts.prompt import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.utilities import SerpAPIWrapper
from langchain.agents import Tool
from langchain.agents import load_tools
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
import logging
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Timestamp captured once at import time, plus its "YYYY-MM-DD HH:MM:SS" text form.
current_timestamp = datetime.datetime.now()
timestamp_string = f"{current_timestamp:%Y-%m-%d %H:%M:%S}"
def build_llm():
    '''
    Create the chat model used by the assistant.

    Returns a ChatOpenAI instance constructed with temperature 0.
    '''
    return ChatOpenAI(temperature=0)
def build_embedding_model():
    '''
    Create the sentence-transformer text embedding model.

    Returns a HuggingFaceEmbeddings instance for the
    'sentence-transformers/all-MiniLM-L6-v2' model, configured for the CPU device.
    '''
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    device_settings = {'device': 'cpu'}
    return HuggingFaceEmbeddings(model_name=model_id, model_kwargs=device_settings)
def unzip_opm(zip_file_path=r'OPM_Files/OPM_Retirement_backup-20230902T130906Z-001.zip'):
    '''
    Unzip the documents archive next to where the archive is stored.

    This is required if there is no existing vector database and one has to be
    built from scratch.

    The contents are extracted into a folder that sits beside the ZIP file and
    carries the same name as the archive without its ".zip" extension. The
    folder is created when missing and reused when it already exists.

    Parameters:
        zip_file_path: path to the ZIP archive. Defaults to the OPM retirement
            backup archive under "OPM_Files".

    Returns:
        The path of the folder the archive was extracted into.

    Raises:
        FileNotFoundError: if the archive does not exist.
        zipfile.BadZipFile: if the file is not a valid ZIP archive.
    '''
    # Folder name = archive file name without the extension, in the same directory.
    extract_folder = os.path.splitext(os.path.basename(zip_file_path))[0]
    extract_folder_path = os.path.join(os.path.dirname(zip_file_path), extract_folder)

    # Open the archive first so a missing/invalid ZIP does not leave an empty
    # extraction folder behind.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        os.makedirs(extract_folder_path, exist_ok=True)
        zip_ref.extractall(extract_folder_path)

    print(f'Unzipped {zip_file_path} to {extract_folder_path}')
    return extract_folder_path
def count_files_by_type(folder_path):
    '''
    Tally the files under a folder (recursively) by file extension.

    This is required if there is no existing vector database and one has to be
    built from scratch.

    Returns a defaultdict(int) mapping each extension, as produced by
    os.path.splitext (e.g. ".pdf", or "" for no extension), to the number of
    files carrying it.
    '''
    tally = defaultdict(int)
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        for name in filenames:
            tally[os.path.splitext(name)[1]] += 1
    return tally
def generate_file_count_table(file_count_by_type):
    '''
    Turn an extension -> count mapping into a table.

    This is required if there is no existing vector database and one has to be
    built from scratch.

    Returns a pandas DataFrame with the columns "File Type" and
    "Number of Files", ordered from the most to the least frequent file type.
    '''
    extensions = []
    counts = []
    for file_type, how_many in file_count_by_type.items():
        extensions.append(file_type)
        counts.append(how_many)

    table = pd.DataFrame({"File Type": extensions, "Number of Files": counts})
    # Most common file types first.
    return table.sort_values(by="Number of Files", ascending=False)
def move_files_to_folders(folder_path, dest_root=""):
    '''
    Copy files into per-type folders: PDF docs to "PDFs", HTML docs to "HTMLs".

    This is required if there is no existing vector database and one has to be
    built from scratch.

    Despite the name, files are copied (shutil.copy), so the originals stay in
    place. The tree under folder_path is walked recursively and the copies land
    flat in the destination folder, so two files with the same name in
    different sub-folders overwrite each other (last one wins).

    Parameters:
        folder_path: root of the tree to scan.
        dest_root: directory under which "PDFs" and "HTMLs" are created.
            The default "" keeps them relative to the current working directory.

    Files with any other extension are ignored. Extension matching is
    case-insensitive, so "REPORT.PDF" is handled like "report.pdf".
    '''
    destinations = {'.pdf': "PDFs", '.html': "HTMLs"}

    for root, _, files in os.walk(folder_path):
        for file in files:
            extension = os.path.splitext(file)[1].lower()
            dest_folder = destinations.get(extension)
            if dest_folder is None:
                continue

            dest_dir = os.path.join(dest_root, dest_folder)
            os.makedirs(dest_dir, exist_ok=True)
            try:
                shutil.copy(os.path.join(root, file), os.path.join(dest_dir, file))
            except shutil.SameFileError:
                # The walk reached a file that already lives in its destination
                # folder (e.g. a re-run over a tree containing "PDFs"); nothing to do.
                continue
def load_vectorstore(persist_directory, embeddings):
    '''
    Load a persisted Chroma database from disk.

    Only loading is implemented here: the database is not built from the
    source PDF/HTML documents when it is missing. If persist_directory does
    not exist, a FileNotFoundError is raised instead of failing later on an
    unassigned or empty vector store.

    Parameters:
        persist_directory: directory holding the persisted Chroma database.
        embeddings: embedding function handed to Chroma as embedding_function.

    Returns:
        The Chroma vector store opened on persist_directory.

    Raises:
        FileNotFoundError: if persist_directory does not exist.
    '''
    if not os.path.exists(persist_directory):
        raise FileNotFoundError(
            f"No persisted Chroma database found at {persist_directory!r}; "
            "build the vector store before loading it."
        )

    print("Using existing vectore store for these documents.")
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Chroma DB loaded from the disk")
    return vectorstore
def load_retriver(chroma_vectorstore):
    """Build the retrieval pipeline: Chroma search -> multi-query -> Cohere rerank.

    The vector store is wrapped as a retriever with k=5, that retriever is
    wrapped in a MultiQueryRetriever driven by a temperature-0 ChatOpenAI
    model, and the result is wrapped in a ContextualCompressionRetriever that
    uses CohereRerank as its compressor. The outermost retriever is returned.
    """
    base_retriever = chroma_vectorstore.as_retriever(search_kwargs={"k": 5})

    # Raise the multi-query retriever's logger to INFO.
    logging.basicConfig()
    multi_query_logger = logging.getLogger('langchain.retrievers.multi_query')
    multi_query_logger.setLevel(logging.INFO)

    query_llm = ChatOpenAI(temperature=0)
    expanded_retriever = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=query_llm)

    return ContextualCompressionRetriever(
        base_compressor=CohereRerank(),
        base_retriever=expanded_retriever,
    )
def load_conversational_retrievel_chain(retriever, llm):
    '''
    Build a RetrievalQA chain ("stuff" chain type) with a conversation buffer memory.

    Parameters:
        retriever: retriever supplying documents to the chain.
        llm: language model used by the chain.

    Returns the RetrievalQA chain, configured to also return source documents.
    '''
    # Custom prompt kept for reference; it is currently disabled, so no prompt
    # is passed to the chain.
    # template = """You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'.
    # Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
    # Only include information found in the results and don't add any additional information.
    # Make sure the answer is correct and don't output false content.
    # If the text does not relate to the query, simply state 'Text Not Found in the Document'. Ignore outlier,
    # search results which has nothing to do with the question. Only answer what is asked.
    # The answer should be short and concise. Answer step-by-step.
    # {context}
    # {history}
    # Question: {question}
    # Helpful Answer:"""
    # prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)

    # Memory reads the "question" input and stores the conversation under "history".
    chat_memory = ConversationBufferMemory(input_key="question", memory_key="history")
    inner_chain_options = {"memory": chat_memory}

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs=inner_chain_options,
    )