''' This module contains all the loaders '''
import os

from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from constants import TEMPERATURE, MODEL_NAME

# Read once at import time; importing this module raises KeyError when the
# OPENAI_API_KEY environment variable is not set.
openai_api_key = os.environ['OPENAI_API_KEY']


def load_pdf(path: str = "resume.pdf"):
    '''
    Load a single pdf file from a filesystem path.

    path: location of the pdf file to read.
    Returns whatever PyPDFLoader(path).load() produces.
    '''
    pdf_loader = PyPDFLoader(path)
    documents = pdf_loader.load()
    return documents


def load_multiple_documents(path: str = "documents"):
    '''
    Load every supported document found directly inside a folder.

    Files are selected by file-name suffix (case-sensitive):
    .pdf -> PyPDFLoader, .docx / .doc -> Docx2txtLoader, .txt -> TextLoader.
    Entries with any other suffix are skipped. Sub-folders are not descended
    into.

    path: folder to scan.
    Returns one flat list with the documents from all loaded files, in the
    order os.listdir() yields the file names.
    '''
    documents = []
    for file in os.listdir(path):
        if file.endswith('.pdf'):
            loader_cls = PyPDFLoader
        elif file.endswith(('.docx', '.doc')):
            loader_cls = Docx2txtLoader
        elif file.endswith('.txt'):
            loader_cls = TextLoader
        else:
            continue
        # Build the file path from the folder that was actually listed; a
        # hard-coded './documents/' prefix breaks every non-default `path`.
        file_path = os.path.join(path, file)
        documents.extend(loader_cls(file_path).load())
    return documents


def get_embeddings(documents):
    '''
    Split documents into chunks and create the embedding model.

    documents: list of loaded documents to split.
    Returns a (texts, embeddings) tuple: `texts` are the chunks produced by
    CharacterTextSplitter(chunk_size=1000, chunk_overlap=10) and `embeddings`
    is the OpenAIEmbeddings object itself. No vectors are computed here.
    '''
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    texts = splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return texts, embeddings


def get_db(texts, embeddings):
    '''
    Build a Chroma vectorstore from document chunks and an embedding model.

    texts: document chunks to index.
    embeddings: embedding model passed to Chroma.from_documents.
    '''
    db = Chroma.from_documents(texts, embeddings)
    return db


def get_retriever(db):
    '''
    Get a similarity-search retriever from a vectorstore.

    The retriever is configured with search_kwargs={"k": 1}.
    '''
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 1})
    return retriever


def get_chain_for_pdf(path):
    '''
    Build a RetrievalQA chain over the documents stored in a folder.

    Despite the name, `path` is a folder: all supported files in it (see
    load_multiple_documents) are loaded, split, indexed in Chroma and wired
    to a ChatOpenAI model through a "stuff" chain. The chain is created with
    return_source_documents=True.
    '''
    documents = load_multiple_documents(path)
    texts, embeddings = get_embeddings(documents)
    db = get_db(texts, embeddings)
    retriever = get_retriever(db)
    chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(
            temperature=TEMPERATURE,
            openai_api_key=openai_api_key,
            model=MODEL_NAME,
        ),
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
    return chain