''' This module contains all the loaders '''
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Read once at import time; importing this module raises KeyError when the
# OPENAI_API_KEY environment variable is not set.
openai_api_key = os.environ['OPENAI_API_KEY']


def load_pdf(path: str = "resume.pdf"):
    '''
    Load a pdf file from a filesystem path.

    Args:
        path: Location of the pdf file handed to PyPDFLoader.

    Returns:
        The documents produced by PyPDFLoader.load().
    '''
    pdf_loader = PyPDFLoader(path)
    return pdf_loader.load()


def get_embeddings(documents, *, chunk_size: int = 1000, chunk_overlap: int = 0):
    '''
    Split documents into chunks and build the OpenAI embedding model.

    Despite the name, no embedding vectors are returned: the second item of
    the result is the OpenAIEmbeddings object itself.

    Args:
        documents: Documents to split, e.g. the result of load_pdf().
        chunk_size: Chunk size passed to CharacterTextSplitter.
        chunk_overlap: Chunk overlap passed to CharacterTextSplitter.

    Returns:
        A (texts, embeddings) tuple: the split documents and the
        OpenAIEmbeddings instance configured with the module-level API key.
    '''
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return texts, embeddings


def get_db(texts, embeddings):
    '''
    Get a Chroma vectorstore from a list of texts and an embedding model.

    Args:
        texts: Split documents, as returned by get_embeddings().
        embeddings: Embedding model, as returned by get_embeddings().

    Returns:
        The vectorstore created by Chroma.from_documents().
    '''
    return Chroma.from_documents(texts, embeddings)


def get_retriever(db, *, k: int = 1):
    '''
    Get a similarity-search retriever from a vectorstore.

    Args:
        db: Vectorstore exposing as_retriever(), e.g. the result of get_db().
        k: Value forwarded as search_kwargs["k"]; defaults to 1.

    Returns:
        The retriever created by db.as_retriever().
    '''
    return db.as_retriever(search_type="similarity", search_kwargs={"k": k})


def get_chain_for_pdf(path):
    '''
    Get a RetrievalQA chain for the pdf file at the given path.

    Runs the whole pipeline: load the pdf, split it, index the chunks in a
    Chroma vectorstore and wrap a retriever over it in a RetrievalQA chain.

    Args:
        path: Location of the pdf file to load.

    Returns:
        The chain built by RetrievalQA.from_chain_type(), configured with
        return_source_documents=True.
    '''
    documents = load_pdf(path)
    texts, embeddings = get_embeddings(documents)
    db = get_db(texts, embeddings)
    retriever = get_retriever(db)
    llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )