Spaces:
Sleeping
Sleeping
'''
This module contains the loaders and helpers used to build a
question-answering chain over a PDF file.
'''
import os | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.chains import RetrievalQA | |
from langchain.chat_models import ChatOpenAI | |
# Looked up once at import time; a missing OPENAI_API_KEY raises KeyError here.
openai_api_key = os.environ["OPENAI_API_KEY"]
def load_pdf(path: str = "resume.pdf"):
    '''
    Load a pdf file with PyPDFLoader.

    path: location of the pdf, passed straight to PyPDFLoader
          (defaults to "resume.pdf").
    Returns the result of PyPDFLoader(path).load().
    '''
    return PyPDFLoader(path).load()
def get_embeddings(documents, chunk_size: int = 1000, chunk_overlap: int = 0):
    '''
    Split documents into chunks and build the OpenAI embeddings object.

    Despite the name, this function does not call any embed method itself:
    it only constructs the OpenAIEmbeddings object that is later handed to
    the vectorstore together with the chunks.

    documents: documents to split (e.g. the result of load_pdf).
    chunk_size: chunk_size passed to CharacterTextSplitter (default 1000).
    chunk_overlap: chunk_overlap passed to CharacterTextSplitter (default 0).
    Returns a (texts, embeddings) tuple: the split documents and the
    OpenAIEmbeddings object.
    '''
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = splitter.split_documents(documents)
    # Uses the module-level key read from the OPENAI_API_KEY env variable.
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return texts, embeddings
def get_db(texts, embeddings):
    '''
    Build a Chroma vectorstore from split documents and an embeddings object.

    texts: the documents to index (first element returned by get_embeddings).
    embeddings: the embeddings object Chroma should use.
    Returns the result of Chroma.from_documents(texts, embeddings).
    '''
    return Chroma.from_documents(texts, embeddings)
def get_retriever(db, k: int = 1):
    '''
    Get a similarity-search retriever from a vectorstore.

    db: any object exposing as_retriever(search_type=..., search_kwargs=...).
    k: value passed as search_kwargs["k"]; defaults to 1, the value that was
       previously hard-coded.
    Returns whatever db.as_retriever(...) returns.
    '''
    return db.as_retriever(search_type="similarity", search_kwargs={"k": k})
def get_chain_for_pdf(path):
    '''
    Build a RetrievalQA chain over the pdf found at `path`.

    The pdf is loaded, split, indexed in a vectorstore and wrapped in a
    retriever; that retriever is then combined with a ChatOpenAI model
    (temperature 0) into a "stuff" RetrievalQA chain that is configured to
    return its source documents.

    path: location of the pdf, forwarded to load_pdf.
    Returns the chain created by RetrievalQA.from_chain_type.
    '''
    texts, embeddings = get_embeddings(load_pdf(path))
    retriever = get_retriever(get_db(texts, embeddings))
    # The LLM is created only after the retriever is ready, as before.
    llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )