Spaces:
Sleeping
Sleeping
'''
This module contains the document loaders, plus helpers that build the
embeddings, vector store, retriever and retrieval QA chain on top of them.
'''
import os | |
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.chains import RetrievalQA | |
from langchain.chat_models import ChatOpenAI | |
from constants import TEMPERATURE, MODEL_NAME | |
# Looked up once when the module is imported; a missing OPENAI_API_KEY
# environment variable raises KeyError right here.
openai_api_key = os.environ['OPENAI_API_KEY']
def load_pdf(path: str = "resume.pdf"):
    '''
    Load the PDF file located at ``path``.

    Returns whatever ``PyPDFLoader(path).load()`` produces for that file.
    '''
    return PyPDFLoader(path).load()
def load_multiple_documents(path: str = "documents"):
    '''
    Load every supported document found directly inside a folder.

    Entries are selected by file-name suffix: ``.pdf`` is handed to
    PyPDFLoader, ``.docx`` and ``.doc`` to Docx2txtLoader, and ``.txt`` to
    TextLoader. Entries with any other suffix are ignored. Only the folder
    itself is listed; sub-folders are not descended into.

    Args:
        path: Folder to scan. Each matching file is opened from this folder.

    Returns:
        A flat list holding the documents produced by every loader, in the
        order ``os.listdir`` reported the entries. Empty when nothing in the
        folder matches.
    '''
    documents = []
    for file in os.listdir(path):
        if file.endswith('.pdf'):
            loader_cls = PyPDFLoader
        elif file.endswith(('.docx', '.doc')):
            loader_cls = Docx2txtLoader
        elif file.endswith('.txt'):
            loader_cls = TextLoader
        else:
            # Unsupported suffix: skip without touching any loader.
            continue
        # Build the file location from `path` itself instead of a fixed
        # './documents/' prefix, so a folder other than the default is
        # actually read from the place that was listed.
        documents.extend(loader_cls(os.path.join(path, file)).load())
    return documents
def get_embeddings(documents):
    '''
    Split the documents into chunks and create the embeddings object.

    Returns a ``(texts, embeddings)`` pair: the chunks produced by a
    CharacterTextSplitter (chunk_size=1000, chunk_overlap=10) and an
    OpenAIEmbeddings instance created with the module-level API key.
    '''
    chunks = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=10,
    ).split_documents(documents)
    return chunks, OpenAIEmbeddings(openai_api_key=openai_api_key)
def get_db(texts, embeddings):
    '''
    Build a Chroma vectorstore from the given texts and embeddings.
    '''
    return Chroma.from_documents(texts, embeddings)
def get_retriever(db):
    '''
    Ask the vectorstore for a retriever configured with
    search_type "similarity" and k=1 in its search kwargs.
    '''
    retriever_options = {
        "search_type": "similarity",
        "search_kwargs": {"k": 1},
    }
    return db.as_retriever(**retriever_options)
def get_chain_for_pdf(path):
    '''
    Build a RetrievalQA chain over the documents found in ``path``.

    The folder is loaded with load_multiple_documents, split and paired with
    embeddings by get_embeddings, stored through get_db, and exposed through
    get_retriever. The chain uses a ChatOpenAI model configured from
    TEMPERATURE and MODEL_NAME, the "stuff" chain type, and is asked to
    return source documents.
    '''
    chunks, embedder = get_embeddings(load_multiple_documents(path))
    doc_retriever = get_retriever(get_db(chunks, embedder))
    chat_model = ChatOpenAI(
        temperature=TEMPERATURE,
        openai_api_key=openai_api_key,
        model=MODEL_NAME,
    )
    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=doc_retriever,
        return_source_documents=True,
    )