Spaces:
Sleeping
Sleeping
File size: 2,540 Bytes
6c6956f 0cae9a4 6c6956f 0cae9a4 6c6956f 0cae9a4 6c6956f 0cae9a4 6c6956f 0cae9a4 6c6956f 0cae9a4 6c6956f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
'''
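This module contains the document loaders and the helpers that build the embeddings, vector store, retriever and question-answering chain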
'''
import os
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from constants import TEMPERATURE, MODEL_NAME
# Read the OpenAI API key once, at import time. If the OPENAI_API_KEY
# environment variable is not set, this lookup raises KeyError and the
# module fails to import.
openai_api_key=os.environ['OPENAI_API_KEY']
def load_pdf(path: str = "resume.pdf"):
    '''
    Load a single PDF file from a filesystem path.

    path: location of the PDF to read (defaults to "resume.pdf").
    Returns whatever PyPDFLoader.load() produces for that file.
    '''
    return PyPDFLoader(path).load()
def load_multiple_documents(path: str = "documents"):
    '''
    Load every supported document found directly inside a folder.

    Supported extensions are .pdf (PyPDFLoader), .docx / .doc
    (Docx2txtLoader) and .txt (TextLoader); any other entry is skipped.

    path: folder to scan (defaults to "documents"). Files are read from
        this folder itself, not from a fixed "./documents/" location.
    Returns a flat list with the documents produced by each loader.
    Raises whatever os.listdir raises when the folder cannot be listed
    (for example FileNotFoundError for a missing folder).
    '''
    documents = []
    # os.listdir gives no ordering guarantee; sort so the resulting
    # document order is the same on every run and platform.
    for file in sorted(os.listdir(path)):
        # Build the path from the folder that was actually listed, so a
        # non-default `path` reads the files it enumerated.
        file_path = os.path.join(path, file)
        if file.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif file.endswith(('.docx', '.doc')):
            # NOTE(review): legacy .doc files are routed to Docx2txtLoader,
            # as before - confirm that loader can really read them.
            loader = Docx2txtLoader(file_path)
        elif file.endswith('.txt'):
            loader = TextLoader(file_path)
        else:
            # Unsupported extension: ignore the entry.
            continue
        documents.extend(loader.load())
    return documents
def get_embeddings(documents):
    '''
    Split documents into chunks and create the embeddings client.

    documents: the loaded documents to split.
    Returns a (texts, embeddings) pair: the chunks produced by a
    CharacterTextSplitter (chunk_size=1000, chunk_overlap=10) and an
    OpenAIEmbeddings instance configured with the module-level API key.
    '''
    chunks = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=10,
    ).split_documents(documents)
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return chunks, embedder
def get_db(texts, embeddings):
    '''
    Build a Chroma vectorstore from split texts and an embeddings object.

    texts: the document chunks to index.
    embeddings: the embeddings object handed to Chroma.
    Returns the result of Chroma.from_documents(texts, embeddings).
    '''
    return Chroma.from_documents(texts, embeddings)
def get_retriever(db):
    '''
    Create a similarity retriever from a vectorstore.

    db: an object exposing as_retriever(search_type=..., search_kwargs=...).
    Returns the retriever built with search_type "similarity" and
    search_kwargs {"k": 1}.
    '''
    search_settings = {"k": 1}
    return db.as_retriever(
        search_type="similarity",
        search_kwargs=search_settings,
    )
def get_chain_for_pdf(path):
    '''
    Build a RetrievalQA chain over the documents found in a folder.

    path: folder passed to load_multiple_documents.
    Returns a RetrievalQA chain of type "stuff" that uses a ChatOpenAI
    model (TEMPERATURE and MODEL_NAME from constants) and is configured
    with return_source_documents=True.
    '''
    # Pipeline: load -> split/embed -> vectorstore -> retriever.
    split_texts, embedder = get_embeddings(load_multiple_documents(path))
    doc_retriever = get_retriever(get_db(split_texts, embedder))

    chat_model = ChatOpenAI(
        temperature=TEMPERATURE,
        openai_api_key=openai_api_key,
        model=MODEL_NAME,
    )
    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=doc_retriever,
        return_source_documents=True,
    )
|