Spaces:
Sleeping
Sleeping
File size: 1,698 Bytes
6c6956f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
'''
This module contains the loaders: helpers that read a PDF, index it in a
Chroma vector store, and build a RetrievalQA chain over it.
'''
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# Read once at import time; if OPENAI_API_KEY is not set this raises KeyError
# and the module fails to import.
openai_api_key = os.environ["OPENAI_API_KEY"]
def load_pdf(path: str = "resume.pdf"):
    '''
    Load the PDF at ``path`` using PyPDFLoader.

    path: location of the PDF to read (defaults to "resume.pdf").
    Returns the result of ``PyPDFLoader(path).load()`` unchanged.
    '''
    return PyPDFLoader(path).load()
def get_embeddings(documents, chunk_size: int = 1000, chunk_overlap: int = 0):
    '''
    Split documents into chunks and build the embeddings object for them.

    Despite the name, this function only constructs the OpenAIEmbeddings
    object; it does not call any embed method on it.

    documents: the documents to split (e.g. the output of load_pdf).
    chunk_size: forwarded to CharacterTextSplitter (default 1000).
    chunk_overlap: forwarded to CharacterTextSplitter (default 0).

    Returns a ``(texts, embeddings)`` tuple: the split documents and the
    OpenAIEmbeddings instance configured with the module-level API key.
    '''
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return texts, embeddings
def get_db(texts, embeddings):
    '''
    Build a Chroma vectorstore from split documents.

    texts: the documents to index.
    embeddings: the embeddings object handed to Chroma alongside them.
    Returns the result of ``Chroma.from_documents(texts, embeddings)``.
    '''
    return Chroma.from_documents(texts, embeddings)
def get_retriever(db, k: int = 1, search_type: str = "similarity"):
    '''
    Get a retriever from a vectorstore.

    db: object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
    k: value placed under the "k" key of ``search_kwargs`` (default 1,
        the value that used to be hard-coded).
    search_type: forwarded as ``search_type`` (default "similarity").

    Returns whatever ``db.as_retriever(...)`` returns.
    '''
    return db.as_retriever(
        search_type=search_type,
        search_kwargs={"k": k},
    )
def get_chain_for_pdf(path):
    '''
    Build a RetrievalQA chain over the PDF at ``path``.

    The PDF is loaded, split, indexed into a vectorstore and wrapped in a
    retriever using this module's helpers; the retriever is then combined
    with a ChatOpenAI model (temperature 0) in a "stuff" chain that also
    returns its source documents.
    '''
    pages = load_pdf(path)
    chunks, embedder = get_embeddings(pages)
    pdf_retriever = get_retriever(get_db(chunks, embedder))

    # The model is created only after the retriever is ready, as before.
    llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=pdf_retriever,
        return_source_documents=True,
    )
|