"""Streamlit app: upload PDFs, index them in FAISS, and answer questions with RetrievalQA."""

import os
import tempfile
from typing import List

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load environment variables (expects OPENAI_API_KEY in .env or the environment).
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

# Embedding model used both for indexing and for query-time retrieval.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-3-small")

# Where the FAISS index is persisted between sessions.
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")

# Streamlit re-executes this script top-to-bottom on every interaction, so plain
# module globals are wiped each rerun. st.session_state survives reruns and is
# the supported place for per-session mutable state.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
if "pdf_files" not in st.session_state:
    # Maps uploaded file name -> raw PDF bytes, so download buttons keep
    # working after the temporary on-disk copy has been deleted.
    st.session_state.pdf_files = {}

# Reload an index persisted by a previous session, if one exists. The file is
# produced locally by save_local below, so deserializing it is trusted here.
if st.session_state.vector_store is None and os.path.exists(FAISS_INDEX_FILE):
    st.session_state.vector_store = FAISS.load_local(
        FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
    )


def process_pdf(uploaded_file):
    """Split an uploaded PDF into chunks and add them to the session's FAISS index.

    Args:
        uploaded_file: a Streamlit ``UploadedFile`` holding the PDF contents.

    Side effects:
        Updates ``st.session_state.vector_store`` and ``st.session_state.pdf_files``,
        and persists the index to ``FAISS_INDEX_PATH``.
    """
    pdf_bytes = uploaded_file.getvalue()

    # PyPDFLoader requires a real file path, so stage the upload in a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        documents = PyPDFLoader(tmp_file_path).load()
    finally:
        # Always remove the temp copy, even if PDF parsing fails.
        os.unlink(tmp_file_path)

    # The loader records the temp-file path as each document's source; replace
    # it with the user-visible file name so source lookups in main() match.
    for doc in documents:
        doc.metadata["source"] = uploaded_file.name

    # Keep the raw bytes for the download button (the temp file is gone).
    st.session_state.pdf_files[uploaded_file.name] = pdf_bytes

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = splitter.split_documents(documents)

    if st.session_state.vector_store is None:
        st.session_state.vector_store = FAISS.from_documents(texts, embeddings)
    else:
        st.session_state.vector_store.add_documents(texts)

    # Persist the updated index for future sessions.
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    st.session_state.vector_store.save_local(FAISS_INDEX_PATH)


def main():
    """Render the UI: PDF upload, question box, answer, and source downloads."""
    st.title("PDF Question Answering System")

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is not None:
        # Only index each file once per session; reruns skip re-processing.
        if uploaded_file.name not in st.session_state.pdf_files:
            process_pdf(uploaded_file)
        st.success(f"PDF '{uploaded_file.name}' processed. You can now ask questions!")

    user_question = st.text_input("Ask a question about the PDFs:")
    if not user_question:
        return

    vector_store = st.session_state.vector_store
    if vector_store is None:
        st.error("Error: No PDFs have been uploaded yet.")
        return

    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    # .invoke replaces the deprecated direct call syntax on LangChain chains.
    result = qa_chain.invoke({"query": user_question})
    st.write("Answer:", result["result"])

    source_docs = result["source_documents"]
    if not source_docs:
        return

    st.subheader("Sources:")
    seen = set()
    other_sources = []
    for doc in source_docs:
        source = doc.metadata["source"]
        file_name = os.path.basename(source)
        if file_name in seen:
            continue
        seen.add(file_name)

        data = st.session_state.pdf_files.get(file_name)
        if data is None:
            # Chunks restored from a persisted index may reference files whose
            # bytes are not held in this session; list them without a download.
            other_sources.append(source)
            continue

        st.write(f"Source: {file_name}")
        st.download_button(
            label=f"Download {file_name}",
            data=data,
            file_name=file_name,
            mime="application/pdf",
        )

    if other_sources:
        st.subheader("Other Sources:")
        for source in other_sources:
            st.write(f"- {source}")


if __name__ == "__main__":
    main()