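"""Streamlit app for question answering over uploaded PDFs.

Each uploaded PDF is split into chunks, embedded with OpenAI embeddings, indexed
in a FAISS vector store, and queried through a RetrievalQA chain. Typical usage
(assuming this file is saved as app.py and OPENAI_API_KEY is set in .env or the
environment):

    streamlit run app.py
"""
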
import os
import tempfile

import streamlit as st
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load environment variables
load_dotenv()

# Initialize OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')

# Initialize embedding model using OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-3-small")
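# text-embedding-3-small is OpenAI's smaller, cheaper embedding model; a larger
# model such as text-embedding-3-large could be swapped in if retrieval quality
# matters more than cost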

# Keep the vector store and uploaded-PDF paths in st.session_state so they survive
# Streamlit reruns (module-level globals are reset every time the script reruns)
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
if "pdf_files" not in st.session_state:
    st.session_state.pdf_files = {}

# Define the path for the FAISS index
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")
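
# Note: the index saved in process_pdf() is never reloaded by this script. If the
# index should survive app restarts, a sketch along these lines could run at startup
# (allow_dangerous_deserialization is needed because save_local pickles the docstore):
#   if st.session_state.vector_store is None and os.path.exists(FAISS_INDEX_FILE):
#       st.session_state.vector_store = FAISS.load_local(
#           FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
#       )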

# Deliberately not wrapped in @st.cache_resource: the function updates
# st.session_state, and a cached call would skip that side effect on reruns;
# main() instead skips files that have already been processed.
def process_pdf(uploaded_file):
    """Load an uploaded PDF, split it into chunks, and add them to the session's vector store."""
    
    # Create a temporary file to store the uploaded PDF
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name

    loader = PyPDFLoader(tmp_file_path)
    documents = loader.load()

    # Use the original file name (not the temp path) as the source so later
    # source attribution can be matched back to the uploaded PDF
    for doc in documents:
        doc.metadata['source'] = uploaded_file.name
    st.session_state.pdf_files[uploaded_file.name] = tmp_file_path

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
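    # Note: chunk_size and chunk_overlap above are character counts, not tokens;
    # 1000/200 is a common starting point and can be tuned for the corpus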

    if st.session_state.vector_store is None:
        st.session_state.vector_store = FAISS.from_documents(texts, embeddings)
    else:
        st.session_state.vector_store.add_documents(texts)

    # Save the updated vector store
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    st.session_state.vector_store.save_local(FAISS_INDEX_PATH)

    # The temporary PDF copy is intentionally kept on disk: the download buttons
    # in main() read it back when the file is cited as a source

def main():
    st.title("PDF Question Answering System")

    # File uploader
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file is not None:
        # Streamlit reruns this script on every interaction, so only process a
        # file the first time it is seen
        if uploaded_file.name not in st.session_state.pdf_files:
            process_pdf(uploaded_file)
        st.success(f"PDF '{uploaded_file.name}' processed. You can now ask questions!")

    # User input
    user_question = st.text_input("Ask a question about the PDFs:")

    if user_question:
        vector_store = st.session_state.vector_store
        if vector_store is None:
            st.error("No PDFs have been uploaded yet. Please upload a PDF before asking a question.")
            return

        # Retrieve the three most relevant chunks for each question
        retriever = vector_store.as_retriever(search_kwargs={"k": 3})

        # Initialize the OpenAI language model
        llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)
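        # temperature=0 keeps answers as deterministic as possible, which suits
        # retrieval-grounded Q&A better than creative sampling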

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True
        )
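        # chain_type="stuff" concatenates all retrieved chunks into a single prompt:
        # simple and effective for a few small chunks, but it can exceed the model's
        # context window if k or chunk_size grows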

        # Calling a chain directly is deprecated; invoke() takes the input under the
        # chain's "query" key and returns the answer plus the retrieved source documents
        result = qa_chain.invoke({"query": user_question})
        answer = result['result']
        source_docs = result['source_documents']

        st.write("Answer:", answer)

        if source_docs:
            st.subheader("Sources:")
            unique_sources = set()
            for doc in source_docs:
                file_name = os.path.basename(doc.metadata['source'])
                if file_name in st.session_state.pdf_files and file_name not in unique_sources:
                    unique_sources.add(file_name)
                    file_path = st.session_state.pdf_files[file_name]
                    st.write(f"Source: {file_name}")
                    with open(file_path, "rb") as file:
                        st.download_button(
                            label=f"Download {file_name}",
                            data=file,
                            file_name=file_name,
                            mime="application/pdf"
                        )

            other_sources = [
                doc.metadata['source']
                for doc in source_docs
                if os.path.basename(doc.metadata['source']) not in st.session_state.pdf_files
            ]
            unique_other_sources = set(other_sources)
            if unique_other_sources:
                st.subheader("Other Sources:")
                for source in unique_other_sources:
                    st.write(f"- {source}")

if __name__ == "__main__":
    main()