Phi2-PDF-chat

Sleeping

dinhquangson committed on Jan 21

Commit

477ed7e

•

1 Parent(s): 4ae7ab5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub
-def get_pdf_text(pdf_docs):
     """
     Extract text from a list of PDF documents.
@@ -35,16 +35,15 @@ def get_pdf_text(pdf_docs):
         Extracted text from all the PDF documents.
     """
-    text = ""
     for pdf in pdf_docs:
         pdf_loader = UnstructuredPDFLoader(pdf)
         pdf_pages = pdf_loader.load_and_split()
-        for page in pdf_pages:
-            text += page.extract_text()
-    return text
-def get_text_chunks(text):
     """
     Split the input text into chunks.
@@ -62,7 +61,7 @@ def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1024, chunk_overlap=64
     )
-    texts = text_splitter.split_text(text)
     return texts
@@ -173,10 +172,10 @@ def main():
         if st.button("Process"):
             with st.spinner("Processing"):
                 # get the raw text
-                text = get_pdf_text(pdf_docs)
                 # get the text chunks
-                text_chunks = get_text_chunks(text)
                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)

 from langchain.llms import HuggingFaceHub
+def get_pdf_pages(pdf_docs):
     """
     Extract text from a list of PDF documents.
         Extracted text from all the PDF documents.
     """
+    pages = []
     for pdf in pdf_docs:
         pdf_loader = UnstructuredPDFLoader(pdf)
         pdf_pages = pdf_loader.load_and_split()
+        pages = pages + pdf_pages
+    return pages
+def get_text_chunks(pages):
     """
     Split the input text into chunks.
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1024, chunk_overlap=64
     )
+    texts = text_splitter.split_documents(pages)
     return texts
         if st.button("Process"):
             with st.spinner("Processing"):
                 # get the raw text
+                pages = get_pdf_pages(pdf_docs)
                 # get the text chunks
+                text_chunks = get_text_chunks(pages)
                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)