Spaces:
Sleeping
Sleeping
dinhquangson
committed on
Commit
•
477ed7e
1
Parent(s):
4ae7ab5
Update app.py
Browse files
app.py
CHANGED
@@ -20,7 +20,7 @@ from htmlTemplates import css, bot_template, user_template
|
|
20 |
from langchain.llms import HuggingFaceHub
|
21 |
|
22 |
|
23 |
-
def
|
24 |
"""
|
25 |
Extract text from a list of PDF documents.
|
26 |
|
@@ -35,16 +35,15 @@ def get_pdf_text(pdf_docs):
|
|
35 |
Extracted text from all the PDF documents.
|
36 |
|
37 |
"""
|
38 |
-
|
39 |
for pdf in pdf_docs:
|
40 |
pdf_loader = UnstructuredPDFLoader(pdf)
|
41 |
pdf_pages = pdf_loader.load_and_split()
|
42 |
-
|
43 |
-
|
44 |
-
return text
|
45 |
|
46 |
|
47 |
-
def get_text_chunks(
|
48 |
"""
|
49 |
Split the input text into chunks.
|
50 |
|
@@ -62,7 +61,7 @@ def get_text_chunks(text):
|
|
62 |
text_splitter = RecursiveCharacterTextSplitter(
|
63 |
chunk_size=1024, chunk_overlap=64
|
64 |
)
|
65 |
-
texts = text_splitter.
|
66 |
return texts
|
67 |
|
68 |
|
@@ -173,10 +172,10 @@ def main():
|
|
173 |
if st.button("Process"):
|
174 |
with st.spinner("Processing"):
|
175 |
# get the raw text
|
176 |
-
|
177 |
|
178 |
# get the text chunks
|
179 |
-
text_chunks = get_text_chunks(
|
180 |
|
181 |
# create vector store
|
182 |
vectorstore = get_vectorstore(text_chunks)
|
|
|
20 |
from langchain.llms import HuggingFaceHub
|
21 |
|
22 |
|
23 |
+
def get_pdf_pages(pdf_docs):
|
24 |
"""
|
25 |
Extract text from a list of PDF documents.
|
26 |
|
|
|
35 |
Extracted text from all the PDF documents.
|
36 |
|
37 |
"""
|
38 |
+
pages = []
|
39 |
for pdf in pdf_docs:
|
40 |
pdf_loader = UnstructuredPDFLoader(pdf)
|
41 |
pdf_pages = pdf_loader.load_and_split()
|
42 |
+
pages=paegs+pdf_pages
|
43 |
+
return pages
|
|
|
44 |
|
45 |
|
46 |
+
def get_text_chunks(pages):
|
47 |
"""
|
48 |
Split the input text into chunks.
|
49 |
|
|
|
61 |
text_splitter = RecursiveCharacterTextSplitter(
|
62 |
chunk_size=1024, chunk_overlap=64
|
63 |
)
|
64 |
+
texts = text_splitter.split_documents(pages)
|
65 |
return texts
|
66 |
|
67 |
|
|
|
172 |
if st.button("Process"):
|
173 |
with st.spinner("Processing"):
|
174 |
# get the raw text
|
175 |
+
pages = get_pdf_pages(pdf_docs)
|
176 |
|
177 |
# get the text chunks
|
178 |
+
text_chunks = get_text_chunks(pages)
|
179 |
|
180 |
# create vector store
|
181 |
vectorstore = get_vectorstore(text_chunks)
|