dinhquangson committed on
Commit
477ed7e
1 Parent(s): 4ae7ab5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -9
app.py CHANGED
@@ -20,7 +20,7 @@ from htmlTemplates import css, bot_template, user_template
20
  from langchain.llms import HuggingFaceHub
21
 
22
 
23
- def get_pdf_text(pdf_docs):
24
  """
25
  Extract text from a list of PDF documents.
26
 
@@ -35,16 +35,15 @@ def get_pdf_text(pdf_docs):
35
  Extracted text from all the PDF documents.
36
 
37
  """
38
- text = ""
39
  for pdf in pdf_docs:
40
  pdf_loader = UnstructuredPDFLoader(pdf)
41
  pdf_pages = pdf_loader.load_and_split()
42
- for page in pdf_pages:
43
- text += page.extract_text()
44
- return text
45
 
46
 
47
- def get_text_chunks(text):
48
  """
49
  Split the input text into chunks.
50
 
@@ -62,7 +61,7 @@ def get_text_chunks(text):
62
  text_splitter = RecursiveCharacterTextSplitter(
63
  chunk_size=1024, chunk_overlap=64
64
  )
65
- texts = text_splitter.split_text(text)
66
  return texts
67
 
68
 
@@ -173,10 +172,10 @@ def main():
173
  if st.button("Process"):
174
  with st.spinner("Processing"):
175
  # get the raw text
176
- text = get_pdf_text(pdf_docs)
177
 
178
  # get the text chunks
179
- text_chunks = get_text_chunks(text)
180
 
181
  # create vector store
182
  vectorstore = get_vectorstore(text_chunks)
 
20
  from langchain.llms import HuggingFaceHub
21
 
22
 
23
+ def get_pdf_pages(pdf_docs):
24
  """
25
  Extract text from a list of PDF documents.
26
 
 
35
  Extracted text from all the PDF documents.
36
 
37
  """
38
+ pages = []
39
  for pdf in pdf_docs:
40
  pdf_loader = UnstructuredPDFLoader(pdf)
41
  pdf_pages = pdf_loader.load_and_split()
42
+ pages = pages + pdf_pages
43
+ return pages
 
44
 
45
 
46
+ def get_text_chunks(pages):
47
  """
48
  Split the input text into chunks.
49
 
 
61
  text_splitter = RecursiveCharacterTextSplitter(
62
  chunk_size=1024, chunk_overlap=64
63
  )
64
+ texts = text_splitter.split_documents(pages)
65
  return texts
66
 
67
 
 
172
  if st.button("Process"):
173
  with st.spinner("Processing"):
174
  # get the raw text
175
+ pages = get_pdf_pages(pdf_docs)
176
 
177
  # get the text chunks
178
+ text_chunks = get_text_chunks(pages)
179
 
180
  # create vector store
181
  vectorstore = get_vectorstore(text_chunks)