dinhquangson committed
Commit 4ae7ab5 • Parent(s): 142ca34
Update app.py
app.py CHANGED
@@ -10,6 +10,7 @@ import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.document_loaders import UnstructuredPDFLoader
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
@@ -36,13 +37,14 @@ def get_pdf_text(pdf_docs):
     """
     text = ""
     for pdf in pdf_docs:
-
-
+        pdf_loader = UnstructuredPDFLoader(pdf)
+        pdf_pages = pdf_loader.load_and_split()
+        for page in pdf_pages:
             text += page.extract_text()
     return text


-def get_texts(pdf_pages):
+def get_text_chunks(text):
     """
     Split the input text into chunks.

@@ -60,7 +62,7 @@ def get_texts(pdf_pages):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1024, chunk_overlap=64
     )
-    texts = text_splitter.split_text(
+    texts = text_splitter.split_text(text)
     return texts


@@ -170,8 +172,11 @@ def main():
         )
         if st.button("Process"):
             with st.spinner("Processing"):
+                # get the raw text
+                text = get_pdf_text(pdf_docs)
+
                 # get the text chunks
-                text_chunks =
+                text_chunks = get_text_chunks(text)

                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)
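For context, a minimal standalone sketch of the load, chunk, and index flow this commit moves toward, not the app's own code: UnstructuredPDFLoader (which requires the unstructured package) returns LangChain Document objects whose text is read via page_content rather than PyPDF2's extract_text(); the FAISS and HuggingFaceBgeEmbeddings step is an assumption based on the file's imports, and "example.pdf" is a placeholder path.

# Minimal sketch of the load -> chunk -> index flow implied by this diff.
# "example.pdf" is a placeholder; the FAISS step is inferred from the imports,
# not copied from the app's get_vectorstore().
from langchain.document_loaders import UnstructuredPDFLoader  # needs the unstructured package
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS

# load_and_split() returns LangChain Document objects; their text is in
# .page_content (Documents have no extract_text() method).
pages = UnstructuredPDFLoader("example.pdf").load_and_split()
text = "".join(page.page_content for page in pages)

# Same splitter settings as the commit: 1024-character chunks with 64 overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
chunks = splitter.split_text(text)

# Index the chunks with BGE embeddings (default model) in an in-memory FAISS store.
vectorstore = FAISS.from_texts(texts=chunks, embedding=HuggingFaceBgeEmbeddings())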