Spaces:
Sleeping
Sleeping
dinhquangson
committed on
Commit
•
142ca34
1
Parent(s):
16a13e4
Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ import os
|
|
9 |
import streamlit as st
|
10 |
from dotenv import load_dotenv
|
11 |
from PyPDF2 import PdfReader
|
12 |
-
from langchain.text_splitter import
|
13 |
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
14 |
from langchain.vectorstores import FAISS
|
15 |
from langchain.chat_models import ChatOpenAI
|
@@ -42,7 +42,7 @@ def get_pdf_text(pdf_docs):
|
|
42 |
return text
|
43 |
|
44 |
|
45 |
-
def
|
46 |
"""
|
47 |
Split the input text into chunks.
|
48 |
|
@@ -57,11 +57,11 @@ def get_text_chunks(text):
|
|
57 |
List of text chunks.
|
58 |
|
59 |
"""
|
60 |
-
text_splitter =
|
61 |
-
|
62 |
)
|
63 |
-
|
64 |
-
return
|
65 |
|
66 |
|
67 |
def get_vectorstore(text_chunks):
|
@@ -170,11 +170,8 @@ def main():
|
|
170 |
)
|
171 |
if st.button("Process"):
|
172 |
with st.spinner("Processing"):
|
173 |
-
# get pdf text
|
174 |
-
raw_text = get_pdf_text(pdf_docs)
|
175 |
-
|
176 |
# get the text chunks
|
177 |
-
text_chunks =
|
178 |
|
179 |
# create vector store
|
180 |
vectorstore = get_vectorstore(text_chunks)
|
|
|
9 |
import streamlit as st
|
10 |
from dotenv import load_dotenv
|
11 |
from PyPDF2 import PdfReader
|
12 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
14 |
from langchain.vectorstores import FAISS
|
15 |
from langchain.chat_models import ChatOpenAI
|
|
|
42 |
return text
|
43 |
|
44 |
|
45 |
+
def get_texts(pdf_pages):
|
46 |
"""
|
47 |
Split the input text into chunks.
|
48 |
|
|
|
57 |
List of text chunks.
|
58 |
|
59 |
"""
|
60 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
61 |
+
chunk_size=1024, chunk_overlap=64
|
62 |
)
|
63 |
+
texts = text_splitter.split_text(pdf_pages)
|
64 |
+
return texts
|
65 |
|
66 |
|
67 |
def get_vectorstore(text_chunks):
|
|
|
170 |
)
|
171 |
if st.button("Process"):
|
172 |
with st.spinner("Processing"):
|
|
|
|
|
|
|
173 |
# get the text chunks
|
174 |
+
text_chunks = get_texts(pdf_docs)
|
175 |
|
176 |
# create vector store
|
177 |
vectorstore = get_vectorstore(text_chunks)
|