Phi2-PDF-chat

Sleeping

dinhquangson committed on Jan 21

Commit

142ca34

•

1 Parent(s): 16a13e4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import os
 import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
@@ -42,7 +42,7 @@ def get_pdf_text(pdf_docs):
     return text
-def get_text_chunks(text):
     """
     Split the input text into chunks.
@@ -57,11 +57,11 @@ def get_text_chunks(text):
         List of text chunks.
     """
-    text_splitter = CharacterTextSplitter(
-        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
     )
-    chunks = text_splitter.split_text(text)
-    return chunks
 def get_vectorstore(text_chunks):
@@ -170,11 +170,8 @@ def main():
         )
         if st.button("Process"):
             with st.spinner("Processing"):
-                # get pdf text
-                raw_text = get_pdf_text(pdf_docs)
                 # get the text chunks
-                text_chunks = get_text_chunks(raw_text)
                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)

 import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
     return text
+def get_texts(pdf_pages):
     """
     Split the input text into chunks.
         List of text chunks.
     """
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1024, chunk_overlap=64
     )
+    texts = text_splitter.split_text(pdf_pages)
+    return texts
 def get_vectorstore(text_chunks):
         )
         if st.button("Process"):
             with st.spinner("Processing"):
                 # get the text chunks
+                text_chunks = get_texts(pdf_docs)
                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)