Noobian committed on
Commit
aa0a1b8
1 Parent(s): 21dffd3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -7
app.py CHANGED
@@ -2,6 +2,8 @@ import streamlit as st
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
 
 
5
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
  from langchain.vectorstores import FAISS
7
  from langchain.chat_models import ChatOpenAI
@@ -18,18 +20,43 @@ def get_pdf_text(pdf_docs):
18
  text += page.extract_text()
19
  return text
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def get_text_chunks(text):
23
- text_splitter = CharacterTextSplitter(
24
- separator="\n",
25
- chunk_size=1000,
26
- chunk_overlap=200,
27
- length_function=len
28
- )
29
- chunks = text_splitter.split_text(text)
 
 
 
30
  return chunks
31
 
32
 
 
 
33
  def get_vectorstore(text_chunks):
34
  #embeddings = OpenAIEmbeddings()
35
  embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
 
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
8
  from langchain.vectorstores import FAISS
9
  from langchain.chat_models import ChatOpenAI
 
20
  text += page.extract_text()
21
  return text
22
 
23
+ #@st.cache_resource
24
+ def split_texts(text, chunk_size, overlap, split_method):
25
+
26
+ # Split texts
27
+ # IN: text, chunk size, overlap, split_method
28
+ # OUT: list of str splits
29
+
30
+ st.info("`Splitting doc ...`")
31
+
32
+ split_method = "RecursiveTextSplitter"
33
+ text_splitter = RecursiveCharacterTextSplitter(
34
+ chunk_size=chunk_size, chunk_overlap=overlap)
35
+
36
+ splits = text_splitter.split_text(text)
37
+ if not splits:
38
+ st.error("Failed to split document")
39
+ st.stop()
40
+
41
+ return splits
42
+
43
 
44
  def get_text_chunks(text):
45
+ # text_splitter = CharacterTextSplitter(
46
+ # separator="\n",
47
+ # chunk_size=1000,
48
+ # chunk_overlap=200,
49
+ # length_function=len
50
+ # )
51
+ # chunks = text_splitter.split_text(text)
52
+
53
+ chunks = split_texts(text, 1000, 200, "RecursiveCharacterTextSplitter")
54
+
55
  return chunks
56
 
57
 
58
+
59
+
60
  def get_vectorstore(text_chunks):
61
  #embeddings = OpenAIEmbeddings()
62
  embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")