gkrthk committed on
Commit
d30c111
1 Parent(s): 4247ae4
Files changed (1)
  1. confluence_qa.py +7 -5
confluence_qa.py CHANGED
@@ -1,5 +1,5 @@
 from langchain.document_loaders import ConfluenceLoader
-from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter,RecursiveCharacterTextSplitter
+from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
 from langchain.llms.huggingface_pipeline import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
@@ -28,12 +28,14 @@ class ConfluenceQA:
             url=confluence_url, username=username, api_key=api_key
         )
         documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+        # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+        # documents = text_splitter.split_documents(documents)
+        # print(documents)
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+        documents = text_splitter.split_documents(documents)
+        text_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000, chunk_overlap=10)
         documents = text_splitter.split_documents(documents)
         print(documents)
-        # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
-        # documents = text_splitter.split_documents(documents)
-
         # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This is the encoding for text-embedding-ada-002
         # documents = text_splitter.split_documents(documents)
         self.db = Chroma.from_documents(documents, self.embeddings)
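For reference, below is a minimal, self-contained sketch (not part of the commit) of the two-stage splitting pattern this change introduces: a CharacterTextSplitter pass followed by a SentenceTransformersTokenTextSplitter pass over the same documents. The sample Document text, variable names, and parameters are illustrative assumptions; in the committed method the ConfluenceLoader output is split this way and the result is handed to Chroma.from_documents with the class's embeddings.

from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Stand-in for the ConfluenceLoader output; the page text here is made up.
docs = [Document(page_content="Example Confluence page text.\n\n" * 200)]

# First pass: character-based chunks of roughly 1000 characters with a small overlap.
char_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = char_splitter.split_documents(docs)

# Second pass: token-based re-splitting using the splitter's default
# sentence-transformers model (downloaded on first use). For this splitter the
# effective chunk length is governed by tokens_per_chunk, bounded by the
# model's max sequence length, rather than by chunk_size.
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10)
docs = token_splitter.split_documents(docs)

print(len(docs), docs[0].page_content[:80])

Because the token splitter runs on the character-sized chunks, the final chunks that reach Chroma.from_documents stay within the sentence-transformers model's token limit.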