gkrthk committed on
Commit
4247ae4
1 Parent(s): 26cc997

This is a combination of 2 commits.

Browse files

fix error

move print

update

Files changed (1) hide show
  1. confluence_qa.py +7 -4
confluence_qa.py CHANGED
@@ -1,5 +1,5 @@
1
  from langchain.document_loaders import ConfluenceLoader
2
- from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
4
  from langchain.llms.huggingface_pipeline import HuggingFacePipeline
5
  from langchain.prompts import PromptTemplate
@@ -28,11 +28,14 @@ class ConfluenceQA:
28
  url=confluence_url, username=username, api_key=api_key
29
  )
30
  documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
31
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
32
  documents = text_splitter.split_documents(documents)
33
  print(documents)
34
- text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002
35
- texts = text_splitter.split_documents(texts)
 
 
 
36
  self.db = Chroma.from_documents(documents, self.embeddings)
37
 
38
 
 
1
  from langchain.document_loaders import ConfluenceLoader
2
+ from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter,RecursiveCharacterTextSplitter
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
4
  from langchain.llms.huggingface_pipeline import HuggingFacePipeline
5
  from langchain.prompts import PromptTemplate
 
28
  url=confluence_url, username=username, api_key=api_key
29
  )
30
  documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
31
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
32
  documents = text_splitter.split_documents(documents)
33
  print(documents)
34
+ # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
35
+ # documents = text_splitter.split_documents(documents)
36
+
37
+ # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002
38
+ # documents = text_splitter.split_documents(documents)
39
  self.db = Chroma.from_documents(documents, self.embeddings)
40
 
41