gkrthk commited on
Commit
d3deef7
1 Parent(s): 396decf
Files changed (1) hide show
  1. confluence_qa.py +5 -5
confluence_qa.py CHANGED
@@ -15,7 +15,7 @@ class ConfluenceQA:
15
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
16
  model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
17
  pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, truncation=True)
18
- self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 1},)
19
 
20
  def store_in_vector_db(self) -> None:
21
  persist_directory = self.config.get("persist_directory",None)
@@ -33,11 +33,11 @@ class ConfluenceQA:
33
  # print(documents)
34
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
35
  documents = text_splitter.split_documents(documents)
36
- text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10)
37
- documents = text_splitter.split_documents(documents)
38
- print(documents)
39
- # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This is the encoding for text-embedding-ada-002
40
  # documents = text_splitter.split_documents(documents)
 
 
 
41
  self.db = Chroma.from_documents(documents, self.embeddings)
42
 
43
 
 
15
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
16
  model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
17
  pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, truncation=True)
18
+ self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 0},)
19
 
20
  def store_in_vector_db(self) -> None:
21
  persist_directory = self.config.get("persist_directory",None)
 
33
  # print(documents)
34
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
35
  documents = text_splitter.split_documents(documents)
36
+ # text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10)
 
 
 
37
  # documents = text_splitter.split_documents(documents)
38
+ # print(documents)
39
+ text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This is the encoding for text-embedding-ada-002
40
+ documents = text_splitter.split_documents(documents)
41
  self.db = Chroma.from_documents(documents, self.embeddings)
42
 
43