gkrthk committed on
Commit
26cc997
1 Parent(s): 39d1b98

change embedding

Browse files
Files changed (1) hide show
  1. confluence_qa.py +10 -8
confluence_qa.py CHANGED
@@ -1,5 +1,5 @@
1
  from langchain.document_loaders import ConfluenceLoader
2
- from langchain.text_splitter import RecursiveCharacterTextSplitter,TokenTextSplitter
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
4
  from langchain.llms.huggingface_pipeline import HuggingFacePipeline
5
  from langchain.prompts import PromptTemplate
@@ -14,8 +14,8 @@ class ConfluenceQA:
14
  def define_model(self) -> None:
15
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
16
  model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
17
- pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
18
- self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 0, "max_length": 1024},)
19
 
20
  def store_in_vector_db(self) -> None:
21
  persist_directory = self.config.get("persist_directory",None)
@@ -23,17 +23,19 @@ class ConfluenceQA:
23
  username = self.config.get("username",None)
24
  api_key = self.config.get("api_key",None)
25
  space_key = self.config.get("space_key",None)
26
- include_attachment = self.config.get("include_attachment", False)
27
  loader = ConfluenceLoader(
28
  url=confluence_url, username=username, api_key=api_key
29
  )
30
- documents = loader.load(include_attachments=include_attachment, limit=50, space_key=space_key)
31
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
32
  documents = text_splitter.split_documents(documents)
33
- # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10) # This the encoding for text-embedding-ada-002
34
- # texts = text_splitter.split_documents(texts)
 
35
  self.db = Chroma.from_documents(documents, self.embeddings)
36
 
 
37
  def retrieve_qa_chain(self) -> None:
38
  template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.
39
  {context}
 
1
  from langchain.document_loaders import ConfluenceLoader
2
+ from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
4
  from langchain.llms.huggingface_pipeline import HuggingFacePipeline
5
  from langchain.prompts import PromptTemplate
 
14
  def define_model(self) -> None:
15
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
16
  model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
17
+ pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
18
+ self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 1},)
19
 
20
  def store_in_vector_db(self) -> None:
21
  persist_directory = self.config.get("persist_directory",None)
 
23
  username = self.config.get("username",None)
24
  api_key = self.config.get("api_key",None)
25
  space_key = self.config.get("space_key",None)
26
+ include_attachment = self.config.get("include_attachment", True)
27
  loader = ConfluenceLoader(
28
  url=confluence_url, username=username, api_key=api_key
29
  )
30
+ documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
31
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
32
  documents = text_splitter.split_documents(documents)
33
+ print(documents)
34
+ text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002
35
+ texts = text_splitter.split_documents(texts)
36
  self.db = Chroma.from_documents(documents, self.embeddings)
37
 
38
+
39
  def retrieve_qa_chain(self) -> None:
40
  template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.
41
  {context}