gkrthk committed on
Commit
fb5c9b9
1 Parent(s): 4583ba5
Files changed (1)
  1. confluence_qa.py +5 -12
confluence_qa.py CHANGED
@@ -12,12 +12,12 @@ class ConfluenceQA:
         self.embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
 
     def define_model(self) -> None:
-        # tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
-        # model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
+        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
 
-        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
-        model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
-        pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+        # tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+        # model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
+        pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
         self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 0})
 
     def store_in_vector_db(self) -> None:
@@ -33,13 +33,6 @@ class ConfluenceQA:
         documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
         documents = text_splitter.split_documents(documents)
-        # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
-        # documents = text_splitter.split_documents(documents)
-        # text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10)
-        # documents = text_splitter.split_documents(documents)
-        # print(documents)
-        # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002
-        # documents = text_splitter.split_documents(documents)
         self.db = Chroma.from_documents(documents, self.embeddings)
         question = "How do I make a space public?"
         searchDocs = self.db.similarity_search(question)
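The define_model change swaps the generative flan-t5-large text2text pipeline for DistilBERT fine-tuned on SQuAD, i.e. an extractive question-answering pipeline that selects an answer span from a supplied context rather than generating free text. A minimal sketch of how that pipeline is called on its own (the question and context strings are illustrative, not from the repository):

    from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer, pipeline

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
    qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

    # Extractive QA needs both a question and a context passage (illustrative text).
    result = qa(
        question="How do I make a space public?",
        context="Space permissions control who can see a space. A space admin can "
                "change the permission scheme to allow anonymous access.",
    )
    print(result["answer"], result["score"])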
 
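The second hunk keeps only the RecursiveCharacterTextSplitter path (1500-character chunks with 150-character overlap) and deletes the commented-out splitter experiments. A self-contained sketch of that chunk-and-index flow, assuming the classic langchain import paths this project uses and an illustrative Document in place of the ConfluenceLoader output:

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.schema import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.vectorstores import Chroma

    # Stand-in for loader.load(...); the page content here is illustrative only.
    documents = [Document(page_content="To make a space public, a space admin opens the "
                                       "space permissions page and grants anonymous access.",
                          metadata={"source": "confluence"})]

    # Same splitter settings as the committed code.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    chunks = splitter.split_documents(documents)

    # Embed the chunks and index them in Chroma, mirroring store_in_vector_db.
    embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
    db = Chroma.from_documents(chunks, embeddings)

    # The commit's sanity check: a similarity search against the new index.
    for doc in db.similarity_search("How do I make a space public?"):
        print(doc.page_content[:80])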