Spaces:

gkrthk
/

confluence_qa

Runtime error

App Files Community

gkrthk committed on Nov 15, 2023

Commit

4583ba5

•

1 Parent(s): 72b8502

update embedding

Browse files

Files changed (1) hide show

confluence_qa.py +14 -8

confluence_qa.py CHANGED Viewed

@@ -9,15 +9,15 @@ from langchain.vectorstores import Chroma
 class ConfluenceQA:
     def init_embeddings(self) -> None:
-        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
     def define_model(self) -> None:
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
-        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
-        # tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
-        # model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
-        pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
         self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 0})
     def store_in_vector_db(self) -> None:
@@ -31,14 +31,20 @@ class ConfluenceQA:
             url=confluence_url, username=username, api_key=api_key
         )
         documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
         documents = text_splitter.split_documents(documents)
         self.db = Chroma.from_documents(documents, self.embeddings)
         question = "How do I make a space public?"
         searchDocs = self.db.similarity_search(question)
         print(searchDocs[0].page_content)
     def retrieve_qa_chain(self) -> None:
         template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
         {context}

 class ConfluenceQA:
     def init_embeddings(self) -> None:
+        self.embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
     def define_model(self) -> None:
+        # tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
+        # model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
+        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
+        pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
         self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 0})
     def store_in_vector_db(self) -> None:
             url=confluence_url, username=username, api_key=api_key
         )
         documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
         documents = text_splitter.split_documents(documents)
+        # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+        # documents = text_splitter.split_documents(documents)
+        # text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10)
+        # documents = text_splitter.split_documents(documents)
+        # print(documents)
+        # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base")  # This is the encoding for text-embedding-ada-002
+        # documents = text_splitter.split_documents(documents)
         self.db = Chroma.from_documents(documents, self.embeddings)
         question = "How do I make a space public?"
         searchDocs = self.db.similarity_search(question)
         print(searchDocs[0].page_content)
     def retrieve_qa_chain(self) -> None:
         template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
         {context}