gkrthk committed · Commit 4583ba5
1 Parent(s): 72b8502
update embedding

confluence_qa.py  CHANGED  (+14 -8)
@@ -9,15 +9,15 @@ from langchain.vectorstores import Chroma
 
 class ConfluenceQA:
     def init_embeddings(self) -> None:
-        self.embeddings = HuggingFaceEmbeddings(model_name="
+        self.embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
 
     def define_model(self) -> None:
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
-        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
+        # tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
+        # model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
 
-
-
-        pipe = pipeline("
+        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
+        pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
         self.llm = HuggingFacePipeline(pipeline = pipe,model_kwargs={"temperature": 0})
 
     def store_in_vector_db(self) -> None:
@@ -31,14 +31,20 @@ class ConfluenceQA:
             url=confluence_url, username=username, api_key=api_key
         )
         documents = loader.load(include_attachments=include_attachment, limit=100, space_key=space_key)
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
         documents = text_splitter.split_documents(documents)
+        # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+        # documents = text_splitter.split_documents(documents)
+        # text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10)
+        # documents = text_splitter.split_documents(documents)
+        # print(documents)
+        # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002
+        # documents = text_splitter.split_documents(documents)
         self.db = Chroma.from_documents(documents, self.embeddings)
         question = "How do I make a space public?"
         searchDocs = self.db.similarity_search(question)
         print(searchDocs[0].page_content)
 
-
     def retrieve_qa_chain(self) -> None:
         template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
         {context}
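For readers who want to try the new model wiring outside the Space, here is a minimal sketch of what the first hunk sets up: multi-qa-MiniLM-L6-cos-v1 for embeddings and google/flan-t5-large served through a text2text-generation pipeline. The imports assume the pre-split langchain package layout already used in this file (langchain.embeddings, langchain.llms, langchain.vectorstores); max_new_tokens and the sample prompt are illustrative additions, not part of the commit.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline

# Sentence-transformers model the commit switches the embeddings to
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

# flan-t5 is a seq2seq model, so it runs under the "text2text-generation" task
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,  # assumption: the commit does not set a generation length
)

# Wrap the pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temperature": 0})
print(llm("Answer briefly: what is a Confluence space?"))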
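And a similarly hedged sketch of the retrieval side the second hunk configures: 1500-character chunks with 150 characters of overlap, indexed into Chroma with the same embeddings and queried with similarity_search. The inline Document is only a stand-in for the ConfluenceLoader output; the chunk sizes and the sanity-check question mirror the values in the diff.

from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

# Stand-in document; in the Space these come from ConfluenceLoader.load(...)
documents = [
    Document(
        page_content="To make a space public, open Space Settings and edit the permissions ...",
        metadata={"source": "confluence"},
    )
]

# Chunk sizes as set in the commit: 1500 characters with 150 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
chunks = text_splitter.split_documents(documents)

# Build the vector store and run the same sanity-check query as the code
db = Chroma.from_documents(chunks, embeddings)
search_docs = db.similarity_search("How do I make a space public?")
print(search_docs[0].page_content)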