Commit cc2ce8c by LOUIS SANNA
Parent(s): 6e28a81
feat(loader)

Files changed:
- .DS_Store +0 -0
- .gitattributes +3 -0
- README.md +10 -2
- app.py +14 -14
- chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/data_level0.bin +3 -0
- chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/header.bin +3 -0
- chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/length.bin +3 -0
- chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/link_lists.bin +0 -0
- chroma_db/chroma.sqlite3 +3 -0
- climateqa/build_index.py +54 -0
- climateqa/chains.py +2 -1
- climateqa/chat.py +0 -42
- climateqa/custom_retrieval_chain.py +2 -18
- climateqa/embeddings.py +1 -0
- climateqa/llm.py +39 -6
- climateqa/{logging.py → qa_logging.py} +2 -3
- climateqa/retriever.py +48 -107
- climateqa/vectorstore.py +26 -4
- constitution.pdf +0 -0
- data/daoism/tao-te-ching.pdf +0 -0
- data/us-founding/constitution.pdf +0 -0
- data/us-founding/declaration-of-independance.pdf +0 -0
- declaration-of-independance.pdf +0 -0
- requirements.txt +5 -3
- utils.py +0 -3
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
CHANGED
@@ -44,3 +44,6 @@ documents/climate_gpt_v2_only_giec.faiss filter=lfs diff=lfs merge=lfs -text
 documents/climate_gpt_v2.faiss filter=lfs diff=lfs merge=lfs -text
 climateqa_v3.db filter=lfs diff=lfs merge=lfs -text
 climateqa_v3.faiss filter=lfs diff=lfs merge=lfs -text
+data filter=lfs diff=lfs merge=lfs -text
+chroma_db filter=lfs diff=lfs merge=lfs -text
+chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: AnythingQ&A
 emoji: 🌍
 colorFrom: blue
 colorTo: red
@@ -9,4 +9,12 @@ app_file: app.py
 pinned: false
 ---
 
-#
+# Anything Q&A
+
+A clone of the amazing https://huggingface.co/spaces/Ekimetrics/climate-question-answering.
+
+## Build vector index
+
+```bash
+python -m climateqa.build_index
+```
app.py
CHANGED
@@ -2,18 +2,18 @@ import gradio as gr
 
 from utils import create_user_id
 
-
 # Langchain
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 
 # ClimateQ&A imports
+from climateqa.embeddings import EMBEDDING_MODEL_NAME
 from climateqa.llm import get_llm
-from climateqa.logging import log
+from climateqa.qa_logging import log
 from climateqa.chains import load_qa_chain_with_text
 from climateqa.chains import load_reformulation_chain
-from climateqa.vectorstore import get_pinecone_vectorstore
-from climateqa.retriever import ClimateQARetriever
+from climateqa.vectorstore import get_vectorstore
+from climateqa.retriever import QARetriever
 from climateqa.prompts import audience_prompts
 
 # Load environment variables in local mode
@@ -113,13 +113,10 @@ class StreamingGradioCallbackHandler(BaseCallbackHandler):
 
 
 # Create embeddings function and LLM
-embeddings_function = HuggingFaceEmbeddings(
-    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
-)
-
+embeddings_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
 
 # Create vectorstore and retriever
-vectorstore = get_pinecone_vectorstore(embeddings_function)
+vectorstore = get_vectorstore(embeddings_function)
 
 # ---------------------------------------------------------------------------
 # ClimateQA Streaming
@@ -148,8 +145,8 @@ def fetch_sources(query, sources):
 llm_reformulation = get_llm(
     max_tokens=512, temperature=0.0, verbose=True, streaming=False
 )
-retriever = ClimateQARetriever(
-    vectorstore=vectorstore, sources=…
+retriever = QARetriever(
+    vectorstore=vectorstore, sources=[], k_summary=0, k_total=10
 )
 reformulation_chain = load_reformulation_chain(llm_reformulation)
 
@@ -265,6 +262,11 @@ def answer_bot(query, history, docs, question, language, audience):
 def make_html_source(source, i):
     meta = source.metadata
     content = source.page_content.split(":", 1)[1].strip()
+    link = (
+        f'<a href="{meta["url"]}#page={int(meta["page_number"])}" target="_blank" class="pdf-link"><span role="img" aria-label="Open PDF">🔗</span></a>'
+        if "url" in meta
+        else ""
+    )
     return f"""
     <div class="card">
         <div class="card-content">
@@ -273,9 +275,7 @@ def make_html_source(source, i):
         </div>
         <div class="card-footer">
             <span>{meta['name']}</span>
-            <a …>
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
+            {link}
         </div>
     </div>
    """
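Taken together, the app.py changes swap the hardwired Pinecone setup for the new backend-agnostic helpers. A minimal sketch of the new wiring, using only names that appear in this diff (the query string is a made-up example):

```python
from langchain.embeddings import HuggingFaceEmbeddings

from climateqa.embeddings import EMBEDDING_MODEL_NAME
from climateqa.vectorstore import get_vectorstore
from climateqa.retriever import QARetriever

# The embeddings model name now lives in climateqa/embeddings.py.
embeddings_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# get_vectorstore picks Pinecone or the local Chroma store (see vectorstore.py).
vectorstore = get_vectorstore(embeddings_function)

# sources=[] disables source filtering; k_summary=0 skips the summary pass.
retriever = QARetriever(vectorstore=vectorstore, sources=[], k_summary=0, k_total=10)
docs = retriever.get_relevant_documents("What does the text say about rulers?")
```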
chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
+size 3212000

chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
+size 100

chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
+size 4000

chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/link_lists.bin
ADDED
File without changes

chroma_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db081ece29301d223a01bac97e8b2905fada2e7c376cec96bf44fee0f5c95069
+size 1843200
climateqa/build_index.py
ADDED
@@ -0,0 +1,54 @@
+# import
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.document_loaders import PyPDFLoader
+
+from .embeddings import EMBEDDING_MODEL_NAME
+from .vectorstore import get_vectorstore
+
+
+def load_data():
+    docs = parse_data()
+    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+    vectorstore = get_vectorstore(embedding_function)
+
+    assert isinstance(vectorstore, Chroma)
+    vectorstore.from_documents(
+        docs, embedding_function, persist_directory="./chroma_db"
+    )
+    return vectorstore
+
+
+def parse_data():
+    loader = PyPDFLoader("data/daoism/tao-te-ching.pdf")
+    pages = loader.load_and_split()
+
+    # split it into chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
+    docs = text_splitter.split_documents(pages)
+    print(docs)
+    for doc in docs:
+        doc.metadata["name"] = parse_name(doc.metadata["source"])
+        doc.metadata["domain"] = parse_domain(doc.metadata["source"])
+        doc.metadata["page_number"] = doc.metadata["page"]
+        doc.metadata["short_name"] = doc.metadata["name"]
+    return docs
+
+
+def parse_name(source: str) -> str:
+    return source.split("/")[-1].split(".")[0]
+
+
+def parse_domain(source: str) -> str:
+    return source.split("/")[2]
+
+
+if __name__ == "__main__":
+    db = load_data()
+    # query it
+    query = (
+        "He who can bear the misfortune of a nation is called the ruler of the world."
+    )
+    docs = db.similarity_search(query)
+    print(docs)
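Two details of build_index.py are worth noting. `Chroma.from_documents` is a classmethod that builds and persists a new collection, so calling it on the instance returned by `get_vectorstore` creates a second store and discards the return value; and with the literal path `data/daoism/tao-te-ching.pdf`, `source.split("/")[2]` yields the file name rather than the folder. A corrected sketch of the indexing step under those assumptions:

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

from climateqa.embeddings import EMBEDDING_MODEL_NAME


def build_index(docs):
    # from_documents is a classmethod: it creates, embeds and persists a new
    # collection under persist_directory and returns it.
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    return Chroma.from_documents(
        docs, embedding_function, persist_directory="./chroma_db"
    )


def parse_domain(source: str) -> str:
    # "data/daoism/tao-te-ching.pdf" -> "daoism" (second path segment)
    return source.split("/")[1]
```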
climateqa/chains.py
CHANGED
@@ -3,7 +3,7 @@
 import json
 
 from langchain import PromptTemplate, LLMChain
-from langchain.chains import …
+from langchain.chains import QAWithSourcesChain
 from langchain.chains import TransformChain, SequentialChain
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 
@@ -21,6 +21,7 @@ def load_reformulation_chain(llm):
     # Parse the output
     def parse_output(output):
         query = output["query"]
+        print("output", output)
         json_output = json.loads(output["json"])
         question = json_output.get("question", query)
         language = json_output.get("language", "English")
climateqa/chat.py
DELETED
@@ -1,42 +0,0 @@
-# LANGCHAIN IMPORTS
-from langchain import PromptTemplate, LLMChain
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.chains import RetrievalQAWithSourcesChain
-from langchain.chains.qa_with_sources import load_qa_with_sources_chain
-
-
-# CLIMATEQA
-from climateqa.retriever import ClimateQARetriever
-from climateqa.vectorstore import get_pinecone_vectorstore
-from climateqa.chains import load_climateqa_chain
-
-
-class ClimateQA:
-    def __init__(
-        self,
-        hf_embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
-        show_progress_bar=False,
-        batch_size=1,
-        max_tokens=1024,
-        **kwargs
-    ):
-        self.llm = self.get_llm(max_tokens=max_tokens, **kwargs)
-        self.embeddings_function = HuggingFaceEmbeddings(
-            model_name=hf_embedding_model,
-            encode_kwargs={
-                "show_progress_bar": show_progress_bar,
-                "batch_size": batch_size,
-            },
-        )
-
-    def get_vectorstore(self):
-        pass
-
-    def reformulate(self):
-        pass
-
-    def retrieve(self):
-        pass
-
-    def ask(self):
-        pass
climateqa/custom_retrieval_chain.py
CHANGED
@@ -1,35 +1,19 @@
 from __future__ import annotations
 import inspect
 from typing import Any, Dict
 
-from pydantic import Extra
-
-from langchain.schema.language_model import BaseLanguageModel
 from langchain.callbacks.manager import (
-    AsyncCallbackManagerForChainRun,
     CallbackManagerForChainRun,
 )
-from langchain.chains.base import Chain
-from langchain.prompts.base import BasePromptTemplate
 
 from typing import Any, Dict
 
 from langchain.callbacks.manager import (
-    AsyncCallbackManagerForChainRun,
     CallbackManagerForChainRun,
 )
-from langchain.chains.combine_documents.stuff import StuffDocumentsChain
-from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain
-from langchain.docstore.document import Document
-from langchain.pydantic_v1 import Field
-from langchain.schema import BaseRetriever
-
 from langchain.chains import RetrievalQAWithSourcesChain
 
 
-from langchain.chains.router.llm_router import LLMRouterChain
-
-
 class CustomRetrievalQAWithSourcesChain(RetrievalQAWithSourcesChain):
     fallback_answer: str = "No sources available to answer this question."
 
climateqa/embeddings.py
ADDED
@@ -0,0 +1 @@
+EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
climateqa/llm.py
CHANGED
@@ -1,7 +1,6 @@
-from langchain.chat_models import AzureChatOpenAI
+from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
 import os
 
-# LOAD ENVIRONMENT VARIABLES
 try:
     from dotenv import load_dotenv
 
@@ -11,16 +10,50 @@ except:
 
 
 def get_llm(max_tokens=1024, temperature=0.0, verbose=True, streaming=False, **kwargs):
+    if has_azure_openai_config():
+        return get_azure_llm(
+            max_tokens=max_tokens,
+            temperature=temperature,
+            verbose=verbose,
+            streaming=streaming,
+            **kwargs,
+        )
+    return get_open_ai_llm(
+        max_tokens=max_tokens,
+        temperature=temperature,
+        verbose=verbose,
+        streaming=streaming,
+        **kwargs,
+    )
+
+
+def has_azure_openai_config():
+    """
+    Checks if the necessary environment variables for Azure OpenAI are set.
+    Returns True if they are set, False otherwise.
+    """
+    return all(
+        key in os.environ
+        for key in [
+            "AZURE_OPENAI_API_BASE_URL",
+            "AZURE_OPENAI_API_VERSION",
+            "AZURE_OPENAI_API_DEPLOYMENT_NAME",
+            "AZURE_OPENAI_API_KEY",
+        ]
+    )
+
+
+def get_open_ai_llm(**kwargs):
+    return ChatOpenAI(**kwargs)
+
+
+def get_azure_llm(**kwargs):
     llm = AzureChatOpenAI(
         openai_api_base=os.environ["AZURE_OPENAI_API_BASE_URL"],
         openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
         deployment_name=os.environ["AZURE_OPENAI_API_DEPLOYMENT_NAME"],
         openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
         openai_api_type="azure",
-        max_tokens=max_tokens,
-        temperature=temperature,
-        verbose=verbose,
-        streaming=streaming,
         **kwargs,
     )
     return llm
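With this change, callers keep using get_llm unchanged and the backend is selected purely from the environment. For instance (OPENAI_API_KEY is the variable ChatOpenAI reads by default; the key value here is a placeholder):

```python
import os

from climateqa.llm import get_llm, has_azure_openai_config

# Without all four AZURE_OPENAI_* variables set, get_llm falls back to
# ChatOpenAI, which reads OPENAI_API_KEY from the environment.
os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")
llm = get_llm(max_tokens=256, temperature=0.0)
print(has_azure_openai_config(), type(llm).__name__)  # False ChatOpenAI
```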
climateqa/{logging.py → qa_logging.py}
RENAMED
@@ -2,9 +2,6 @@ import datetime
 import json
 import os
 
-from azure.storage.fileshare import ShareServiceClient
-
-
 def log(question, history, docs, user_id):
     if has_blob_config():
         log_in_azure(question, history, docs, user_id)
@@ -49,6 +46,8 @@ def get_azure_blob_client():
     }
     account_url = os.environ["BLOB_ACCOUNT_URL"]
     file_share_name = "climategpt"
+    # I don't know why this is necessary, but it causes an error otherwise when running build_index.py
+    from azure.storage.fileshare import ShareServiceClient
     service = ShareServiceClient(account_url=account_url, credential=credential)
     share_client = service.get_share_client(file_share_name)
     return share_client
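Both edits in this file address import-time failures: the rename from logging.py presumably avoids the module colliding with the standard-library logging name, and the azure import now happens lazily, only when a blob client is actually requested. A generic sketch of the deferral pattern (argument values are illustrative):

```python
def get_share_client(account_url: str, credential):
    # Deferred import: the optional dependency is only loaded when this code
    # path runs, so importing the module itself (e.g. from build_index.py)
    # never fails because of it.
    from azure.storage.fileshare import ShareServiceClient

    service = ShareServiceClient(account_url=account_url, credential=credential)
    return service.get_share_client("climategpt")
```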
climateqa/retriever.py
CHANGED
@@ -1,56 +1,65 @@
 # https://github.com/langchain-ai/langchain/issues/8623
 
-import pandas as pd
 
 from langchain.schema.retriever import BaseRetriever, Document
-from langchain.vectorstores.base import VectorStoreRetriever
 from langchain.vectorstores import VectorStore
-from langchain.…
+from langchain.vectorstores import Chroma
 from typing import List
-from pydantic import Field
 
 
-class ClimateQARetriever(BaseRetriever):
+## The idea is that some documents are summaries, so they are easier to exploit
+SUMMARY_TYPES = []
+
+
+class QARetriever(BaseRetriever):
     vectorstore: VectorStore
-    sources: list = […
+    sources: list = []
     threshold: float = 22
-    k_summary: int = …
+    k_summary: int = 0
     k_total: int = 10
     namespace: str = "vectors"
 
     def get_relevant_documents(self, query: str) -> List[Document]:
         # Check if all elements in the list are either IPCC or IPBES
         assert isinstance(self.sources, list)
-        assert all([x in ["IPCC", "IPBES"] for x in self.sources])
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
 
+        query = "He who can bear the misfortune of a nation is called the ruler of the world."
         # Prepare base search kwargs
-        filters = {
-            …
+        filters = {}
+        if len(self.sources):
+            filters["source"] = {"$in": self.sources}
+
+        if self.k_summary > 0:
+            # Search for k_summary documents in the summaries dataset
+            if len(SUMMARY_TYPES):
+                filters_summaries = {
+                    **filters_summaries,
+                    "report_type": {"$in": SUMMARY_TYPES},
+                }
+            docs_summaries = self.vectorstore.similarity_search_with_score(
+                query=query,
+                # namespace=self.namespace,
+                filter=self.format_filter(filters_summaries),
+                k=self.k_summary,
+            )
+            docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+        else:
+            docs_summaries = []
 
         # Search for k_total - k_summary documents in the full reports dataset
-        filters_full = {
-            …
-            "report_type": {"$nin": …
-        }
+        filters_full = {}
+        if len(SUMMARY_TYPES):
+            filters_full = {**filters_full, "report_type": {"$nin": SUMMARY_TYPES}}
+
         k_full = self.k_total - len(docs_summaries)
         docs_full = self.vectorstore.similarity_search_with_score(
-            query=query,
-            …
+            query=query,
+            # namespace=self.namespace,
+            filter=self.format_filter(filters_full),
+            k=k_full,
         )
+        print("docs_full", docs_full)
 
         # Concatenate documents
         docs = docs_summaries + docs_full
@@ -71,81 +80,13 @@ class ClimateQARetriever(BaseRetriever):
 
         return results
 
-
-# …
-
-# # pass
-
-# # Separate summaries and full reports
-# df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
-# df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
-
-# # Find passages from summaries dataset
-# passages_summaries = df_summaries.head(k_summary)
-
-# # Find passages from full reports dataset
-# passages_fullreports = df_full.head(k_total - len(passages_summaries))
-
-# # Concatenate passages
-# passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
-# return passages
-
-
-# def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
-#     assert max_k > k_total
-
-#     validated_sources = ["IPCC","IPBES"]
-#     sources = [x for x in sources if x in validated_sources]
-#     filters = {
-#         "source": { "$in": sources },
-#     }
-#     print(filters)
-
-#     # Retrieve documents
-#     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
-
-#     # Filter by score
-#     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
-
-#     if len(docs) == 0:
-#         return []
-#     res = pd.DataFrame(docs)
-#     passages_df = filter_summaries(res,k_summary,k_total)
-#     if as_dict:
-#         contents = passages_df["content"].tolist()
-#         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
-#         passages = []
-#         for i in range(len(contents)):
-#             passages.append({"content":contents[i],"meta":meta[i]})
-#         return passages
-#     else:
-#         return passages_df
-
-
-# def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
-
-#     print("hellooooo")
-
-#     # Reformulate queries
-#     reformulated_query,language = reformulate(query)
-
-#     print(reformulated_query)
-
-#     # Retrieve documents
-#     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
-#     response = {
-#         "query":query,
-#         "reformulated_query":reformulated_query,
-#         "language":language,
-#         "sources":passages,
-#         "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
-#     }
-#     return response
+    def format_filter(self, filters):
+        # https://docs.trychroma.com/usage-guide#using-logical-operators
+        if isinstance(self.vectorstore, Chroma):
+            if len(filters) <= 1:
+                return filters
+            and_filters = []
+            for field, condition in filters.items():
+                and_filters.append({field: condition})
+            return {"$and": and_filters}
+        return filters
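Two debug leftovers in the new retriever are worth flagging: the hardcoded `query = "He who can bear..."` line overwrites the caller's query, and the summary branch spreads `**filters_summaries` before that name is bound (harmless today only because `SUMMARY_TYPES` is empty and `k_summary` defaults to 0; it presumably should spread `**filters`). The new `format_filter` helper exists because Chroma rejects a `where` filter with more than one top-level key; conditions have to be combined with an explicit `$and`. A small illustration of the transformation it performs:

```python
# With two metadata conditions, Chroma needs an explicit $and wrapper
# (https://docs.trychroma.com/usage-guide#using-logical-operators).
filters = {"source": {"$in": ["IPCC"]}, "report_type": {"$nin": ["SPM", "TS"]}}
chroma_where = {"$and": [{field: cond} for field, cond in filters.items()]}
# {'$and': [{'source': {'$in': ['IPCC']}}, {'report_type': {'$nin': ['SPM', 'TS']}}]}

# A single condition passes through unchanged:
single = {"source": {"$in": ["IPCC"]}}  # already valid as-is
```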
climateqa/vectorstore.py
CHANGED
@@ -3,9 +3,8 @@
 # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
 import os
 import pinecone
-from langchain.vectorstores import Pinecone
+from langchain.vectorstores import Chroma, Pinecone
 
-# LOAD ENVIRONMENT VARIABLES
 try:
     from dotenv import load_dotenv
 
@@ -14,7 +13,30 @@ except:
     pass
 
 
-def get_pinecone_vectorstore(embeddings, text_key="content"):
+def get_vectorstore(embeddings_function):
+    if has_pinecone_config():
+        return get_pinecone_vectorstore(embeddings_function)
+    return get_chroma_vectore_store(embeddings_function)
+
+
+def get_chroma_vectore_store(embedding_function):
+    return Chroma(
+        persist_directory="./chroma_db", embedding_function=embedding_function
+    )
+
+
+def has_pinecone_config():
+    return all(
+        key in os.environ
+        for key in [
+            "PINECONE_API_KEY",
+            "PINECONE_API_ENVIRONMENT",
+            "PINECONE_API_INDEX",
+        ]
+    )
+
+
+def get_pinecone_vectorstore(embeddings_function, text_key="content"):
     # initialize pinecone
     pinecone.init(
         api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
@@ -23,6 +45,6 @@ def get_pinecone_vectorstore(embeddings, text_key="content"):
 
     index_name = os.getenv("PINECONE_API_INDEX")
     vectorstore = Pinecone.from_existing_index(
-        index_name,
+        index_name, embeddings_function, text_key=text_key
     )
     return vectorstore
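With this fallback, local development needs no Pinecone account: when the three PINECONE_* variables are absent, get_vectorstore opens the Chroma collection that build_index persisted under ./chroma_db. A quick smoke test under that assumption:

```python
from langchain.embeddings import HuggingFaceEmbeddings

from climateqa.embeddings import EMBEDDING_MODEL_NAME
from climateqa.vectorstore import get_vectorstore

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
store = get_vectorstore(embeddings)  # Chroma unless PINECONE_* vars are set
print(store.similarity_search("ruler of the world", k=2))
```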
constitution.pdf
ADDED
Binary file (414 kB)

data/daoism/tao-te-ching.pdf
ADDED
Binary file (174 kB)

data/us-founding/constitution.pdf
ADDED
Binary file (414 kB)

data/us-founding/declaration-of-independance.pdf
ADDED
Binary file (742 kB)

declaration-of-independance.pdf
ADDED
Binary file (742 kB)
requirements.txt
CHANGED
@@ -1,7 +1,9 @@
-gradio==3.47.1
-openai==0.27.0
 azure-storage-file-share==12.11.1
-…
+chromadb==0.4.14
+gradio==3.47.1
 langchain==0.0.295
+openai==0.27.0
 pinecone-client==2.2.1
+pypdf==3.16.4
+python-dotenv==1.0.0
 sentence-transformers==2.2.2
utils.py
CHANGED
@@ -1,6 +1,3 @@
-import numpy as np
-import random
-import string
 import uuid
 
 