anything-question-answering

Runtime error

App Files Files Community

LOUIS SANNA committed on Oct 28, 2023

Commit

fe19632

•

1 Parent(s): 528bf3d

feat(data): add url

Browse files

Files changed (9) hide show

anyqa/build_index.py +18 -28
anyqa/config.py +34 -3
anyqa/source_table.py +5 -17
app.py +10 -13
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/data_level0.bin +0 -0
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/header.bin +0 -0
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/length.bin +0 -0
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/link_lists.bin +0 -0
chroma_db/chroma.sqlite3 +1 -1

anyqa/build_index.py CHANGED Viewed

@@ -5,6 +5,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.document_loaders import PyPDFLoader
 from .embeddings import EMBEDDING_MODEL_NAME
 from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
@@ -12,7 +13,7 @@ from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
 def load_data():
     print("Loading data...")
     docs = parse_data()
-    print("Loaded documents")
     embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
     print("Building index...")
     vectorstore = get_vectorstore(embedding_function)
@@ -27,37 +28,26 @@ def load_data():
 def parse_data():
     docs = []
-    for root, dirs, files in os.walk("data"):
-        for file in files:
-            if file.endswith(".pdf"):
-                file_path = os.path.join(root, file)
-                loader = PyPDFLoader(file_path)
-                pages = loader.load_and_split()
-                # split it into chunks
-                text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=1000, chunk_overlap=0
-                )
-                doc_chunks = text_splitter.split_documents(pages)
-                for chunk in doc_chunks:
-                    chunk.metadata["name"] = parse_name(chunk.metadata["source"])
-                    chunk.metadata["domain"] = parse_domain(chunk.metadata["source"])
-                    chunk.metadata["page_number"] = chunk.metadata["page"]
-                    chunk.metadata["short_name"] = chunk.metadata["name"]
-                    docs.append(chunk)
     return docs
-def parse_name(source: str) -> str:
-    return source.split("/")[-1].split(".")[0].replace("_", " ")
-def parse_domain(source: str) -> str:
-    return source.split("/")[1]
 def clear_index():
     for filename in os.listdir("../chroma_db"):
         file_path = os.path.join("../chroma_db", filename)

 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.document_loaders import PyPDFLoader
+from .config import get_sources
 from .embeddings import EMBEDDING_MODEL_NAME
 from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
 def load_data():
     print("Loading data...")
     docs = parse_data()
+    print("Documents loaded")
     embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
     print("Building index...")
     vectorstore = get_vectorstore(embedding_function)
 def parse_data():
     docs = []
+    for source in get_sources():
+        file_path = source["file_path"]
+        loader = PyPDFLoader(file_path)
+        pages = loader.load_and_split()
+        # split it into chunks
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        doc_chunks = text_splitter.split_documents(pages)
+        for chunk in doc_chunks:
+            chunk.metadata["name"] = source["name"]
+            chunk.metadata["domain"] = source["domain"]
+            chunk.metadata["url"] = source.get("url", None)
+            chunk.metadata["page_number"] = chunk.metadata["page"]
+            chunk.metadata["short_name"] = chunk.metadata["name"]
+            docs.append(chunk)
     return docs
 def clear_index():
     for filename in os.listdir("../chroma_db"):
         file_path = os.path.join("../chroma_db", filename)

anyqa/config.py CHANGED Viewed

@@ -1,10 +1,41 @@
 import os
 def get_domains():
     domains = []
     for root, dirs, files in os.walk("data"):
         for dir in dirs:
             domains.append(dir)
-    return domains

 import os
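+# Optional extra metadata (for instance a source URL) merged into each entry returned by
+# get_sources(); keys must exactly match the file path built there with os.path.join(root, file).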
+metadata_by_file_path = {
+    "data/Daoism/Tao_Te_Ching.pdf": { "url": "https://www.with.org/tao_te_ching_en.pdf" },
+    "data/Confucianism/Analects of Confucius.pdf": { "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf" },
+}
 def get_domains():
     domains = []
     for root, dirs, files in os.walk("data"):
         for dir in dirs:
             domains.append(dir)
+    return domains
+def get_sources():
+    res = []
+    for root, dirs, files in os.walk("data"):
+        for file in files:
+            if file.endswith(".pdf"):
+                file_path = os.path.join(root, file)
+                print("file_path", file_path)
+                res.append(
+                    {
+                        "domain": parse_domain(file_path),
+                        "name": parse_name(file_path),
+                        "file_path": file_path,
+                        **metadata_by_file_path.get(file_path, {})
+                    }
+                )
+    return res
+def parse_name(source: str) -> str:
+    return source.split("/")[-1].split(".")[0].replace("_", " ")
+def parse_domain(source: str) -> str:
+    return source.split("/")[1].replace("_", " ")

anyqa/source_table.py CHANGED Viewed

@@ -1,16 +1,4 @@
-config_list = [
-    {
-        "type": "Doism",
-        "source": "Tao Te Ching",
-        "URL": "https://www.with.org/tao_te_ching_en.pdf",
-    },
-    {
-        "type": "Confucianism",
-        "source": "The Analects of Confucius",
-        "URL": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
-    },
-    # Add more dictionaries as needed
-]
 def generate_source_table():
@@ -27,11 +15,11 @@ def generate_source_table():
     rows = []
     # Add each row to the list
-    for config in config_list:
         row = [
-            config.get("type", ""),
-            config.get("source", ""),
-            config.get("URL", ""),
         ]
         row_str = " | ".join(row)

+from anyqa.config import get_sources
 def generate_source_table():
     rows = []
     # Add each row to the list
+    for source in get_sources():
         row = [
+            source.get("domain", ""),
+            source.get("name", ""),
+            source.get("url", ""),
         ]
         row_str = " | ".join(row)

app.py CHANGED Viewed

@@ -345,18 +345,17 @@ with gr.Blocks(title="❓ Q&A", css="style.css", theme=theme) as demo:
                 with gr.Tabs() as tabs:
                     with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
                         examples_hidden = gr.Textbox(elem_id="hidden-message")
                         questions = [
-                            "How does doaism view our dependence on modern technology?",
-                            "From a doaism perspective, should we embrace or challenge the rise of AI?",
-                            "How might doaism influence sustainable economic practices?",
-                            "Does doaism support the idea of a minimalistic economy over consumerism?",
-                            "How does doaism interpret the dynamics of modern relationships?",
-                            "From a doaism viewpoint, how should society handle conflicts and disagreements?",
-                            "How might doaism guide our approach to mental and physical health?",
-                            "Does doaism offer insights into balancing work-life pressures in the modern age?",
-                            "How does doaism view the purpose and methods of modern education?",
-                            "From a doaism perspective, should learning be more experiential than theoretical?",
                         ]
                         examples_questions = gr.Examples(
@@ -564,5 +563,3 @@ Or around 2 to 4 times more than a typical Google search.
     demo.queue(concurrency_count=16)
 demo.launch()

                 with gr.Tabs() as tabs:
                     with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
                         examples_hidden = gr.Textbox(elem_id="hidden-message")
                         questions = [
+                            "How does Daoism view our dependence on modern technology?",
+                            "From a Confucian perspective, what is the role of tradition in modern society?",
+                            "How might Daoism influence sustainable economic practices?",
+                            "Does Confucianism advocate for a particular economic model?",
+                            "How does Daoism interpret the dynamics of modern relationships?",
+                            "From a Confucian viewpoint, what are the responsibilities of individuals in a family?",
+                            "How might Daoism guide our approach to mental and physical health?",
+                            "Does Confucianism offer insights into educational methods?",
+                            "How does Daoism view the purpose and methods of modern education?",
+                            "From a Confucian perspective, what is the importance of social harmony?",
                         ]
                         examples_questions = gr.Examples(
     demo.queue(concurrency_count=16)
 demo.launch()

chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/data_level0.bin RENAMED Viewed

File without changes

chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/header.bin RENAMED Viewed

File without changes

chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/length.bin RENAMED Viewed

File without changes

chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/link_lists.bin RENAMED Viewed

File without changes

chroma_db/chroma.sqlite3 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d627997dd35604ac27e67f35911999f234285c39362fffecddd50621d9f01d77
 size 4067328

 version https://git-lfs.github.com/spec/v1
+oid sha256:70a6e9fbcc7cec17b315d076cca1f60f6bce9f8d02a12e3f0a5786b4d1565f86
 size 4067328