LOUIS SANNA committed on
Commit
fe19632
1 Parent(s): 528bf3d

feat(data): add url

Browse files
anyqa/build_index.py CHANGED
@@ -5,6 +5,7 @@ from langchain.vectorstores import Chroma
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.document_loaders import PyPDFLoader
7
 
 
8
  from .embeddings import EMBEDDING_MODEL_NAME
9
  from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
10
 
@@ -12,7 +13,7 @@ from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
12
  def load_data():
13
  print("Loading data...")
14
  docs = parse_data()
15
- print("Loaded documents")
16
  embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
17
  print("Building index...")
18
  vectorstore = get_vectorstore(embedding_function)
@@ -27,37 +28,26 @@ def load_data():
27
 
28
  def parse_data():
29
  docs = []
30
- for root, dirs, files in os.walk("data"):
31
- for file in files:
32
- if file.endswith(".pdf"):
33
- file_path = os.path.join(root, file)
34
- loader = PyPDFLoader(file_path)
35
- pages = loader.load_and_split()
36
-
37
- # split it into chunks
38
- text_splitter = RecursiveCharacterTextSplitter(
39
- chunk_size=1000, chunk_overlap=0
40
- )
41
- doc_chunks = text_splitter.split_documents(pages)
42
-
43
- for chunk in doc_chunks:
44
- chunk.metadata["name"] = parse_name(chunk.metadata["source"])
45
- chunk.metadata["domain"] = parse_domain(chunk.metadata["source"])
46
- chunk.metadata["page_number"] = chunk.metadata["page"]
47
- chunk.metadata["short_name"] = chunk.metadata["name"]
48
- docs.append(chunk)
49
 
50
  return docs
51
 
52
 
53
- def parse_name(source: str) -> str:
54
- return source.split("/")[-1].split(".")[0].replace("_", " ")
55
-
56
-
57
- def parse_domain(source: str) -> str:
58
- return source.split("/")[1]
59
-
60
-
61
  def clear_index():
62
  for filename in os.listdir("../chroma_db"):
63
  file_path = os.path.join("../chroma_db", filename)
 
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.document_loaders import PyPDFLoader
7
 
8
+ from .config import get_sources
9
  from .embeddings import EMBEDDING_MODEL_NAME
10
  from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
11
 
 
13
  def load_data():
14
  print("Loading data...")
15
  docs = parse_data()
16
+ print("Documents loaded")
17
  embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
18
  print("Building index...")
19
  vectorstore = get_vectorstore(embedding_function)
 
28
 
29
def parse_data():
    """Build the list of document chunks to index.

    Loads every PDF described by ``get_sources()``, splits it into
    1000-character chunks, and copies the source metadata (name, domain,
    optional url, page number) onto each chunk before returning them all.
    """
    # A single splitter instance suffices — it holds no per-document state.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    docs = []
    for source in get_sources():
        pages = PyPDFLoader(source["file_path"]).load_and_split()

        for chunk in splitter.split_documents(pages):
            chunk.metadata.update(
                name=source["name"],
                domain=source["domain"],
                url=source.get("url"),
                page_number=chunk.metadata["page"],
                short_name=source["name"],
            )
            docs.append(chunk)

    return docs
49
 
50
 
 
 
 
 
 
 
 
 
51
  def clear_index():
52
  for filename in os.listdir("../chroma_db"):
53
  file_path = os.path.join("../chroma_db", filename)
anyqa/config.py CHANGED
@@ -1,10 +1,41 @@
1
-
2
-
3
  import os
4
 
 
 
 
 
 
 
5
  def get_domains():
6
  domains = []
7
  for root, dirs, files in os.walk("data"):
8
  for dir in dirs:
9
  domains.append(dir)
10
- return domains
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
# can be used to add metadata to the index, for instance URL
# Maps a source PDF's relative path (as built by os.path.join while walking
# ``data/``) to extra metadata merged into each indexed chunk's metadata.
# NOTE(review): keys use forward slashes, so the lookup in get_sources()
# presumably misses on Windows where os.path.join uses backslashes — confirm.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": { "url": "https://www.with.org/tao_te_ching_en.pdf" },
    "data/Confucianism/Analects of Confucius.pdf": { "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf" },
}
8
+
9
def get_domains():
    """Return the names of all directories under ``data`` (at any depth).

    Note: os.walk yields nested subdirectories too, so sub-folders of a
    domain folder are also included, matching the original behavior.
    """
    domains = []
    for root, dirs, _files in os.walk("data"):
        # extend instead of an inner append loop; also avoids shadowing
        # the builtin name `dir` as the previous version did.
        domains.extend(dirs)
    return domains
15
+
16
+
17
def get_sources():
    """Walk the ``data`` directory and describe every PDF found.

    Returns a list of dicts, one per PDF, each with ``domain``, ``name`` and
    ``file_path`` keys, merged with any extra metadata (e.g. ``url``)
    registered for that path in ``metadata_by_file_path``.
    """
    res = []
    for root, _dirs, files in os.walk("data"):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                res.append(
                    {
                        "domain": parse_domain(file_path),
                        "name": parse_name(file_path),
                        "file_path": file_path,
                        # registered extras (URL, ...) override nothing above;
                        # they only add keys such as "url".
                        **metadata_by_file_path.get(file_path, {}),
                    }
                )
    # removed the leftover debug print of each file_path
    return res
34
+
35
+
36
def parse_name(source: str) -> str:
    """Derive a display name from a file path.

    Takes the basename, strips only the final extension, and replaces
    underscores with spaces. The previous ``split(".")[0]`` truncated at the
    FIRST dot, mangling names like ``Report.v2.pdf`` down to ``Report``;
    ``rsplit(".", 1)`` removes just the extension.
    """
    basename = source.split("/")[-1]
    stem = basename.rsplit(".", 1)[0]
    return stem.replace("_", " ")
38
+
39
+
40
def parse_domain(source: str) -> str:
    """Return the domain for a data file path.

    The domain is the second ``/``-separated path component (e.g. the
    ``Daoism`` in ``data/Daoism/x.pdf``), with underscores shown as spaces.
    """
    domain = source.split("/", 2)[1]
    return domain.replace("_", " ")
anyqa/source_table.py CHANGED
@@ -1,16 +1,4 @@
1
- config_list = [
2
- {
3
- "type": "Doism",
4
- "source": "Tao Te Ching",
5
- "URL": "https://www.with.org/tao_te_ching_en.pdf",
6
- },
7
- {
8
- "type": "Confucianism",
9
- "source": "The Analects of Confucius",
10
- "URL": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
11
- },
12
- # Add more dictionaries as needed
13
- ]
14
 
15
 
16
  def generate_source_table():
@@ -27,11 +15,11 @@ def generate_source_table():
27
  rows = []
28
 
29
  # Add each row to the list
30
- for config in config_list:
31
  row = [
32
- config.get("type", ""),
33
- config.get("source", ""),
34
- config.get("URL", ""),
35
  ]
36
 
37
  row_str = " | ".join(row)
 
1
+ from anyqa.config import get_sources
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  def generate_source_table():
 
15
  rows = []
16
 
17
  # Add each row to the list
18
+ for source in get_sources():
19
  row = [
20
+ source.get("domain", ""),
21
+ source.get("name", ""),
22
+ source.get("url", ""),
23
  ]
24
 
25
  row_str = " | ".join(row)
app.py CHANGED
@@ -345,18 +345,17 @@ with gr.Blocks(title="❓ Q&A", css="style.css", theme=theme) as demo:
345
  with gr.Tabs() as tabs:
346
  with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
347
  examples_hidden = gr.Textbox(elem_id="hidden-message")
348
-
349
  questions = [
350
- "How does doaism view our dependence on modern technology?",
351
- "From a doaism perspective, should we embrace or challenge the rise of AI?",
352
- "How might doaism influence sustainable economic practices?",
353
- "Does doaism support the idea of a minimalistic economy over consumerism?",
354
- "How does doaism interpret the dynamics of modern relationships?",
355
- "From a doaism viewpoint, how should society handle conflicts and disagreements?",
356
- "How might doaism guide our approach to mental and physical health?",
357
- "Does doaism offer insights into balancing work-life pressures in the modern age?",
358
- "How does doaism view the purpose and methods of modern education?",
359
- "From a doaism perspective, should learning be more experiential than theoretical?",
360
  ]
361
 
362
  examples_questions = gr.Examples(
@@ -564,5 +563,3 @@ Or around 2 to 4 times more than a typical Google search.
564
  demo.queue(concurrency_count=16)
565
 
566
  demo.launch()
567
-
568
-
 
345
  with gr.Tabs() as tabs:
346
  with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
347
  examples_hidden = gr.Textbox(elem_id="hidden-message")
 
348
  questions = [
349
+ "How does Daoism view our dependence on modern technology?",
350
+ "From a Confucian perspective, what is the role of tradition in modern society?",
351
+ "How might Daoism influence sustainable economic practices?",
352
+ "Does Confucianism advocate for a particular economic model?",
353
+ "How does Daoism interpret the dynamics of modern relationships?",
354
+ "From a Confucian viewpoint, what are the responsibilities of individuals in a family?",
355
+ "How might Daoism guide our approach to mental and physical health?",
356
+ "Does Confucianism offer insights into educational methods?",
357
+ "How does Daoism view the purpose and methods of modern education?",
358
+ "From a Confucian perspective, what is the importance of social harmony?",
359
  ]
360
 
361
  examples_questions = gr.Examples(
 
563
  demo.queue(concurrency_count=16)
564
 
565
  demo.launch()
 
 
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/data_level0.bin RENAMED
File without changes
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/header.bin RENAMED
File without changes
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/length.bin RENAMED
File without changes
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/link_lists.bin RENAMED
File without changes
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d627997dd35604ac27e67f35911999f234285c39362fffecddd50621d9f01d77
3
  size 4067328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70a6e9fbcc7cec17b315d076cca1f60f6bce9f8d02a12e3f0a5786b4d1565f86
3
  size 4067328