Spaces:
Runtime error
Runtime error
LOUIS SANNA
committed on
Commit
•
fe19632
1
Parent(s):
528bf3d
feat(data): add url
Browse files
- anyqa/build_index.py +18 -28
- anyqa/config.py +34 -3
- anyqa/source_table.py +5 -17
- app.py +10 -13
- chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/data_level0.bin +0 -0
- chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/header.bin +0 -0
- chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/length.bin +0 -0
- chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/link_lists.bin +0 -0
- chroma_db/chroma.sqlite3 +1 -1
anyqa/build_index.py
CHANGED
@@ -5,6 +5,7 @@ from langchain.vectorstores import Chroma
|
|
5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
6 |
from langchain.document_loaders import PyPDFLoader
|
7 |
|
|
|
8 |
from .embeddings import EMBEDDING_MODEL_NAME
|
9 |
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
|
10 |
|
@@ -12,7 +13,7 @@ from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
|
|
12 |
def load_data():
|
13 |
print("Loading data...")
|
14 |
docs = parse_data()
|
15 |
-
print("
|
16 |
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
|
17 |
print("Building index...")
|
18 |
vectorstore = get_vectorstore(embedding_function)
|
@@ -27,37 +28,26 @@ def load_data():
|
|
27 |
|
28 |
def parse_data():
|
29 |
docs = []
|
30 |
-
for
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
chunk.metadata["page_number"] = chunk.metadata["page"]
|
47 |
-
chunk.metadata["short_name"] = chunk.metadata["name"]
|
48 |
-
docs.append(chunk)
|
49 |
|
50 |
return docs
|
51 |
|
52 |
|
53 |
-
def parse_name(source: str) -> str:
|
54 |
-
return source.split("/")[-1].split(".")[0].replace("_", " ")
|
55 |
-
|
56 |
-
|
57 |
-
def parse_domain(source: str) -> str:
|
58 |
-
return source.split("/")[1]
|
59 |
-
|
60 |
-
|
61 |
def clear_index():
|
62 |
for filename in os.listdir("../chroma_db"):
|
63 |
file_path = os.path.join("../chroma_db", filename)
|
|
|
5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
6 |
from langchain.document_loaders import PyPDFLoader
|
7 |
|
8 |
+
from .config import get_sources
|
9 |
from .embeddings import EMBEDDING_MODEL_NAME
|
10 |
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
|
11 |
|
|
|
13 |
def load_data():
|
14 |
print("Loading data...")
|
15 |
docs = parse_data()
|
16 |
+
print("Documents loaded")
|
17 |
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
|
18 |
print("Building index...")
|
19 |
vectorstore = get_vectorstore(embedding_function)
|
|
|
28 |
|
29 |
def parse_data():
|
30 |
docs = []
|
31 |
+
for source in get_sources():
|
32 |
+
file_path = source["file_path"]
|
33 |
+
loader = PyPDFLoader(file_path)
|
34 |
+
pages = loader.load_and_split()
|
35 |
+
|
36 |
+
# split it into chunks
|
37 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
38 |
+
doc_chunks = text_splitter.split_documents(pages)
|
39 |
+
|
40 |
+
for chunk in doc_chunks:
|
41 |
+
chunk.metadata["name"] = source["name"]
|
42 |
+
chunk.metadata["domain"] = source["domain"]
|
43 |
+
chunk.metadata["url"] = source.get("url", None)
|
44 |
+
chunk.metadata["page_number"] = chunk.metadata["page"]
|
45 |
+
chunk.metadata["short_name"] = chunk.metadata["name"]
|
46 |
+
docs.append(chunk)
|
|
|
|
|
|
|
47 |
|
48 |
return docs
|
49 |
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def clear_index():
|
52 |
for filename in os.listdir("../chroma_db"):
|
53 |
file_path = os.path.join("../chroma_db", filename)
|
anyqa/config.py
CHANGED
@@ -1,10 +1,41 @@
|
|
1 |
-
|
2 |
-
|
3 |
import os
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
def get_domains():
|
6 |
domains = []
|
7 |
for root, dirs, files in os.walk("data"):
|
8 |
for dir in dirs:
|
9 |
domains.append(dir)
|
10 |
-
return domains
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
|
3 |
+
# can be used to add metadata to the index, for instance URL
|
4 |
+
metadata_by_file_path = {
|
5 |
+
"data/Daoism/Tao_Te_Ching.pdf": { "url": "https://www.with.org/tao_te_ching_en.pdf" },
|
6 |
+
"data/Confucianism/Analects of Confucius.pdf": { "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf" },
|
7 |
+
}
|
8 |
+
|
9 |
def get_domains():
|
10 |
domains = []
|
11 |
for root, dirs, files in os.walk("data"):
|
12 |
for dir in dirs:
|
13 |
domains.append(dir)
|
14 |
+
return domains
|
15 |
+
|
16 |
+
|
17 |
+
def get_sources():
|
18 |
+
res = []
|
19 |
+
for root, dirs, files in os.walk("data"):
|
20 |
+
for file in files:
|
21 |
+
if file.endswith(".pdf"):
|
22 |
+
file_path = os.path.join(root, file)
|
23 |
+
print("file_path", file_path)
|
24 |
+
res.append(
|
25 |
+
{
|
26 |
+
"domain": parse_domain(file_path),
|
27 |
+
"name": parse_name(file_path),
|
28 |
+
"file_path": file_path,
|
29 |
+
**metadata_by_file_path.get(file_path, {})
|
30 |
+
}
|
31 |
+
)
|
32 |
+
|
33 |
+
return res
|
34 |
+
|
35 |
+
|
36 |
+
def parse_name(source: str) -> str:
|
37 |
+
return source.split("/")[-1].split(".")[0].replace("_", " ")
|
38 |
+
|
39 |
+
|
40 |
+
def parse_domain(source: str) -> str:
|
41 |
+
return source.split("/")[1].replace("_", " ")
|
anyqa/source_table.py
CHANGED
@@ -1,16 +1,4 @@
|
|
1 |
-
|
2 |
-
{
|
3 |
-
"type": "Doism",
|
4 |
-
"source": "Tao Te Ching",
|
5 |
-
"URL": "https://www.with.org/tao_te_ching_en.pdf",
|
6 |
-
},
|
7 |
-
{
|
8 |
-
"type": "Confucianism",
|
9 |
-
"source": "The Analects of Confucius",
|
10 |
-
"URL": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
|
11 |
-
},
|
12 |
-
# Add more dictionaries as needed
|
13 |
-
]
|
14 |
|
15 |
|
16 |
def generate_source_table():
|
@@ -27,11 +15,11 @@ def generate_source_table():
|
|
27 |
rows = []
|
28 |
|
29 |
# Add each row to the list
|
30 |
-
for
|
31 |
row = [
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
]
|
36 |
|
37 |
row_str = " | ".join(row)
|
|
|
1 |
+
from anyqa.config import get_sources
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
|
4 |
def generate_source_table():
|
|
|
15 |
rows = []
|
16 |
|
17 |
# Add each row to the list
|
18 |
+
for source in get_sources():
|
19 |
row = [
|
20 |
+
source.get("domain", ""),
|
21 |
+
source.get("name", ""),
|
22 |
+
source.get("url", ""),
|
23 |
]
|
24 |
|
25 |
row_str = " | ".join(row)
|
app.py
CHANGED
@@ -345,18 +345,17 @@ with gr.Blocks(title="❓ Q&A", css="style.css", theme=theme) as demo:
|
|
345 |
with gr.Tabs() as tabs:
|
346 |
with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
|
347 |
examples_hidden = gr.Textbox(elem_id="hidden-message")
|
348 |
-
|
349 |
questions = [
|
350 |
-
"How does
|
351 |
-
"From a
|
352 |
-
"How might
|
353 |
-
"Does
|
354 |
-
"How does
|
355 |
-
"From a
|
356 |
-
"How might
|
357 |
-
"Does
|
358 |
-
"How does
|
359 |
-
"From a
|
360 |
]
|
361 |
|
362 |
examples_questions = gr.Examples(
|
@@ -564,5 +563,3 @@ Or around 2 to 4 times more than a typical Google search.
|
|
564 |
demo.queue(concurrency_count=16)
|
565 |
|
566 |
demo.launch()
|
567 |
-
|
568 |
-
|
|
|
345 |
with gr.Tabs() as tabs:
|
346 |
with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
|
347 |
examples_hidden = gr.Textbox(elem_id="hidden-message")
|
|
|
348 |
questions = [
|
349 |
+
"How does Daoism view our dependence on modern technology?",
|
350 |
+
"From a Confucian perspective, what is the role of tradition in modern society?",
|
351 |
+
"How might Daoism influence sustainable economic practices?",
|
352 |
+
"Does Confucianism advocate for a particular economic model?",
|
353 |
+
"How does Daoism interpret the dynamics of modern relationships?",
|
354 |
+
"From a Confucian viewpoint, what are the responsibilities of individuals in a family?",
|
355 |
+
"How might Daoism guide our approach to mental and physical health?",
|
356 |
+
"Does Confucianism offer insights into educational methods?",
|
357 |
+
"How does Daoism view the purpose and methods of modern education?",
|
358 |
+
"From a Confucian perspective, what is the importance of social harmony?",
|
359 |
]
|
360 |
|
361 |
examples_questions = gr.Examples(
|
|
|
563 |
demo.queue(concurrency_count=16)
|
564 |
|
565 |
demo.launch()
|
|
|
|
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/data_level0.bin
RENAMED
File without changes
|
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/header.bin
RENAMED
File without changes
|
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/length.bin
RENAMED
File without changes
|
chroma_db/{1730b83a-f75a-41e2-aba7-637881bb5ea8 → 01ac9c34-80c7-488f-b1ef-a9d1a5ebc563}/link_lists.bin
RENAMED
File without changes
|
chroma_db/chroma.sqlite3
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4067328
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70a6e9fbcc7cec17b315d076cca1f60f6bce9f8d02a12e3f0a5786b4d1565f86
|
3 |
size 4067328
|