LOUIS SANNA committed
Commit • d98ba57
1 Parent(s): cc2ce8c

feat(data): add other pdfs
Browse files
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/data_level0.bin +3 -0
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/header.bin +3 -0
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/length.bin +3 -0
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/link_lists.bin +0 -0
- chroma_db/chroma.sqlite3 +2 -2
- climateqa/build_index.py +37 -15
- climateqa/qa_logging.py +2 -0
- climateqa/vectorstore.py +3 -1
chroma_db/13934663-2db5-404d-be0f-51734d442e08/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
+size 3212000
chroma_db/13934663-2db5-404d-be0f-51734d442e08/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
+size 100
chroma_db/13934663-2db5-404d-be0f-51734d442e08/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
+size 4000
chroma_db/13934663-2db5-404d-be0f-51734d442e08/link_lists.bin
ADDED
File without changes
chroma_db/chroma.sqlite3
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4dc2c64a9de7507097ab452fdce23fc6348f38e0d34484d791a8c43366b78001
+size 2564096
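The binary index files above are tracked with Git LFS, so the diff shows only pointer files: the spec version line, the object's sha256, and its size in bytes. As a small illustration, such a pointer can be parsed with a few lines of Python (the helper name and usage path are hypothetical, not part of this repo):

def read_lfs_pointer(path: str) -> dict:
    # A Git LFS pointer (https://git-lfs.github.com/spec/v1) is a short
    # text file of "<key> <value>" lines, e.g. "size 2564096".
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Hypothetical usage against a checked-out pointer file:
# info = read_lfs_pointer("chroma_db/chroma.sqlite3")
# print(info["oid"], info["size"])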
climateqa/build_index.py
CHANGED
@@ -1,11 +1,12 @@
-
+import os
+
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.document_loaders import PyPDFLoader
 
 from .embeddings import EMBEDDING_MODEL_NAME
-from .vectorstore import get_vectorstore
+from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
 
 
 def load_data():
@@ -15,24 +16,33 @@ def load_data():
 
     assert isinstance(vectorstore, Chroma)
     vectorstore.from_documents(
-        docs, embedding_function, persist_directory=
+        docs, embedding_function, persist_directory=PERSIST_DIRECTORY
     )
     return vectorstore
 
 
 def parse_data():
-
-
+    docs = []
+    for root, dirs, files in os.walk("data"):
+        for file in files:
+            if file.endswith(".pdf"):
+                file_path = os.path.join(root, file)
+                loader = PyPDFLoader(file_path)
+                pages = loader.load_and_split()
+
+                # split it into chunks
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000, chunk_overlap=0
+                )
+                doc_chunks = text_splitter.split_documents(pages)
+
+                for chunk in doc_chunks:
+                    chunk.metadata["name"] = parse_name(chunk.metadata["source"])
+                    chunk.metadata["domain"] = parse_domain(chunk.metadata["source"])
+                    chunk.metadata["page_number"] = chunk.metadata["page"]
+                    chunk.metadata["short_name"] = chunk.metadata["name"]
+                    docs.append(chunk)
 
-    # split it into chunks
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
-    docs = text_splitter.split_documents(pages)
-    print(docs)
-    for doc in docs:
-        doc.metadata["name"] = parse_name(doc.metadata["source"])
-        doc.metadata["domain"] = parse_domain(doc.metadata["source"])
-        doc.metadata["page_number"] = doc.metadata["page"]
-        doc.metadata["short_name"] = doc.metadata["name"]
     return docs
 
 
@@ -41,10 +51,22 @@ def parse_name(source: str) -> str:
 
 
 def parse_domain(source: str) -> str:
-    return source.split("/")[
+    return source.split("/")[1]
+
+
+def clear_index():
+    folder = PERSIST_DIRECTORY
+    for filename in os.listdir(folder):
+        file_path = os.path.join(folder, filename)
+        try:
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+        except Exception as e:
+            print("Failed to delete %s. Reason: %s" % (file_path, e))
 
 
 if __name__ == "__main__":
+    clear_index()
     db = load_data()
     # query it
     query = (
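Taken together, the rewritten build_index.py wipes the persisted index, walks data/ for every PDF, splits each into 1000-character chunks, and tags every chunk with name, domain, and page metadata before embedding. A minimal sketch of driving the rebuild, equivalent to running python -m climateqa.build_index from the repo root (the query string is illustrative; docs and embedding_function are prepared inside load_data, in lines this hunk does not show):

# Rebuild and sanity-check the index; mirrors the __main__ block above.
from climateqa.build_index import clear_index, load_data

clear_index()      # empty ./chroma_db so stale chunks don't survive the rebuild
db = load_data()   # walk data/, chunk every PDF, embed, and persist

# Illustrative query against the fresh index.
for doc in db.similarity_search("sea level rise projections", k=3):
    print(doc.metadata["domain"], doc.metadata["short_name"], doc.metadata["page_number"])

Note that parse_domain's source.split("/")[1] appears to assume sources live under data/<domain>/<file>.pdf, so the first directory level doubles as the domain label.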
climateqa/qa_logging.py
CHANGED
@@ -2,6 +2,7 @@ import datetime
 import json
 import os
 
+
 def log(question, history, docs, user_id):
     if has_blob_config():
         log_in_azure(question, history, docs, user_id)
@@ -48,6 +49,7 @@ def get_azure_blob_client():
     file_share_name = "climategpt"
     # I don't know why this is necessary, but it cause an error otherwise when running build_index.py
     from azure.storage.fileshare import ShareServiceClient
+
     service = ShareServiceClient(account_url=account_url, credential=credential)
     share_client = service.get_share_client(file_share_name)
     return share_client
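These are whitespace-only changes; the logging flow is unchanged: log() dispatches to Azure when blob configuration is present, using the share client built above. For orientation, a hedged sketch of what the upload half of such a logger might look like with the documented azure.storage.fileshare API (the payload shape and file naming are assumptions, not taken from this commit):

import datetime
import json

def log_in_azure_sketch(share_client, question, history, docs, user_id):
    # Serialize one Q&A interaction; these field names are assumptions.
    entry = {
        "time": datetime.datetime.now().isoformat(),
        "question": question,
        "history": history,
        "docs": [doc.page_content for doc in docs],
        "user_id": user_id,
    }
    # ShareClient.get_file_client and ShareFileClient.upload_file are the
    # standard calls for writing a single file to an Azure file share.
    file_client = share_client.get_file_client(f"{user_id}-{entry['time']}.json")
    file_client.upload_file(json.dumps(entry))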
climateqa/vectorstore.py
CHANGED
@@ -5,6 +5,8 @@ import os
 import pinecone
 from langchain.vectorstores import Chroma, Pinecone
 
+PERSIST_DIRECTORY = "./chroma_db"
+
 try:
     from dotenv import load_dotenv
 
@@ -21,7 +23,7 @@ def get_vectorstore(embeddings_function):
 
 def get_chroma_vectore_store(embedding_function):
     return Chroma(
-        persist_directory=
+        persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_function
     )
 
 
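With PERSIST_DIRECTORY now shared between build_index.py and vectorstore.py, the query side opens the same on-disk index the builder wrote. A minimal usage sketch, assuming get_vectorstore falls back to the Chroma store when Pinecone is not configured (the query text is illustrative):

from langchain.embeddings import HuggingFaceEmbeddings

from climateqa.embeddings import EMBEDDING_MODEL_NAME
from climateqa.vectorstore import get_vectorstore

# Use the same embedding model the index was built with; otherwise query
# vectors and stored vectors live in different embedding spaces.
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

vectorstore = get_vectorstore(embedding_function)
results = vectorstore.similarity_search("drivers of ocean acidification", k=4)
for doc in results:
    print(doc.metadata["name"], "p.", doc.metadata["page_number"])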