LOUIS SANNA committed
Commit d98ba57
1 Parent(s): cc2ce8c

feat(data): add other pdfs

chroma_db/13934663-2db5-404d-be0f-51734d442e08/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
+size 3212000
chroma_db/13934663-2db5-404d-be0f-51734d442e08/header.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
+size 100
chroma_db/13934663-2db5-404d-be0f-51734d442e08/length.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
+size 4000
chroma_db/13934663-2db5-404d-be0f-51734d442e08/link_lists.bin ADDED
File without changes
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db081ece29301d223a01bac97e8b2905fada2e7c376cec96bf44fee0f5c95069
-size 1843200
+oid sha256:4dc2c64a9de7507097ab452fdce23fc6348f38e0d34484d791a8c43366b78001
+size 2564096
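
Note: all of the chroma_db binaries in this commit are tracked with Git LFS, so the diffs above show only three-line pointer files (version, oid, size) rather than the binary contents. A minimal sketch of reading such a pointer, assuming the standard LFS pointer layout seen above:

# Parse a Git LFS pointer file into its key/value fields.
# Assumes the standard three-line layout shown in the diffs above.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key and value:
                fields[key] = value
    return fields

# read_lfs_pointer("chroma_db/chroma.sqlite3") would return, e.g.:
# {"version": "https://git-lfs.github.com/spec/v1",
#  "oid": "sha256:4dc2c64a...", "size": "2564096"}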
climateqa/build_index.py CHANGED
@@ -1,11 +1,12 @@
-# import
+import os
+
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.document_loaders import PyPDFLoader
 
 from .embeddings import EMBEDDING_MODEL_NAME
-from .vectorstore import get_vectorstore
+from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
 
 
 def load_data():
@@ -15,24 +16,33 @@ def load_data():
 
     assert isinstance(vectorstore, Chroma)
     vectorstore.from_documents(
-        docs, embedding_function, persist_directory="./chroma_db"
+        docs, embedding_function, persist_directory=PERSIST_DIRECTORY
     )
     return vectorstore
 
 
 def parse_data():
-    loader = PyPDFLoader("data/daoism/tao-te-ching.pdf")
-    pages = loader.load_and_split()
-
-    # split it into chunks
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
-    docs = text_splitter.split_documents(pages)
-    print(docs)
-    for doc in docs:
-        doc.metadata["name"] = parse_name(doc.metadata["source"])
-        doc.metadata["domain"] = parse_domain(doc.metadata["source"])
-        doc.metadata["page_number"] = doc.metadata["page"]
-        doc.metadata["short_name"] = doc.metadata["name"]
+    docs = []
+    for root, dirs, files in os.walk("data"):
+        for file in files:
+            if file.endswith(".pdf"):
+                file_path = os.path.join(root, file)
+                loader = PyPDFLoader(file_path)
+                pages = loader.load_and_split()
+
+                # split it into chunks
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000, chunk_overlap=0
+                )
+                doc_chunks = text_splitter.split_documents(pages)
+
+                for chunk in doc_chunks:
+                    chunk.metadata["name"] = parse_name(chunk.metadata["source"])
+                    chunk.metadata["domain"] = parse_domain(chunk.metadata["source"])
+                    chunk.metadata["page_number"] = chunk.metadata["page"]
+                    chunk.metadata["short_name"] = chunk.metadata["name"]
+                    docs.append(chunk)
+
     return docs
 
 
@@ -41,10 +51,22 @@ def parse_name(source: str) -> str:
 
 
 def parse_domain(source: str) -> str:
-    return source.split("/")[2]
+    return source.split("/")[1]
+
+
+def clear_index():
+    folder = PERSIST_DIRECTORY
+    for filename in os.listdir(folder):
+        file_path = os.path.join(folder, filename)
+        try:
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+        except Exception as e:
+            print("Failed to delete %s. Reason: %s" % (file_path, e))
 
 
 if __name__ == "__main__":
+    clear_index()
    db = load_data()
     # query it
     query = (
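
Two things worth noting in this diff. First, parse_data() now walks the whole data/ tree instead of loading a single hard-coded PDF, and the chunk size grows from 250 to 1000 characters. Second, parse_domain switches from index 2 to index 1, which matches the paths os.walk("data") produces, where the domain folder is the second path segment. A quick sanity check of that indexing (the path below is the example from the removed code):

# Paths produced by os.walk("data") look like data/<domain>/<file>.pdf,
# so the domain sits at index 1 after splitting on "/".
source = "data/daoism/tao-te-ching.pdf"
parts = source.split("/")  # ["data", "daoism", "tao-te-ching.pdf"]
assert parts[1] == "daoism"            # new parse_domain behaviour
assert parts[2] == "tao-te-ching.pdf"  # index 2 now points at the filename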
climateqa/qa_logging.py CHANGED
@@ -2,6 +2,7 @@ import datetime
 import json
 import os
 
+
 def log(question, history, docs, user_id):
     if has_blob_config():
         log_in_azure(question, history, docs, user_id)
@@ -48,6 +49,7 @@ def get_azure_blob_client():
     file_share_name = "climategpt"
     # I don't know why this is necessary, but it cause an error otherwise when running build_index.py
     from azure.storage.fileshare import ShareServiceClient
+
     service = ShareServiceClient(account_url=account_url, credential=credential)
     share_client = service.get_share_client(file_share_name)
     return share_client
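
The changes here are pure whitespace, but the context lines show an existing workaround worth naming: ShareServiceClient is imported inside the function because, per the inline comment, importing azure.storage.fileshare at module scope broke build_index.py. The general deferred-import pattern, sketched with a hypothetical helper:

# Deferred-import pattern (hypothetical example): the dependency is imported
# only when the function runs, so code paths that import this module but
# never call the function do not need azure.storage.fileshare at all.
def get_share_client(account_url: str, credential: str, share_name: str):
    from azure.storage.fileshare import ShareServiceClient  # deferred on purpose
    service = ShareServiceClient(account_url=account_url, credential=credential)
    return service.get_share_client(share_name)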
climateqa/vectorstore.py CHANGED
@@ -5,6 +5,8 @@ import os
 import pinecone
 from langchain.vectorstores import Chroma, Pinecone
 
+PERSIST_DIRECTORY = "./chroma_db"
+
 try:
     from dotenv import load_dotenv
 
@@ -21,7 +23,7 @@ def get_vectorstore(embeddings_function):
 
 def get_chroma_vectore_store(embedding_function):
     return Chroma(
-        persist_directory="./chroma_db", embedding_function=embedding_function
+        persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_function
     )
 
 
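
With PERSIST_DIRECTORY hoisted into vectorstore.py, the persistence path is defined once and imported by build_index.py, so the writer and reader of the Chroma index can no longer drift apart. A minimal sketch of reopening the persisted index, assuming the pre-0.1 langchain API used throughout this repo; the embedding model name below is a placeholder for whatever EMBEDDING_MODEL_NAME resolves to:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

PERSIST_DIRECTORY = "./chroma_db"  # mirrors climateqa/vectorstore.py

# Reopen the index that build_index.py persisted; the embedding function
# must match the one used at build time for the search to make sense.
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # placeholder, not the repo's actual constant
)
db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_function)
print(db.similarity_search("what does the tao te ching say about water?", k=4))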