# NOTE: residue from the web file viewer this module was copied from, kept as
# a comment so that the module parses.
# Space status: Runtime error
# File size: 2,519 Bytes
# (The per-line blame hashes d98ba57 / cc2ce8c / d6936f0 / 528bf3d and the
# 1-80 line-number gutter followed here.)
import os
import shutil

from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore


def load_data() -> Chroma:
    """Parse the source documents, embed them and build the Chroma index.

    Progress is reported on stdout.

    Returns:
        The Chroma store produced by ``from_documents``: the one that holds
        the parsed chunks and was created with
        ``persist_directory=PERSIST_DIRECTORY``.
    """
    print("Loading data...")
    docs = parse_data()
    print("Loaded documents")

    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    print("Building index...")
    vectorstore = get_vectorstore(embedding_function)
    assert isinstance(vectorstore, Chroma)
    # ``from_documents`` is a classmethod: it creates and returns a new store
    # rather than filling ``vectorstore`` in place. Keep its result so the
    # caller receives the store that actually contains ``docs``.
    vectorstore = vectorstore.from_documents(
        docs, embedding_function, persist_directory=PERSIST_DIRECTORY
    )
    print("Index built")
    return vectorstore


def parse_data():
    """Load every ``.pdf`` below ``data/`` and split it into annotated chunks.

    Each chunk keeps the metadata set by the loader and additionally gets:

    * ``name`` - ``parse_name`` of the chunk's ``source``
    * ``domain`` - ``parse_domain`` of the chunk's ``source``
    * ``page_number`` - copy of the loader's ``page`` entry
    * ``short_name`` - same value as ``name``

    Returns:
        A list of chunk documents, in ``os.walk`` order. The list is empty
        when no PDF is found (including when ``data/`` does not exist).
    """
    pdf_paths = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk("data")
        for file in files
        if file.endswith(".pdf")
    ]
    if not pdf_paths:
        return []

    # The splitter settings never change, so build it once instead of once
    # per PDF.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    docs = []
    for file_path in pdf_paths:
        pages = PyPDFLoader(file_path).load_and_split()
        # split it into chunks
        for chunk in text_splitter.split_documents(pages):
            source = chunk.metadata["source"]
            chunk.metadata["name"] = parse_name(source)
            chunk.metadata["domain"] = parse_domain(source)
            chunk.metadata["page_number"] = chunk.metadata["page"]
            chunk.metadata["short_name"] = chunk.metadata["name"]
            docs.append(chunk)
    return docs


def parse_name(source: str) -> str:
    """Return a readable document name derived from its file path.

    The directory part and the final extension are dropped, and underscores
    are replaced by spaces, e.g. ``data/taoism/Tao_Te_Ching.pdf`` becomes
    ``Tao Te Ching``.

    Args:
        source: Path of the document, as built with ``os.path.join``.

    Returns:
        The file name without directory and extension, underscores as spaces.
    """
    # basename/splitext follow the platform's separator rules and strip only
    # the last extension, so dots inside the name itself are preserved.
    filename = os.path.basename(source)
    stem, _extension = os.path.splitext(filename)
    return stem.replace("_", " ")


def parse_domain(source: str) -> str:
    """Return the second path component of ``source``.

    For a path such as ``data/taoism/book.pdf`` this is ``taoism``.

    Args:
        source: Path of the document, as built with ``os.path.join``.

    Returns:
        The path component at index 1.

    Raises:
        IndexError: If ``source`` contains no path separator.
    """
    # Paths are built with os.path.join, so normalise the platform separator
    # before splitting. On POSIX this replacement is a no-op.
    components = source.replace(os.sep, "/").split("/")
    return components[1]


def clear_index(directory: str = "../chroma_db") -> None:
    """Delete everything inside the index directory, keeping the directory.

    Files and symlinks are unlinked and subdirectories are removed
    recursively. A failure on one entry is printed and the remaining entries
    are still processed. A missing directory is treated as already clear.

    Args:
        directory: Directory whose contents are removed. Defaults to the
            path that was previously hard-coded.
    """
    # NOTE(review): the index is persisted to PERSIST_DIRECTORY elsewhere in
    # this module; confirm that it points at the same place as this default.
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        # Nothing has been persisted yet, so there is nothing to clear.
        return

    for filename in entries:
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # Subdirectories must go too, otherwise stale index files
                # survive the "clear".
                shutil.rmtree(file_path)
        except OSError as e:
            print("Failed to delete %s. Reason: %s" % (file_path, e))


if __name__ == "__main__":
    # Clear the persisted index, rebuild it, then run one sample similarity
    # query and print whatever comes back.
    clear_index()
    store = load_data()

    sample_query = "He who can bear the misfortune of a nation is called the ruler of the world."
    matches = store.similarity_search(sample_query)
    print(matches)
|