# Spaces:
# Runtime error
# Runtime error
import os
import shutil

from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
def load_data():
    """Parse the PDF corpus and index it in the vector store.

    Calls ``parse_data()`` to obtain the chunked documents, builds a
    ``HuggingFaceEmbeddings`` instance for ``EMBEDDING_MODEL_NAME``, obtains
    the store from ``get_vectorstore`` and adds the chunks to that store.

    Returns:
        The ``Chroma`` store returned by ``get_vectorstore``, now containing
        the parsed chunks.

    Raises:
        AssertionError: If ``get_vectorstore`` returns something that is not
            a ``Chroma`` instance.
    """
    docs = parse_data()
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vectorstore = get_vectorstore(embedding_function)
    # Narrow the type for readers and type checkers.
    assert isinstance(vectorstore, Chroma)
    # ``from_documents`` is an alternate constructor: it builds a *new* store
    # and returns it, so calling it here and discarding the result left the
    # returned ``vectorstore`` unpopulated. Add to the store we hand back.
    # NOTE(review): assumes get_vectorstore already binds the store to
    # PERSIST_DIRECTORY -- confirm in .vectorstore.
    if docs:
        # Skip the write when nothing was parsed; there is nothing to embed.
        vectorstore.add_documents(docs)
    return vectorstore
def parse_data() -> list:
    """Load every PDF under ``data/`` and return it as metadata-tagged chunks.

    The ``data`` directory is walked recursively. Each PDF is loaded with
    ``PyPDFLoader`` and split with a ``RecursiveCharacterTextSplitter``
    configured with ``chunk_size=1000`` and ``chunk_overlap=0``.

    Returns:
        A list of chunk documents. Each chunk's metadata is extended with:

        * ``name`` -- ``parse_name`` applied to the chunk's ``source``.
        * ``domain`` -- ``parse_domain`` applied to the chunk's ``source``.
        * ``page_number`` -- copy of the loader-provided ``page`` entry.
        * ``short_name`` -- same value as ``name``.
    """
    # One splitter serves every file; its configuration never changes.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = []
    for root, _dirs, files in os.walk("data"):
        for file in files:
            # Match the extension case-insensitively so "REPORT.PDF" is kept.
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            # Use load() rather than load_and_split(): the latter applies its
            # own default splitter first, so the text would be split twice
            # with two different configurations.
            pages = PyPDFLoader(file_path).load()
            for chunk in text_splitter.split_documents(pages):
                source = chunk.metadata["source"]
                chunk.metadata["name"] = parse_name(source)
                chunk.metadata["domain"] = parse_domain(source)
                chunk.metadata["page_number"] = chunk.metadata["page"]
                chunk.metadata["short_name"] = chunk.metadata["name"]
                docs.append(chunk)
    return docs
def parse_name(source: str) -> str:
    """Return the file name of *source* without its directory or extension.

    Only the final extension is removed, so dots inside the name survive
    (``"a/b/annual.report.pdf"`` gives ``"annual.report"``). Both ``/`` and
    ``\\`` are accepted as path separators.

    Args:
        source: Path of the source file.

    Returns:
        The base file name with its last extension stripped.
    """
    # Normalise separators first so paths built on Windows are handled too.
    filename = source.replace("\\", "/").rsplit("/", 1)[-1]
    return os.path.splitext(filename)[0]
def parse_domain(source: str) -> str:
    """Return the second path component of *source* as its domain.

    For ``"data/law/doc.pdf"`` the result is ``"law"``. Both ``/`` and ``\\``
    are accepted as path separators.

    Args:
        source: Path of the source file.

    Returns:
        The second path component, or ``""`` when the path has fewer than
        three components (no directory sits between the first component and
        the file name).
    """
    parts = source.replace("\\", "/").split("/")
    # With fewer than three parts, index 1 is either missing or is the file
    # name itself; neither is a domain.
    if len(parts) < 3:
        return ""
    return parts[1]
def clear_index():
    """Delete everything stored under ``PERSIST_DIRECTORY``.

    Files and symbolic links are unlinked and subdirectories are removed
    recursively. The directory itself is kept. A missing directory is treated
    as already clear. A failure to delete one entry is printed and does not
    stop the remaining entries from being processed.
    """
    folder = PERSIST_DIRECTORY
    try:
        entries = os.listdir(folder)
    except FileNotFoundError:
        # Nothing has been persisted yet, so there is nothing to clear.
        return
    for filename in entries:
        file_path = os.path.join(folder, filename)
        try:
            # Check links before directories so a link to a directory is
            # unlinked rather than followed.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            print("Failed to delete %s. Reason: %s" % (file_path, e))
if __name__ == "__main__":
    # Rebuild the index from scratch, then run one sample query against it.
    clear_index()
    store = load_data()

    sample_query = "He who can bear the misfortune of a nation is called the ruler of the world."
    matches = store.similarity_search(sample_query)
    print(matches)