Update utils.py
Browse files
utils.py
CHANGED
@@ -335,8 +335,8 @@ def document_loading_splitting():
|
|
335 |
|
336 |
# Dateien im Hugging Face Space auflisten
|
337 |
files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token)
|
338 |
-
pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
|
339 |
-
word_files = [f for f in files_in_repo if f.endswith('.docx') and f.startswith("chroma/kkg/word/")]
|
340 |
|
341 |
|
342 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
@@ -403,7 +403,7 @@ def document_storage_chroma(splits):
|
|
403 |
########################################################
|
404 |
#Splits für den Vektorstore speichern - bzw. laden
|
405 |
########################################################
|
406 |
-
def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
407 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
408 |
if not os.path.exists(directory):
|
409 |
os.makedirs(directory)
|
@@ -422,7 +422,7 @@ def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", pr
|
|
422 |
upload_file_to_huggingface(preprocessed_filepath, f"{directory}/{preprocessed_filename}")
|
423 |
upload_file_to_huggingface(original_filepath, f"{directory}/{original_filename}")
|
424 |
|
425 |
-
def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
426 |
preprocessed_splits = None
|
427 |
original_splits = None
|
428 |
|
@@ -457,7 +457,7 @@ def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_spli
|
|
457 |
########################################
|
458 |
#das Mapping der orginal-Splits und der preprocessed Splits speichern - und laden
|
459 |
########################################
|
460 |
-
def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="mapping.pkl"):
|
461 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
462 |
if not os.path.exists(directory):
|
463 |
os.makedirs(directory)
|
@@ -471,7 +471,7 @@ def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="ma
|
|
471 |
upload_file_to_huggingface(filepath, f"{directory}/{filename}")
|
472 |
|
473 |
|
474 |
-
def load_split_to_original_mapping(directory="chroma/kkg", filename="mapping.pkl"):
|
475 |
try:
|
476 |
# Laden des Mappings aus dem Hugging Face Repository
|
477 |
file_path = hf_hub_download(
|
@@ -739,9 +739,9 @@ def download_link(doc):
|
|
739 |
|
740 |
# Bestimmen des Dokumenttyps und Anpassen des Pfads
|
741 |
if doc_path.lower().endswith('.pdf'):
|
742 |
-
file_url = f"{base_url}/chroma/kkg/pdf/{quote(title)}?token={hf_token}"
|
743 |
elif doc_path.lower().endswith('.docx'):
|
744 |
-
file_url = f"{base_url}/chroma/kkg/word/{quote(title)}?token={hf_token}"
|
745 |
else:
|
746 |
# Fallback für andere Dateitypen
|
747 |
file_url = f"{base_url}/{quote(doc_path)}?token={hf_token}"
|
@@ -760,7 +760,7 @@ def display_files():
|
|
760 |
|
761 |
# PDF-Dateien
|
762 |
files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - PDF-Ordner</th></tr>"
|
763 |
-
pdf_files = [f for f in list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token) if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
|
764 |
for i, file in enumerate(pdf_files):
|
765 |
row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a"
|
766 |
files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
|
@@ -768,7 +768,7 @@ def display_files():
|
|
768 |
|
769 |
# Word-Dateien
|
770 |
files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - Word-Ordner</th></tr>"
|
771 |
-
word_files = [f for f in list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token) if f.endswith('.docx') and f.startswith("chroma/kkg/word/")]
|
772 |
for i, file in enumerate(word_files):
|
773 |
row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a"
|
774 |
files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
|
|
|
335 |
|
336 |
# Dateien im Hugging Face Space auflisten
|
337 |
files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token)
|
338 |
+
pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/demo/pdf/")]
|
339 |
+
word_files = [f for f in files_in_repo if f.endswith('.docx') and f.startswith("chroma/demo/word/")]
|
340 |
|
341 |
|
342 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
|
|
403 |
########################################################
|
404 |
#Splits für den Vektorstore speichern - bzw. laden
|
405 |
########################################################
|
406 |
+
def save_splits(preprocessed_splits, original_splits, directory="chroma/demo", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
407 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
408 |
if not os.path.exists(directory):
|
409 |
os.makedirs(directory)
|
|
|
422 |
upload_file_to_huggingface(preprocessed_filepath, f"{directory}/{preprocessed_filename}")
|
423 |
upload_file_to_huggingface(original_filepath, f"{directory}/{original_filename}")
|
424 |
|
425 |
+
def load_splits(directory="chroma/demo", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
426 |
preprocessed_splits = None
|
427 |
original_splits = None
|
428 |
|
|
|
457 |
########################################
|
458 |
#das Mapping der orginal-Splits und der preprocessed Splits speichern - und laden
|
459 |
########################################
|
460 |
+
def save_split_to_original_mapping(mapping, directory="chroma/demo", filename="mapping.pkl"):
|
461 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
462 |
if not os.path.exists(directory):
|
463 |
os.makedirs(directory)
|
|
|
471 |
upload_file_to_huggingface(filepath, f"{directory}/{filename}")
|
472 |
|
473 |
|
474 |
+
def load_split_to_original_mapping(directory="chroma/demo", filename="mapping.pkl"):
|
475 |
try:
|
476 |
# Laden des Mappings aus dem Hugging Face Repository
|
477 |
file_path = hf_hub_download(
|
|
|
739 |
|
740 |
# Bestimmen des Dokumenttyps und Anpassen des Pfads
|
741 |
if doc_path.lower().endswith('.pdf'):
|
742 |
+
file_url = f"{base_url}/chroma/demo/pdf/{quote(title)}?token={hf_token}"
|
743 |
elif doc_path.lower().endswith('.docx'):
|
744 |
+
file_url = f"{base_url}/chroma/demo/word/{quote(title)}?token={hf_token}"
|
745 |
else:
|
746 |
# Fallback für andere Dateitypen
|
747 |
file_url = f"{base_url}/{quote(doc_path)}?token={hf_token}"
|
|
|
760 |
|
761 |
# PDF-Dateien
|
762 |
files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - PDF-Ordner</th></tr>"
|
763 |
+
pdf_files = [f for f in list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token) if f.endswith('.pdf') and f.startswith("chroma/demo/pdf/")]
|
764 |
for i, file in enumerate(pdf_files):
|
765 |
row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a"
|
766 |
files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
|
|
|
768 |
|
769 |
# Word-Dateien
|
770 |
files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - Word-Ordner</th></tr>"
|
771 |
+
word_files = [f for f in list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token) if f.endswith('.docx') and f.startswith("chroma/demo/word/")]
|
772 |
for i, file in enumerate(word_files):
|
773 |
row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a"
|
774 |
files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
|