Spaces:
Runtime error
Runtime error
"""Loads an epub file into a list of documents.""" | |
from dataclasses import dataclass | |
from pathlib import Path | |
from typing import List, Union | |
from epub2txt import epub2txt | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
from loguru import logger | |
@dataclass
class EpubLoader(BaseLoader):
    """Load an epub file into a list of documents.

    The class is a dataclass so that the ``file_path`` field declared below
    is accepted by the constructor, e.g. ``EpubLoader("book.epub").load()``.

    Args:
        file_path: file path or url to epub

    Returns:
        self.load() -> list of Documents
    """

    file_path: Union[str, Path]

    def load(self) -> List[Document]:
        """Load data into document objects.

        Calls ``epub2txt`` on ``self.file_path`` with ``outputlist=True`` and
        pairs each returned text with the corresponding entry of
        ``epub2txt.content_titles``.

        Returns:
            One ``Document`` per (title, text) pair. Each document's metadata
            holds ``"source"`` (the ``file_path`` as given) and ``"ch."``
            (the paired title).

        Raises:
            Exception: anything raised while converting the epub is logged
                via ``logger.error`` and then re-raised unchanged.
        """
        try:
            texts = epub2txt(self.file_path, outputlist=True)
            # The titles are read from the epub2txt object itself, after the
            # conversion call above has run.
            ch_titles = epub2txt.content_titles
        except Exception as exc:
            logger.error(exc)
            raise

        # zip() stops at the shorter of the two sequences, so any unpaired
        # trailing title or text is dropped.
        return [
            Document(
                page_content=text,
                metadata={"source": self.file_path, "ch.": title},
            )
            for title, text in zip(ch_titles, texts)
        ]