multilingual-dokugpt / epub_loader.py
ffreemt
Update sync with github repo
bfb2c2a
raw
history blame
No virus
1.06 kB
"""Load an epub file into a list of documents."""
from dataclasses import dataclass
from pathlib import Path
from typing import List, Union
from epub2txt import epub2txt
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from loguru import logger
@dataclass
class EpubLoader(BaseLoader):
    """Load an epub file into a list of documents.

    Each (chapter title, chapter text) pair obtained from ``epub2txt`` becomes
    one ``Document``.

    Args:
        file_path: file path or url to epub

    Returns:
        self.load() -> list of Documents
    """

    file_path: Union[str, Path]

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One ``Document`` per chapter. ``page_content`` is the chapter text;
            ``metadata`` holds ``"source"`` (``file_path`` rendered as a
            string) and ``"ch."`` (the chapter title).

        Raises:
            Exception: any error raised while converting the epub is logged
                and then re-raised unchanged.
        """
        try:
            texts = epub2txt(self.file_path, outputlist=True)
            # Titles are read from an attribute on the epub2txt function
            # itself; presumably it is populated by the call just above --
            # TODO confirm against the epub2txt documentation.
            ch_titles = epub2txt.content_titles
        except Exception as exc:
            logger.error(exc)
            raise

        # file_path may be a pathlib.Path; keep metadata values as plain
        # strings so they stay serializable for downstream consumers.
        source = str(self.file_path)

        # zip() stops at the shorter sequence, so any surplus titles or
        # texts are dropped without error.
        return [
            Document(page_content=text, metadata={"source": source, "ch.": title})
            for title, text in zip(ch_titles, texts)
        ]