from langchain_community.document_loaders import PyPDFLoader
import os
from typing import List


class PDFProcessor:
    """
    Helper that reads PDF files and returns their textual content.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Gather the text of every document returned by the loader for each PDF.

        Each path is opened with ``PyPDFLoader`` and read via
        ``load_and_split()``; the ``page_content`` of every returned document
        becomes one entry of the result, in input order.

        Processing is best-effort: a file that raises any exception while
        being loaded is reported on stdout and skipped, and an entry whose
        ``page_content`` is neither ``str`` nor ``bytes`` is reported on
        stdout and left out.

        Args:
            file_paths (List[str]): Paths of the PDF documents to read.

        Returns:
            List[str]: The extracted text entries, one per loaded document.
        """
        collected = []

        for pdf_path in file_paths:
            try:
                for chunk in PyPDFLoader(pdf_path).load_and_split():
                    content = chunk.page_content
                    if isinstance(content, str):
                        collected.append(content)
                    elif isinstance(content, bytes):
                        # Undecodable bytes are dropped rather than raising.
                        collected.append(content.decode('utf-8', errors='ignore'))
                    else:
                        print(f"Unexpected type: {type(content)}")
            except Exception as e:
                # Keep going with the remaining files after a failure.
                print(f"Failed to process {pdf_path}: {e}")

        return collected