from io import BytesIO
from typing import List, Optional

from fastapi import UploadFile
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_parse import LlamaParse
from PyPDF2 import PdfReader


class Reader(BaseReader):
    """Reader that converts an uploaded PDF into one ``Document`` per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every non-empty page of an uploaded PDF.

        Args:
            file: The uploaded PDF. Its full content is read into memory.

        Returns:
            One ``Document`` per page whose extracted text is non-blank,
            in page order. Each document carries the stripped page text and
            a ``{"page": <1-based page number>}`` metadata entry. Pages with
            no extractable text are skipped, so page numbers may have gaps.

        Raises:
            RuntimeError: If reading or parsing the upload fails for any
                reason; the original exception is attached as ``__cause__``.
        """
        try:
            file_content = await file.read()

            # Parse from the bytes just read rather than from ``file.file``:
            # the read above leaves the underlying file's cursor at EOF, and
            # an in-memory buffer does not depend on that cursor at all.
            reader = PdfReader(BytesIO(file_content))

            documents = []
            for page_num, page in enumerate(reader.pages, start=1):
                # extract_text() may yield nothing for a page; treat as empty.
                text = (page.extract_text() or "").strip()
                if text:  # Only keep pages that actually contain text
                    documents.append(
                        Document(text=text, metadata={"page": page_num})
                    )
            return documents

        except Exception as e:
            # Boundary handler: report, then surface a single error type to
            # callers while preserving the root cause for debugging.
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e