|
import os |
|
|
|
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from llama_parse import LlamaParse |
|
|
|
from typing import Union, List, Dict |
|
|
|
from abc import ABC, abstractmethod |
|
|
|
class PDFExtractor(ABC):
    """Abstract interface for PDF content extractors.

    An instance holds the list of input files together with the
    ``num_workers`` and ``verbose`` settings; concrete subclasses implement
    the actual text / image / table extraction.
    """

    def __init__(
        self,
        file_or_list: Union[str, os.PathLike, List[Union[str, os.PathLike]]],
        num_workers: int = 1,
        verbose: bool = False,
    ):
        """Store the input file(s) and the extraction settings.

        Args:
            file_or_list: A single path (``str`` or ``os.PathLike``) or an
                iterable of paths. A single path is wrapped in a
                one-element list.
            num_workers: Stored on the instance for subclasses to use.
            verbose: Stored on the instance for subclasses to use.
        """
        # A lone path object (e.g. pathlib.Path) must be wrapped just like a
        # lone string; otherwise ``filelist`` would not be a list at all.
        if isinstance(file_or_list, (str, os.PathLike)):
            file_or_list = [file_or_list]

        # Build a fresh list so later mutations of the caller's list do not
        # leak into this extractor, and normalise path objects to plain
        # strings so subclasses can rely on string operations.
        self.filelist = [
            os.fspath(path) if isinstance(path, os.PathLike) else path
            for path in file_or_list
        ]
        self.num_workers = num_workers
        self.verbose = verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """Extract raw text from every PDF, without post-processing.

        Returns:
            A dictionary whose key is the filename and whose value is a list
            of strings, one for each page.
        """

    @abstractmethod
    def extract_images(self):
        """Extract images from every PDF, without post-processing."""

    @abstractmethod
    def extract_tables(self):
        """Extract tables from every PDF, without post-processing.

        Returns:
            The tables in JSON format.
        """
|
|
|
class _PyPDFLoader(PDFExtractor):
    """PDF extractor backed by LangChain's ``PyPDFLoader``."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Load every file in ``self.filelist`` and collect its text.

        Returns:
            A dictionary mapping each file's base name to the list of
            ``page_content`` strings of the documents returned by
            ``PyPDFLoader(fpath).load()``.
        """
        output_dict = {}
        for fpath in self.filelist:
            # os.path.basename honours the platform's path separators,
            # unlike a manual split on '/'.
            # NOTE(review): files sharing a base name in different
            # directories overwrite each other in the result.
            fname = os.path.basename(fpath)
            output_dict[fname] = [
                doc.page_content for doc in PyPDFLoader(fpath).load()
            ]
        return output_dict

    def extract_images(self):
        """Not supported by this backend.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Not supported by this backend.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")
|
|
|
|
|
class _LlamaParse(PDFExtractor):
    """PDF extractor backed by the LlamaParse service."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Parse every file in ``self.filelist`` and collect per-page text.

        The API key is read from the ``LLAMA_PARSE_API_KEY`` environment
        variable.

        Returns:
            A dictionary mapping each file's base name to a list holding the
            ``'text'`` entry of each page in the parser's JSON result. The
            base name is used as the key, matching the base-class contract
            ("key = filename").

        Raises:
            ValueError: If ``LLAMA_PARSE_API_KEY`` is not set.
        """
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")

        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",
        )

        output_dict = {}
        for fpath in self.filelist:
            docs = parser.get_json_result(fpath)
            # Guard against an empty result so one bad file does not abort
            # the whole batch with an IndexError.
            # NOTE(review): presumably an empty list signals a failed parse;
            # confirm against the installed llama_parse version.
            pages = docs[0]['pages'] if docs else []
            # Keep the text of every page; previously the first page's text
            # was computed and discarded, and None was stored instead.
            output_dict[os.path.basename(fpath)] = [
                page['text'] for page in pages
            ]
        return output_dict

    def extract_images(self):
        """Not supported by this backend.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Not supported by this backend.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
|
|
|
|
|
def pdf_extractor(extractor_type: str, *args, **kwargs) -> PDFExtractor:
    """Build and return the PDF extractor selected by ``extractor_type``.

    Args:
        extractor_type: Either ``'PyPDFLoader'`` or ``'LlamaParse'``.
        *args: Positional arguments forwarded to the extractor constructor.
        **kwargs: Keyword arguments forwarded to the extractor constructor.

    Returns:
        An initialized instance of the chosen extractor class.

    Raises:
        ValueError: If ``extractor_type`` is not one of the supported names.
    """
    # Reject unknown names up front so the happy path stays flat.
    if extractor_type not in ('PyPDFLoader', 'LlamaParse'):
        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")

    extractor_cls = _PyPDFLoader if extractor_type == 'PyPDFLoader' else _LlamaParse
    return extractor_cls(*args, **kwargs)
|
|
|
|
|
|
|
|
|
|