File size: 3,431 Bytes
10d6a86 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import os
# from langchain.document_loaders import PyPDFLoader # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_parse import LlamaParse
from typing import Union, List, Dict
from abc import ABC, abstractmethod
class PDFExtractor(ABC):
    """Abstract interface for pulling content out of one or more PDF files.

    Concrete subclasses implement `extract_text`, `extract_images` and
    `extract_tables`.

    Attributes:
        filelist: The inputs to process, always a list, in the order given.
        num_workers: Stored as given; made available to subclasses.
        verbose: Stored as given; made available to subclasses.
    """

    def __init__(
        self,
        file_or_list: Union[str, os.PathLike, List[Union[str, os.PathLike]]],
        num_workers: int = 1,
        verbose: bool = False,
    ):
        """Store the inputs; accepts either a single file or a collection of files.

        Args:
            file_or_list: One path (`str`, `bytes` or `os.PathLike`) or an
                iterable of paths.
            num_workers: Kept on the instance for subclasses to use.
            verbose: Kept on the instance for subclasses to use.
        """
        # A lone path object is a scalar, not a collection: without this check
        # a `pathlib.Path` would be stored as-is and break later iteration.
        if isinstance(file_or_list, (str, bytes, os.PathLike)):
            self.filelist = [file_or_list]
        else:
            # Copy so that later mutation of the caller's list (or a one-shot
            # iterable such as a generator) cannot change what we process.
            self.filelist = list(file_or_list)
        self.num_workers = num_workers
        self.verbose = verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """Extract raw text from the PDF(s), with no post-processing.

        Returns:
            A dictionary keyed by filename whose values are lists of strings,
            one string per page.
        """
        pass

    @abstractmethod
    def extract_images(self):
        """Extract images from the PDF(s), with no post-processing."""
        pass

    @abstractmethod
    def extract_tables(self):
        """Extract tables from the PDF(s), with no post-processing.

        The result is expected in JSON format.
        """
        pass
class _PyPDFLoader(PDFExtractor):
    """PDF extractor that reads text through langchain's `PyPDFLoader`."""

    def extract_text(self):
        """Load every file in `self.filelist` and collect its text.

        Returns:
            A dictionary mapping each file's base name (directory part
            removed) to the list of `page_content` strings of the documents
            returned by `PyPDFLoader(path).load()`.

        NOTE: the key is the base name only, so two inputs that share a file
        name in different directories map to the same key and the later one
        overwrites the earlier one.
        """
        output_dict = {}
        for fpath in self.filelist:
            # basename handles the platform's path separators (a plain
            # split('/') leaves Windows paths unsplit) and path-like objects.
            fname = os.path.basename(fpath)
            output_dict[fname] = [p.page_content for p in PyPDFLoader(fpath).load()]
        return output_dict

    def extract_images(self):
        """Not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")
class _LlamaParse(PDFExtractor):
    """PDF extractor that reads text through the LlamaParse service."""

    def extract_text(self):
        """Parse every file in `self.filelist` with LlamaParse and collect its text.

        The API key is read from the `LLAMA_PARSE_API_KEY` environment
        variable.

        Returns:
            A dictionary mapping each entry of `self.filelist` (the path as
            given, not its base name) to the list of per-page text strings
            found in the parser's JSON result. The list is empty when the
            parser returns no result for that file.

        Raises:
            ValueError: if `LLAMA_PARSE_API_KEY` is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")
        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",  # or "markdown"
        )
        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            docs = parser.get_json_result(fpath)
            # Each result entry carries a 'pages' list whose items hold the
            # page's 'text'. Walking every entry (rather than indexing
            # docs[0]) keeps an empty result from raising IndexError.
            output_dict[fpath] = [page['text'] for doc in docs for page in doc['pages']]
        return output_dict

    def extract_images(self):
        """Not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
def pdf_extractor(extractor_type: str, *args, **kwargs) -> PDFExtractor:
    """Build and return the PDF extractor selected by `extractor_type`.

    Args:
        extractor_type: Either 'PyPDFLoader' or 'LlamaParse'.
        *args: Positional arguments forwarded to the extractor's constructor.
        **kwargs: Keyword arguments forwarded to the extractor's constructor.

    Returns:
        An initialized instance of the chosen extractor class.

    Raises:
        ValueError: if `extractor_type` is not one of the supported names.
    """
    # Reject unknown names up front so the happy path below stays flat.
    if extractor_type != 'PyPDFLoader' and extractor_type != 'LlamaParse':
        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")

    if extractor_type == 'PyPDFLoader':
        chosen_cls = _PyPDFLoader
    else:
        chosen_cls = _LlamaParse
    return chosen_cls(*args, **kwargs)
|