File size: 3,431 Bytes
10d6a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os

# from langchain.document_loaders import PyPDFLoader  # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_parse import LlamaParse  

from typing import Union, List, Dict

from abc import ABC, abstractmethod

class PDFExtractor(ABC):
    """Abstract interface for back-ends that extract content from PDF files.

    Subclasses implement the three ``extract_*`` methods and operate on the
    paths stored in ``self.filelist``.
    """

    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
        """Record the file(s) to process and the extraction options.

        Args:
            file_or_list: Either one path (``str`` or ``os.PathLike``) or an
                iterable of paths.
            num_workers: Stored as ``self.num_workers`` for subclasses to use.
            verbose: Stored as ``self.verbose`` for subclasses to use.
        """
        if isinstance(file_or_list, (str, os.PathLike)):
            # A lone path (including pathlib.Path) becomes a one-element list;
            # previously a Path object was stored un-wrapped and broke iteration.
            self.filelist = [os.fspath(file_or_list)]
        else:
            # Build a new list so later mutation of the caller's list does not
            # change this extractor; path objects are normalised to plain
            # strings, every other entry is kept untouched.
            self.filelist = [
                os.fspath(entry) if isinstance(entry, os.PathLike) else entry
                for entry in file_or_list
            ]
        self.num_workers = num_workers
        self.verbose = verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """Extract raw text from the PDF(s), with no post-processing.

        Returns:
            A dictionary whose keys are filenames and whose values are lists
            of strings, one string per page.
        """
        pass

    @abstractmethod
    def extract_images(self):
        """Extract images from the PDF(s), with no post-processing."""
        pass

    @abstractmethod
    def extract_tables(self):
        """Extract tables from the PDF(s), with no post-processing.

        The result is expected in JSON format.
        """
        pass

class _PyPDFLoader(PDFExtractor):
    """Text-only PDF extractor that delegates to ``PyPDFLoader``."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Load every file in ``self.filelist`` and collect its page texts.

        Returns:
            A dictionary mapping each file's base name to a list holding the
            ``page_content`` of every document returned by
            ``PyPDFLoader(path).load()``.

        Note:
            Keys are base names only, so two inputs that share a base name
            (e.g. ``a/x.pdf`` and ``b/x.pdf``) overwrite each other.
        """
        output_dict = {}
        for fpath in self.filelist:
            # os.path.basename honours the platform's path separators, unlike
            # a manual split on '/', and also accepts path-like objects.
            fname = os.path.basename(fpath)
            output_dict[fname] = [page.page_content for page in PyPDFLoader(fpath).load()]
        return output_dict

    def extract_images(self):
        """Image extraction is not available for this back-end.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Table extraction is not available for this back-end.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")


class _LlamaParse(PDFExtractor):
    """Text-only PDF extractor that delegates to the LlamaParse service."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Parse every file in ``self.filelist`` and collect its page texts.

        The API key is read from the ``LLAMA_PARSE_API_KEY`` environment
        variable.

        Returns:
            A dictionary mapping each input path (as given in
            ``self.filelist``) to a list with the ``'text'`` of every page.
            A file for which the parser returns no result maps to ``[]``.

        Raises:
            ValueError: If ``LLAMA_PARSE_API_KEY`` is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")

        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text"  # or "markdown"
        )
        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            # Following that example, the result is taken to be a list whose
            # first element carries a 'pages' list of per-page dicts.
            docs = parser.get_json_result(fpath)
            pages = docs[0]['pages'] if docs else []
            # Previously the page text was read and thrown away and None was
            # stored; keep one string per page instead.
            # NOTE(review): keys stay as the full path for backward
            # compatibility, whereas _PyPDFLoader keys by base name.
            output_dict[fpath] = [page['text'] for page in pages]
        return output_dict

    def extract_images(self):
        """Image extraction is not available for this back-end.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Table extraction is not available for this back-end.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")


def pdf_extractor(extractor_type: str, *args, **kwargs) -> PDFExtractor:
    """Build and return the PDF extractor selected by ``extractor_type``.

    Args:
        extractor_type: ``'PyPDFLoader'`` or ``'LlamaParse'``.
        *args: Positional arguments forwarded to the extractor's constructor.
        **kwargs: Keyword arguments forwarded to the extractor's constructor.

    Returns:
        An initialized instance of the chosen extractor class.

    Raises:
        ValueError: If ``extractor_type`` names no supported extractor.
    """
    # Pick the concrete class first, then instantiate it in a single place.
    if extractor_type == 'PyPDFLoader':
        extractor_cls = _PyPDFLoader
    elif extractor_type == 'LlamaParse':
        extractor_cls = _LlamaParse
    else:
        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")

    return extractor_cls(*args, **kwargs)