File size: 5,570 Bytes
10d6a86
 
 
 
 
ae92cb7
 
10d6a86
 
 
 
 
 
ae92cb7
10d6a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae92cb7
10d6a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae92cb7
10d6a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae92cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10d6a86
ae92cb7
 
 
 
 
 
 
 
 
 
 
 
10d6a86
 
 
 
 
 
ae92cb7
 
 
 
10d6a86
 
 
 
 
ae92cb7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os

# from langchain.document_loaders import PyPDFLoader  # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
# ^ if we want to add CSV support, it will transform every row into a k:v pair
from llama_parse import LlamaParse  

from typing import Union, List, Dict

from abc import ABC, abstractmethod

class Extractor(ABC):
    """Abstract interface for extracting content from one or more files.

    Concrete subclasses supply the backend-specific ``extract_*`` methods.
    """

    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
        """Record the input path(s) and the extraction options.

        A single path string is wrapped into a one-element list; any other
        value is stored as given (the caller's list is not copied).
        """
        self.filelist = [file_or_list] if isinstance(file_or_list, str) else file_or_list
        self.verbose = verbose
        self.num_workers = num_workers
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """Return the raw, unprocessed text of every input file.

        The result maps each filename to a list of strings, one per page.
        """

    @abstractmethod
    def extract_images(self):
        """Return the raw, unprocessed images found in the input files."""

    @abstractmethod
    def extract_tables(self):
        """Return the raw, unprocessed tables of the input files, in json format."""

class _PyPDFLoader(Extractor):
    """Text extractor backed by LangChain's ``PyPDFLoader``.

    Only text extraction is implemented; image and table extraction raise
    ``NotImplementedError``.
    """

    def extract_text(self) -> Dict[str, List[str]]:
        """Load every file in ``self.filelist`` and collect its page texts.

        Returns:
            A dict keyed by the file's base name; each value is the list of
            ``page_content`` strings of the documents returned by
            ``PyPDFLoader(fpath).load()``.

        Note:
            Two input paths sharing the same base name map to the same key,
            so the later one overwrites the earlier one.
        """
        output_dict = {}
        for fpath in self.filelist:
            # os.path.basename honours the platform's path separators,
            # unlike a hard-coded split on '/'.
            fname = os.path.basename(fpath)
            output_dict[fname] = [page.page_content for page in PyPDFLoader(fpath).load()]
        return output_dict

    def extract_images(self):
        """Not supported by this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Not supported by this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")


class _LlamaParse(Extractor):
    """Text extractor backed by the LlamaParse service.

    Only text extraction is implemented; image and table extraction raise
    ``NotImplementedError``.
    """

    def extract_text(self) -> Dict[str, List[str]]:
        """Parse every file in ``self.filelist`` with LlamaParse.

        Returns:
            A dict keyed by the file's base name (consistent with the other
            extractors in this module); each value is the list of per-page
            ``text`` strings taken from the parser's JSON result.

        Raises:
            ValueError: if the ``LLAMA_PARSE_API_KEY`` environment variable
                is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")

        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",  # or "markdown"
        )
        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            docs = parser.get_json_result(fpath)
            # The first result entry holds the 'pages' list for this file.
            # NOTE(review): an empty result is treated as "no pages" rather
            # than an error -- confirm this matches how callers want parse
            # failures reported.
            pages = docs[0]['pages'] if docs else []
            output_dict[os.path.basename(fpath)] = [page['text'] for page in pages]
        return output_dict

    def extract_images(self):
        """Not supported by this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Not supported by this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")

class _TXTLoader(Extractor):
    """Extractor for plain-text files.

    Only text extraction is implemented; image and table extraction raise
    ``NotImplementedError``.
    """

    def extract_text(self) -> Dict[str, List[str]]:
        """Read every file in ``self.filelist`` as text.

        Returns:
            A dict keyed by the file's base name; each value is a
            one-element list holding the whole file content.
        """
        output_dict = {}
        for fpath in self.filelist:
            fname = os.path.basename(fpath)
            # The context manager closes the handle deterministically instead
            # of leaving it to garbage collection. The platform default
            # encoding is kept, as before.
            with open(fpath, 'r') as handle:
                # with pdfs, we use a list of strings, one for each page
                # so we must return a list here, even if it's just one string with everything
                output_dict[fname] = [handle.read()]
        return output_dict

    def extract_images(self):
        """Not supported for text files; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

    def extract_tables(self):
        """Not supported for text files; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")

class _CSVLoader(Extractor):
    """Extractor for CSV files backed by LangChain's ``CSVLoader``.

    mock code for now, as a reminder of what we could do if time allows TODO
    """

    def extract_text(self) -> Dict[str, List[str]]:
        """Load every file in ``self.filelist`` with ``CSVLoader``.

        Returns:
            A dict keyed by the file's base name; each value is the list of
            ``page_content`` strings of the documents returned by
            ``CSVLoader(fpath).load()``, so the value is a flat list of
            strings like the other extractors return.
        """
        output_dict = {}
        for fpath in self.filelist:
            fname = os.path.basename(fpath)
            # Unwrap the Document objects into plain strings rather than
            # nesting the Document list inside another list.  << untested!
            output_dict[fname] = [doc.page_content for doc in CSVLoader(fpath).load()]
        return output_dict

    def extract_images(self):
        """Not supported for CSV files; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

    def extract_tables(self):
        """Not supported for CSV files; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")

def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
    """ Function factory to return the appropriate PDF extractor instance, properly initialized """
    
    if extractor_type == 'PyPDFLoader':
        return _PyPDFLoader(*args, **kwargs)
    
    elif extractor_type == 'LlamaParse':
        return _LlamaParse(*args, **kwargs)
    
    elif extractor_type == 'txt':
        return _TXTLoader(*args, **kwargs)
    
    else:
        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")



#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug 
#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev