File size: 5,570 Bytes
10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 10d6a86 ae92cb7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import os
# from langchain.document_loaders import PyPDFLoader # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
# ^ if we want to add CSV support, it will transform every row into a k:v pair
from llama_parse import LlamaParse
from typing import Union, List, Dict
from abc import ABC, abstractmethod
class Extractor(ABC):
    """ Abstract interface shared by all document extractors.

    An extractor is built from one or more file paths and exposes three
    operations (text, images, tables) that concrete subclasses must implement.
    """

    def __init__(self, file_or_list: Union[str, os.PathLike, List[Union[str, os.PathLike]]], num_workers: int = 1, verbose: bool = False):
        """ We can provide a list of files or a single file.

        Args:
            file_or_list: a single path, or a list of paths. Paths may be
                strings or ``os.PathLike`` objects (e.g. ``pathlib.Path``).
            num_workers: stored on the instance for subclasses to use.
            verbose: stored on the instance for subclasses to use.
        """
        # A lone path (str or PathLike) is wrapped so the rest of the code
        # can always iterate over ``self.filelist``.
        if isinstance(file_or_list, (str, os.PathLike)):
            file_or_list = [file_or_list]
        # Build a new list so later mutation of the caller's list does not
        # change this extractor, and convert PathLike entries to plain strings.
        self.filelist = [os.fspath(path) for path in file_or_list]
        self.num_workers = num_workers
        self.verbose = verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """ Extracts text from the PDF, no processing.

        Return a dictionary, key = filename, value = list of strings, one for each page.
        """
        pass

    @abstractmethod
    def extract_images(self):
        """ Extracts images from the PDF, no processing. """
        pass

    @abstractmethod
    def extract_tables(self):
        """ Extracts tables from the PDF, no processing.

        Return in json format
        """
        pass
class _PyPDFLoader(Extractor):
    """ Extractor backed by langchain's PyPDFLoader (text only). """

    def extract_text(self):
        """ Load every file in ``self.filelist`` with PyPDFLoader.

        Returns:
            dict mapping each file's base name to the list of ``page_content``
            strings of the documents returned by ``PyPDFLoader(...).load()``.
            Files sharing a base name overwrite each other (last one wins).
        """
        output_dict = {}
        for fpath in self.filelist:
            # basename instead of split('/') so the key is the bare file name
            # whatever path separator the platform uses.
            fname = os.path.basename(fpath)
            output_dict[fname] = [p.page_content for p in PyPDFLoader(fpath).load()]
        return output_dict

    def extract_images(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")
class _LlamaParse(Extractor):
    """ Extractor backed by the LlamaParse service (text only). """

    def extract_text(self):
        """ Parse every file in ``self.filelist`` with LlamaParse.

        Returns:
            dict mapping each file's base name to a list with the ``'text'``
            field of every page in the parser's JSON result. If the parser
            returns no result for a file, that file maps to an empty list.

        Raises:
            ValueError: if the LLAMA_PARSE_API_KEY environment variable is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")
        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",  # or "markdown"
        )
        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            docs = parser.get_json_result(fpath)
            # Key by base name and store one string per page, matching the
            # contract of Extractor.extract_text and the other extractors.
            # (The previous code discarded the page text and stored None.)
            fname = os.path.basename(fpath)
            output_dict[fname] = [page['text'] for doc in docs for page in doc['pages']]
        return output_dict

    def extract_images(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
class _TXTLoader(Extractor):
    """ Extractor for plain text files (text only). """

    def extract_text(self):
        """ Read every file in ``self.filelist`` as text.

        Returns:
            dict mapping each file's base name to a one-element list holding
            the whole file content.
        """
        output_dict = {}
        for fpath in self.filelist:
            fname = os.path.basename(fpath)
            # Context manager so the file handle is closed deterministically.
            with open(fpath, 'r') as fh:
                content = fh.read()
            # with pdfs, we use a list of strings, one for each page
            # so we must return a list here, even if it's just one string with everything
            output_dict[fname] = [content]
        return output_dict

    def extract_images(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

    def extract_tables(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")
class _CSVLoader(Extractor):
    """ Extractor for CSV files backed by langchain's CSVLoader (text only). """
    # mock code for now, as a reminder of what we could do if time allows TODO

    def extract_text(self):
        """ Load every file in ``self.filelist`` with CSVLoader.

        Returns:
            dict mapping each file's base name to the list of ``page_content``
            strings of the documents returned by ``CSVLoader(...).load()``.
        """
        output_dict = {}
        for fpath in self.filelist:
            fname = os.path.basename(fpath)
            # Unwrap the loaded documents into plain strings so the value is a
            # flat list of str like the other extractors.  << untested!
            output_dict[fname] = [doc.page_content for doc in CSVLoader(fpath).load()]
        return output_dict

    def extract_images(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

    def extract_tables(self):
        """ Not available for this extractor; always raises NotImplementedError. """
        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")
def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
    """ Factory: build the extractor registered under ``extractor_type``.

    Remaining positional and keyword arguments are forwarded unchanged to the
    selected extractor's constructor.

    Raises:
        ValueError: if ``extractor_type`` is not one of the supported names.
    """
    supported = (
        ('PyPDFLoader', _PyPDFLoader),
        ('LlamaParse', _LlamaParse),
        ('txt', _TXTLoader),
    )
    for type_name, extractor_cls in supported:
        if extractor_type == type_name:
            return extractor_cls(*args, **kwargs)
    raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")
#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug
#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev |