import os
# from langchain.document_loaders import PyPDFLoader # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
# ^ if we want to add CSV support, it will transform every row into a k:v pair
from llama_parse import LlamaParse
from typing import Union, List, Dict
from abc import ABC, abstractmethod


def _file_name(fpath: str) -> str:
    """Return the last path component of ``fpath``, used as the result key."""
    # os.path.basename honours the platform separator, unlike split('/').
    return os.path.basename(fpath)


class Extractor(ABC):
    """Common interface for the file extractors returned by ``extractor()``."""

    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
        """
        We can provide a list of files or a single file.

        Args:
            file_or_list: a single file path, or a list of file paths.
            num_workers: stored on the instance; in this file it is only
                forwarded to LlamaParse by ``_LlamaParse``.
            verbose: stored on the instance; in this file it is only
                forwarded to LlamaParse by ``_LlamaParse``.
        """
        if isinstance(file_or_list, str):
            self.filelist = [file_or_list]
        else:
            # Copy so that later changes to the caller's list do not alter
            # the set of files this extractor works on.
            self.filelist = list(file_or_list)
        self.num_workers = num_workers
        self.verbose = verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """
        Extracts text from the PDF, no processing.
        Return a dictionary, key = filename, value = list of strings, one for each page.
        """

    @abstractmethod
    def extract_images(self):
        """Extracts images from the PDF, no processing."""

    @abstractmethod
    def extract_tables(self):
        """
        Extracts tables from the PDF, no processing.
        Return in json format
        """


class _PyPDFLoader(Extractor):
    """Extractor backed by LangChain's ``PyPDFLoader``."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Return ``{file name: [page_content of each loaded document]}``."""
        output_dict = {}
        for fpath in self.filelist:
            pages = PyPDFLoader(fpath).load()
            output_dict[_file_name(fpath)] = [p.page_content for p in pages]
        return output_dict

    def extract_images(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")


class _LlamaParse(Extractor):
    """Extractor backed by the LlamaParse service."""

    def extract_text(self) -> Dict[str, List[str]]:
        """
        Parse every file with LlamaParse and return ``{file name: [page texts]}``.

        Raises:
            ValueError: if the LLAMA_PARSE_API_KEY environment variable is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")
        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",  # or "markdown"
        )
        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            docs = parser.get_json_result(fpath)
            # Layout assumed from the demo notebook above: the first result
            # holds a 'pages' list whose items carry the page 'text'.
            # An empty result yields an empty page list instead of an IndexError.
            pages = docs[0]['pages'] if docs else []
            # Keyed by file name, like the other extractors and as the
            # Extractor.extract_text contract states.
            output_dict[_file_name(fpath)] = [page['text'] for page in pages]
        return output_dict

    def extract_images(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")


class _TXTLoader(Extractor):
    """Extractor for plain-text files."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Return ``{file name: [whole file content]}``."""
        output_dict = {}
        for fpath in self.filelist:
            # The context manager closes the handle even if read() fails.
            with open(fpath, 'r') as handle:
                content = handle.read()
            # with pdfs, we use a list of strings, one for each page
            # so we must return a list here, even if it's just one string with everything
            output_dict[_file_name(fpath)] = [content]
        return output_dict

    def extract_images(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")


class _CSVLoader(Extractor):
    # mock code for now, as a reminder of what we could do if time allows TODO
    # It is deliberately not registered in the extractor() factory below.

    def extract_text(self) -> Dict[str, List[str]]:
        """Return ``{file name: [page_content of each loaded document]}``."""
        output_dict = {}
        for fpath in self.filelist:
            rows = CSVLoader(fpath).load()  # << untested!
            # Keep only the text so the value is a list of strings, as the
            # Extractor.extract_text contract requires.
            output_dict[_file_name(fpath)] = [row.page_content for row in rows]
        return output_dict

    def extract_images(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")


# Extractor classes that the factory can build, keyed by their public name.
_EXTRACTOR_TYPES = {
    'PyPDFLoader': _PyPDFLoader,
    'LlamaParse': _LlamaParse,
    'txt': _TXTLoader,
}


def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
    """
    Function factory to return the appropriate PDF extractor instance, properly initialized

    Args:
        extractor_type: one of 'PyPDFLoader', 'LlamaParse' or 'txt'.
        *args, **kwargs: passed unchanged to the extractor constructor.

    Raises:
        ValueError: if ``extractor_type`` is not one of the supported names.
    """
    extractor_cls = _EXTRACTOR_TYPES.get(extractor_type)
    if extractor_cls is None:
        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")
    return extractor_cls(*args, **kwargs)

#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug
#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev