mr

Build error

File size: 5,570 Bytes

import os

# from langchain.document_loaders import PyPDFLoader  # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
# ^ if we want to add CSV support, it will transform every row into a k:v pair
from llama_parse import LlamaParse  

from typing import Union, List, Dict

from abc import ABC, abstractmethod

class Extractor(ABC):
    """Abstract interface shared by all file extractors in this module.

    The base class only records the constructor arguments; concrete
    subclasses provide the three ``extract_*`` methods.
    """

    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
        """Remember which file(s) to process and the extraction options.

        Args:
            file_or_list: Either one path or a list of paths. A lone string
                is wrapped in a one-element list; any other value is stored
                as given (not copied).
            num_workers: Stored on ``self.num_workers`` for subclasses to use.
            verbose: Stored on ``self.verbose`` for subclasses to use.
        """
        is_single_path = isinstance(file_or_list, str)
        self.filelist = [file_or_list] if is_single_path else file_or_list
        self.num_workers = num_workers
        self.verbose = verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """Extract the raw text of every file, without any processing.

        Returns:
            A dictionary whose key is the filename and whose value is a
            list of strings, one for each page.
        """

    @abstractmethod
    def extract_images(self):
        """Extract the images of every file, without any processing."""

    @abstractmethod
    def extract_tables(self):
        """Extract the tables of every file, without any processing.

        The result is expected in json format.
        """

class _PyPDFLoader(Extractor):
    """Extractor that reads PDFs with langchain's ``PyPDFLoader`` (text only)."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Load every file in ``self.filelist`` with ``PyPDFLoader``.

        Returns:
            A dictionary mapping each file's base name to the list of
            ``page_content`` strings of the documents the loader returned.
            Two paths sharing the same base name overwrite each other.
        """
        return {
            # os.path.basename also copes with the platform's native path
            # separator, unlike splitting on '/'.
            os.path.basename(fpath): [doc.page_content for doc in PyPDFLoader(fpath).load()]
            for fpath in self.filelist
        }

    def extract_images(self):
        """Image extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Table extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")


class _LlamaParse(Extractor):
    """Extractor that sends files to the LlamaParse service (text only)."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Parse every file in ``self.filelist`` with LlamaParse.

        The API key is read from the ``LLAMA_PARSE_API_KEY`` environment
        variable; ``self.num_workers`` and ``self.verbose`` are forwarded
        to the parser.

        Returns:
            A dictionary mapping each file's base name to a list with the
            ``'text'`` field of every page in the parser's JSON result.
            A file for which the parser returns no result maps to an
            empty list.

        Raises:
            ValueError: if ``LLAMA_PARSE_API_KEY`` is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")

        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",  # or "markdown"
        )
        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            # The JSON result is a list of entries, each holding a 'pages'
            # list whose items carry the page text under 'text'.
            docs = parser.get_json_result(fpath)
            # Keyed by base name, like the other extractors in this module.
            # Iterating over `docs` (instead of indexing docs[0]) keeps an
            # empty result from raising IndexError.
            output_dict[os.path.basename(fpath)] = [
                page['text'] for doc in docs for page in doc['pages']
            ]
        return output_dict

    def extract_images(self):
        """Image extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Table extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")

class _TXTLoader(Extractor):
    """Extractor for plain-text files (text only)."""

    def extract_text(self) -> Dict[str, List[str]]:
        """Read every file in ``self.filelist`` as text.

        Returns:
            A dictionary mapping each file's base name to a one-element
            list holding the whole file content.
        """
        output_dict = {}
        for fpath in self.filelist:
            # Context manager so the handle is closed even if read() fails.
            # NOTE: the platform default text encoding is used.
            with open(fpath, 'r') as handle:
                content = handle.read()
            # with pdfs, we use a list of strings, one for each page
            # so we must return a list here, even if it's just one string with everything
            output_dict[os.path.basename(fpath)] = [content]
        return output_dict

    def extract_images(self):
        """Image extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

    def extract_tables(self):
        """Table extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")

class _CSVLoader(Extractor):
    """Extractor that reads CSV files with langchain's ``CSVLoader`` (text only)."""

    # mock code for now, as a reminder of what we could do if time allows TODO
    def extract_text(self) -> Dict[str, List[str]]:
        """Load every file in ``self.filelist`` with ``CSVLoader``.

        Returns:
            A dictionary mapping each file's base name to the list of
            ``page_content`` strings of the documents the loader returned
            (presumably one per CSV row - this path is still untested).
        """
        output_dict = {}
        for fpath in self.filelist:
            # Unwrap the loaded documents into plain strings so the value
            # is a list of strings, as for the other extractors.
            output_dict[os.path.basename(fpath)] = [
                doc.page_content for doc in CSVLoader(fpath).load()
            ]  # <<  untested!
        return output_dict

    def extract_images(self):
        """Image extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

    def extract_tables(self):
        """Table extraction is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")

def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
    """Create the extractor selected by ``extractor_type``.

    Args:
        extractor_type: One of ``'PyPDFLoader'``, ``'LlamaParse'`` or ``'txt'``.
        *args: Positional arguments passed through to the extractor constructor.
        **kwargs: Keyword arguments passed through to the extractor constructor.

    Returns:
        An initialized instance of the matching extractor class.

    Raises:
        ValueError: if ``extractor_type`` is none of the supported names.
    """
    # Pick the class first, then build it in a single place.
    if extractor_type == 'PyPDFLoader':
        chosen_cls = _PyPDFLoader
    elif extractor_type == 'LlamaParse':
        chosen_cls = _LlamaParse
    elif extractor_type == 'txt':
        chosen_cls = _TXTLoader
    else:
        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")

    return chosen_cls(*args, **kwargs)



#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug 
#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev