mr / app /engine /loaders /file.py
JPBianchi's picture
endpoint only, no UI
ae92cb7
raw
history blame
5.57 kB
import os
# from langchain.document_loaders import PyPDFLoader # deprecated
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
# ^ if we want to add CSV support, it will transform every row into a k:v pair
from llama_parse import LlamaParse
from typing import Union, List, Dict
from abc import ABC, abstractmethod
class Extractor(ABC):
    """Abstract base class for the file extractors.

    Stores the list of input files together with the ``num_workers`` and
    ``verbose`` settings. Concrete subclasses must implement
    ``extract_text``, ``extract_images`` and ``extract_tables``.
    """

    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
        """Record the input file(s) and settings.

        Args:
            file_or_list: a single file given as a string, or a list of files.
                A string is wrapped in a one-element list; any other value is
                stored as-is (not copied).
            num_workers: stored on the instance for subclasses to use.
            verbose: stored on the instance for subclasses to use.
        """
        self.filelist = [file_or_list] if isinstance(file_or_list, str) else file_or_list
        self.num_workers, self.verbose = num_workers, verbose
        super().__init__()

    @abstractmethod
    def extract_text(self) -> Dict[str, List[str]]:
        """Extract the raw text of every file, without any processing.

        Returns a dictionary: key = filename, value = list of strings,
        one for each page.
        """

    @abstractmethod
    def extract_images(self):
        """Extract the images of every file, without any processing."""

    @abstractmethod
    def extract_tables(self):
        """Extract the tables of every file, without any processing.

        The result is meant to be returned in json format.
        """
class _PyPDFLoader(Extractor):
    """Text extractor that reads files through langchain's ``PyPDFLoader``."""

    def extract_text(self):
        """Load every file in ``self.filelist`` and collect its page texts.

        Returns:
            A dictionary mapping each file's base name to a list holding the
            ``page_content`` of every document returned by
            ``PyPDFLoader(fpath).load()``.

        NOTE: the key is the base name only, so two files with the same name
        in different directories overwrite each other.
        """
        # os.path.basename follows the platform's path separators, unlike a
        # manual split on '/'.
        return {
            os.path.basename(fpath): [p.page_content for p in PyPDFLoader(fpath).load()]
            for fpath in self.filelist
        }

    def extract_images(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")
class _LlamaParse(Extractor):
    """Text extractor that sends files to LlamaParse."""

    def extract_text(self):
        """Parse every file in ``self.filelist`` with LlamaParse.

        Returns:
            A dictionary mapping each file's base name to a list of strings,
            one per page, taken from the ``'text'`` field of each entry in
            the ``'pages'`` list of the first JSON result.

        Raises:
            ValueError: if the ``LLAMA_PARSE_API_KEY`` environment variable
                is not set.
        """
        # https://github.com/run-llama/llama_parse
        api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if api_key is None:
            raise ValueError("LLAMA_PARSE_API_KEY is not set.")

        parser = LlamaParse(
            api_key=api_key,
            num_workers=self.num_workers,
            verbose=self.verbose,
            language="en",
            result_type="text",  # or "markdown"
        )

        output_dict = {}
        for fpath in self.filelist:
            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
            docs = parser.get_json_result(fpath)
            # An empty result yields an empty page list instead of an
            # IndexError. NOTE(review): presumably the parser returns [] when
            # parsing fails -- confirm against the installed llama_parse version.
            pages = docs[0]['pages'] if docs else []
            # Keyed by base name, like the other extractors in this module.
            output_dict[os.path.basename(fpath)] = [page['text'] for page in pages]
        return output_dict

    def extract_images(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
class _TXTLoader(Extractor):
    """Text extractor for plain text files."""

    def extract_text(self):
        """Read every file in ``self.filelist`` as text.

        Returns:
            A dictionary mapping each file's base name to a one-element list
            holding the whole file content.
        """
        output_dict = {}
        for fpath in self.filelist:
            # The context manager closes the handle even if read() fails.
            with open(fpath, 'r') as fh:
                # with pdfs, we use a list of strings, one for each page,
                # so we must return a list here, even if it's just one string
                # with everything
                output_dict[os.path.basename(fpath)] = [fh.read()]
        return output_dict

    def extract_images(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")
class _CSVLoader(Extractor):
    """Text extractor for CSV files, built on langchain's ``CSVLoader``.

    Mock code for now, as a reminder of what we could do if time allows. TODO
    """

    def extract_text(self):
        """Load every file in ``self.filelist`` with ``CSVLoader``.

        Returns:
            A dictionary mapping each file's base name to a list of strings:
            the ``page_content`` of every document returned by
            ``CSVLoader(fpath).load()``.
        """
        # Take each document's page_content so the value is a flat list of
        # strings, as the other extractors return, rather than a nested list
        # of document objects.  << still untested!
        return {
            os.path.basename(fpath): [doc.page_content for doc in CSVLoader(fpath).load()]
            for fpath in self.filelist
        }

    def extract_images(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

    def extract_tables(self):
        """Not available for this extractor.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")
def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
    """Factory: build the extractor registered under ``extractor_type``.

    Args:
        extractor_type: one of ``'PyPDFLoader'``, ``'LlamaParse'`` or ``'txt'``.
        *args, **kwargs: forwarded unchanged to the selected class.

    Returns:
        An initialized instance of the matching extractor class.

    Raises:
        ValueError: if ``extractor_type`` matches none of the known names.
    """
    # Ordered (name, class) pairs, scanned with == so that any value --
    # including unhashable ones -- falls through to the ValueError below.
    known_types = (
        ('PyPDFLoader', _PyPDFLoader),
        ('LlamaParse', _LlamaParse),
        ('txt', _TXTLoader),
    )
    for type_name, loader_cls in known_types:
        if extractor_type == type_name:
            return loader_cls(*args, **kwargs)
    raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")
#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug
#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev