mr

Build error

App Files Files Community

mr / app /engine /loaders /file.py

JPBianchi

endpoint only, no UI

ae92cb7 5 months ago

raw

history blame

5.57 kB

	import os

	# from langchain.document_loaders import PyPDFLoader # deprecated
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders.csv_loader import CSVLoader
	# ^ if we want to add CSV support, it will transform every row into a k:v pair
	from llama_parse import LlamaParse

	from typing import Union, List, Dict

	from abc import ABC, abstractmethod

	class Extractor(ABC):
	    """Abstract interface shared by the document extractors in this module.

	    An instance remembers which file(s) to process together with two
	    settings, ``num_workers`` and ``verbose``. Concrete subclasses supply
	    the text, image and table extraction logic.
	    """

	    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
	        """Record the input file(s) and the settings.

	        Args:
	            file_or_list: either one path or a list of paths.
	            num_workers: stored as ``self.num_workers``.
	            verbose: stored as ``self.verbose``.
	        """
	        super().__init__()
	        self.verbose = verbose
	        self.num_workers = num_workers
	        # A lone path is wrapped so that ``self.filelist`` can always be
	        # iterated file by file; anything that is not a str is stored
	        # as-is (the caller's object, not a copy).
	        self.filelist = [file_or_list] if isinstance(file_or_list, str) else file_or_list

	    @abstractmethod
	    def extract_text(self) -> Dict[str, List[str]]:
	        """Return the raw, unprocessed text of every file.

	        The result is a dictionary: key = filename, value = list of
	        strings, one for each page.
	        """

	    @abstractmethod
	    def extract_images(self):
	        """Return the images found in the file(s), without processing."""

	    @abstractmethod
	    def extract_tables(self):
	        """Return the tables found in the file(s), without processing.

	        The tables are expected in json format.
	        """

	class _PyPDFLoader(Extractor):
	    """Extractor that reads PDFs through LangChain's ``PyPDFLoader``.

	    Only text extraction is implemented; the image and table methods raise
	    ``NotImplementedError``.
	    """

	    def extract_text(self):
	        """Load every file in ``self.filelist`` with ``PyPDFLoader``.

	        Returns:
	            A dict mapping each file's base name to a list holding the
	            ``page_content`` of every document returned by ``load()``.
	        """
	        # os.path.basename replaces the hand-rolled split('/') so the key is
	        # the bare file name with the platform's own separators as well.
	        # NOTE(review): two paths sharing a base name overwrite each other.
	        return {
	            os.path.basename(fpath): [p.page_content for p in PyPDFLoader(fpath).load()]
	            for fpath in self.filelist
	        }

	    def extract_images(self):
	        """Always raises: image extraction is not available here.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")

	    def extract_tables(self):
	        """Always raises: table extraction is not available here.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")


	class _LlamaParse(Extractor):
	    """Extractor that parses files with ``LlamaParse``.

	    Only text extraction is implemented; the image and table methods raise
	    ``NotImplementedError``.
	    """

	    def extract_text(self):
	        """Parse every file in ``self.filelist`` and collect the page texts.

	        Returns:
	            A dict mapping each file's base name to a list with the
	            ``'text'`` entry of every page in the parser's JSON result.

	        Raises:
	            ValueError: if the ``LLAMA_PARSE_API_KEY`` environment variable
	                is not set.
	        """
	        # https://github.com/run-llama/llama_parse
	        api_key = os.getenv("LLAMA_PARSE_API_KEY")  # read once, reused below
	        if api_key is None:
	            raise ValueError("LLAMA_PARSE_API_KEY is not set.")

	        parser = LlamaParse(
	            api_key=api_key,
	            num_workers=self.num_workers,
	            verbose=self.verbose,
	            language="en",
	            result_type="text",  # or "markdown"
	        )
	        output_dict = {}
	        for fpath in self.filelist:
	            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
	            docs = parser.get_json_result(fpath)
	            # The previous code evaluated docs[0]['pages'][0]['text'] and
	            # threw it away, then stored None. Keep every page's text
	            # instead, keyed by base name like the other extractors.
	            # NOTE(review): iterating over docs (rather than docs[0]) also
	            # yields an empty list when the parser returns nothing; confirm
	            # that is preferable to failing loudly.
	            output_dict[os.path.basename(fpath)] = [
	                page['text'] for doc in docs for page in doc['pages']
	            ]
	        return output_dict

	    def extract_images(self):
	        """Always raises: image extraction is not available here.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")

	    def extract_tables(self):
	        """Always raises: table extraction is not available here.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")

	class _TXTLoader(Extractor):
	    """Extractor for plain-text files.

	    Only text extraction is implemented; the image and table methods raise
	    ``NotImplementedError``.
	    """

	    def extract_text(self):
	        """Read every file in ``self.filelist`` in full.

	        Returns:
	            A dict mapping each file's base name to a one-element list
	            holding the whole file content.
	        """
	        output_dict = {}
	        for fpath in self.filelist:
	            # The context manager closes the handle; the previous bare
	            # open(...).read() left that to the garbage collector.
	            # NOTE(review): the platform default encoding is kept as
	            # before; consider an explicit encoding if inputs are UTF-8.
	            with open(fpath, 'r') as handle:
	                content = handle.read()
	            # with pdfs, we use a list of strings, one for each page
	            # so we must return a list here, even if it's just one string with everything
	            output_dict[os.path.basename(fpath)] = [content]
	        return output_dict

	    def extract_images(self):
	        """Always raises: text files carry no images.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        # Message previously named PyPDFLoader (copy-paste slip).
	        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

	    def extract_tables(self):
	        """Always raises: table extraction is not available here.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        # Message previously named PyPDFLoader (copy-paste slip).
	        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")

	class _CSVLoader(Extractor):
	    """Extractor for CSV files built on LangChain's ``CSVLoader``.

	    mock code for now, as a reminder of what we could do if time allows TODO
	    """

	    def extract_text(self):
	        """Load every file in ``self.filelist`` with ``CSVLoader``.

	        Returns:
	            A dict mapping each file's base name to a list with the
	            ``page_content`` of every document returned by ``load()``.
	        """
	        output_dict = {}
	        for fpath in self.filelist:
	            # Store strings rather than a list wrapping the loaded
	            # documents, so the value is a list of str like the other
	            # extractors return.  << still untested!
	            output_dict[os.path.basename(fpath)] = [
	                doc.page_content for doc in CSVLoader(fpath).load()
	            ]
	        return output_dict

	    def extract_images(self):
	        """Always raises: CSV files carry no images.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

	    def extract_tables(self):
	        """Always raises: table extraction is not available here.

	        Raises:
	            NotImplementedError: unconditionally.
	        """
	        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")

	def extractor(extractor_type: str, *args, **kwargs) -> Extractor:
	    """Function factory returning the matching extractor, already initialized.

	    Args:
	        extractor_type: one of ``'PyPDFLoader'``, ``'LlamaParse'`` or ``'txt'``.
	        *args: positional arguments forwarded to the extractor's constructor
	            (the file or list of files comes first).
	        **kwargs: keyword arguments forwarded to the constructor, e.g.
	            ``num_workers`` or ``verbose``.

	    Returns:
	        An instance of the extractor class selected by ``extractor_type``.

	    Raises:
	        ValueError: if ``extractor_type`` is not one of the supported names.
	    """
	    # The signature used to be (args, *kwargs), which rejected keyword
	    # arguments altogether; *args/**kwargs forwards both kinds unchanged.
	    if extractor_type == 'PyPDFLoader':
	        return _PyPDFLoader(*args, **kwargs)

	    if extractor_type == 'LlamaParse':
	        return _LlamaParse(*args, **kwargs)

	    if extractor_type == 'txt':
	        return _TXTLoader(*args, **kwargs)

	    raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")



	#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug
	#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev