Here, we test various parsers and approaches

## Dependencies

In [1]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings 
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceEndpoint
from pathlib import Path
from PyPDF2 import PdfReader
from dotenv import load_dotenv, find_dotenv

from rich import print
from rich.pretty import pprint

from llama_parse import LlamaParse  


load_dotenv(find_dotenv('env'))

True

In [47]:
pdf1 = "data/test.pdf"  # a manually created simple pdf with a few words on 2 pages
pdf2 = "data/AMZN_Moodys_CreditRating_2023_p1.pdf" # first page, with the very difficult to read Exhibit 1 with a bar chart
pdf3 = "../../assignment_data/AMZN_Moodys_CreditRating_2023.pdf"

## PYPDFLOADER

In [4]:
# Load PDF document and create doc splits
def load_doc(list_file_path, chunk_size, chunk_overlap):
    # Processing for one document only
    # loader = PyPDFLoader(file_path)
    # pages = loader.load()
    loaders = [PyPDFLoader(x) for x in list_file_path]
    pages = []
    for loader in loaders:
        pages.extend(loader.load())
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap)
    doc_splits = text_splitter.split_documents(pages)
    return doc_splits
splits = load_doc([pdf1], 600, 50)

print(splits)

[Document(page_content='���������������������������������', metadata={'source': 'data/test.pdf', 'page': 0})]


This is very bad since it's a very simple pdf with a few words, no images... but it works well on pdf2.

In [44]:
load_doc([pdf2], 600, 50)

[Document(page_content='CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdate\nRATINGS\nAmazon.com, Inc.\nDomicile Seattle, Washington,\nUnited States\nLong Term Rating A1\nType Senior Unsecured -\nDom Curr\nOutlook Stable\nPlease see the ratings section  at the end of this report\nfor more information. The ratings and outlook shown\nreflect information as of the publication date.\nContacts\nChristina Boni +1.212.553.0514\nSenior Vice President\nchristina.boni@moodys.com\nJack Myers +1.212.553.5116\nAssociate Analyst\njack.myers@moodys.com\nMargaret Taylor +1.212.553.0424\nAssociate Managing Director\nmargaret.taylor@moodys.comAmazon.com, Inc.', metadata={'source': 'data/AMZN_Moodys_CreditRating_2023_p1.pdf', 'page': 0}),
 Document(page_content="margaret.taylor@moodys.comAmazon.com, Inc.\nUpdate to credit analysis\nSummary\nAmazon.com, Inc. 's (A1/Prime-1 stable) credit profile reflects its powerful global brand, which\nis synonymous with online retail, as well as the strength and profitabil

In [46]:
loaders = [PyPDFLoader(x) for x in [pdf2]]
pages = []
for loader in loaders:
    pages.extend(loader.load())
pprint(pages)

In [48]:
loaders = [PyPDFLoader(x) for x in [pdf3]]
pages = []
for loader in loaders:
    pages.extend(loader.load())
pprint(pages)

In [49]:
pages[0].page_content

"CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdate\nRATINGS\nAmazon.com, Inc.\nDomicile Seattle, Washington,\nUnited States\nLong Term Rating A1\nType Senior Unsecured -\nDom Curr\nOutlook Stable\nPlease see the ratings section  at the end of this report\nfor more information. The ratings and outlook shown\nreflect information as of the publication date.\nContacts\nChristina Boni +1.212.553.0514\nSenior Vice President\nchristina.boni@moodys.com\nJack Myers +1.212.553.5116\nAssociate Analyst\njack.myers@moodys.com\nMargaret Taylor +1.212.553.0424\nAssociate Managing Director\nmargaret.taylor@moodys.comAmazon.com, Inc.\nUpdate to credit analysis\nSummary\nAmazon.com, Inc. 's (A1/Prime-1 stable) credit profile reflects its powerful global brand, which\nis synonymous with online retail, as well as the strength and profitability of Amazon Web\nServices (“AWS”), the market leader in the cloud computing market. The company is reliant\non the operating income derived from AWS, as its non-AWS pro

## PyPDF2

In [15]:
reader = PdfReader(pdf1)
page = reader.pages[0]
print(page.extract_text())

In [16]:
# https://pypdf2.readthedocs.io/en/3.x/user/extract-text.html
reader = PdfReader(pdf1)
number_of_pages = len(reader.pages)
print(f"Number of pages: {number_of_pages}")
page = reader.pages[0]
text = page.extract_text()
print(text)

In [17]:
# extract the images
reader = PdfReader(pdf2)
page = reader.pages[0]
count = 0

for image_file_object in page.images:
    with open(str(count) + image_file_object.name, "wb") as fp:
        fp.write(image_file_object.data)
        count += 1

OSError: cannot write mode PA as PNG

Total disaster with the library's own code.  PyPDF2 is not just a parser, it can merge etc so maybe we need something more specialized at parsing.

## LLAMA_PARSE

In [5]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()
#%%
# https://github.com/run-llama/llama_parse
parser = LlamaParse(
    api_key = os.getenv("LLAMA_PARSE_API_KEY"),
    num_workers=4,
    verbose=True,
    language="en",
    result_type="text"  # "markdown" and "text" are available
)
#%%
documents = parser.load_data(pdf1)
# documents = parser.load_data([pdf1, pdf2])   # sync batch

# documents = await parser.aload_data(pdf1)  # async
# documents = await parser.aload_data([pdf1, pdf2]) # async batch
print(documents)

Started parsing the file under job_id a3a95967-3301-4140-9c89-e1f5d4f0677d
[Document(id_='3f10f65a-c636-4df9-83b1-1ffb0878bb70', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='A very simple pdf file\n\n\nsecond line\n---\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]


In [8]:
# https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
docs = parser.get_json_result(pdf1)
docs

Started parsing the file under job_id f34a3133-b632-4ddd-8035-eacbe2678796


[{'pages': [{'page': 1,
    'text': 'A very simple pdf file\n\n\nsecond line',
    'md': 'A very simple pdf file\n\nsecond line',
    'images': [],
    'items': [{'type': 'text',
      'value': 'A very simple pdf file\n\nsecond line',
      'md': 'A very simple pdf file\n\nsecond line'}]},
   {'page': 2,
    'text': 'text on 2nd page\n\n\nurl',
    'md': 'text on 2nd page\n\n\nurl',
    'images': [],
    'items': [{'type': 'text',
      'value': 'text on 2nd page\n\n\nurl',
      'md': 'text on 2nd page\n\n\nurl'}]}],
  'job_id': 'f34a3133-b632-4ddd-8035-eacbe2678796',
  'file_path': 'data/test.pdf'}]

In [9]:
docs[0]['pages'][0]['text']

'A very simple pdf file\n\n\nsecond line'

In [10]:
docs[0]['pages'][1]['text']

'text on 2nd page\n\n\nurl'

In [11]:
docs2 = parser.get_json_result(pdf2)
docs2[0]['pages'][0]['text']

Started parsing the file under job_id 4e1922f5-566b-4bf7-8684-da7f4a977e37


"                                                                                                                                                                                          CORPORATES\n\n\n                CREDIT OPINION                                            Amazon.com, Inc.\n                23 May 2023\nMoody's Adj. Operating Income (USD Millions)                              Update to credit analysis\n                  Update                                                  Summary\n                                                                          Amazon.com, Inc.'s (A1/Prime-1 stable) credit profile reflects its powerful global brand, which\n                           Send Your Feedback                             is synonymous with online retail, as well as the strength and profitability of Amazon Web\n                                                                          Services (“AWS”), the market leader in the cloud computing market. The company is 

In [13]:
pprint(docs2[0])

Quite good but it can't read the bar graph in Exhibit1

In [None]:
# one can use SimpleDirectoryReader to load files from a directory
from llama_index.core import SimpleDirectoryReader  # pip install llama-index

file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader("./data", file_extractor=file_extractor)
documents = reader.load_data()

## PYMUPDF

In [23]:
# https://pymupdf.readthedocs.io/en/latest/rag.html
from llama_index.readers.file import PyMuPDFReader
loader = PyMuPDFReader()
documents = loader.load(file_path=pdf1)
pprint(documents)

In [24]:
documents = loader.load(file_path=pdf2)
pprint(documents)

Not bad but it reads the axis values of Exhibit1, and not the bar graph itself

## PDFPLUMBER

In [50]:
from langchain.document_loaders.parsers.pdf import PDFPlumberParser


# look into the library, plenty of options and other libraries 

# Initialize the parser
parser = PDFPlumberParser()

# Load your PDF data
data = parser.load(pdf1)

# Now you can process the data
processed_data = parser.process(data)
pprint(processed_data)

AttributeError: 'PDFPlumberParser' object has no attribute 'load'

In [36]:
import pdfplumber

In [43]:
with pdfplumber.open(pdf2) as pdf:
    first_page = pdf.pages[0]
    pprint(first_page.lines)

## LLAVA
Partition PDF tables, text, and images
LLaVA Paper: https://arxiv.org/pdf/2304.08485.pdf

In [35]:
# pip install "pip install unstructured[all-docs]"
from unstructured.partition.pdf import partition_pdf

# Path to save images
path = "data"

# Get elements
raw_pdf_elements = partition_pdf(
    filename=pdf1,
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)
pprint(raw_pdf_elements)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'sort_page_elements' is not defined

Impossible to make it work, it's error 'something is missing', I brew/pip install it, then a new error, 10 times over.  I pip installed unstructured[all-docs] as requested in the article, and yet there's sthg missing ... I give up on this.

## TABULA

For tables... https://github.com/tabulapdf/tabula

## CHARTS

https://www.researchgate.net/publication/372616217_Automatic_Chart_Understanding_a_Review

https://ieeexplore.ieee.org/document/9599112