Let's test our chunking, embedding and vectorization here

In [13]:

# import polars as pl
import pandas as pd
import pickle
from rich.pretty import pprint
import tiktoken  # tokenizer library for use with OpenAI LLMs 
import os, sys
from torch import set_num_threads
import torch
from tqdm import tqdm
# from preprocessing import FileIO

In [14]:
set_num_threads(5)
# it must be called before any model loading or inference operations begin or parallelism will be refused later

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create tensors on GPU if available
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [16]:
datadir = "../../data"

In [17]:
# from llama_index.text_splitter import SentenceSplitter doesn't work anymore
from llama_index.legacy.text_splitter import SentenceSplitter

In [8]:
from llama_index.core.node_parser import SentenceSplitter

In [1]:
from IPython.display import IFrame
IFrame(src="https://www.sbert.net/_static/html/models_en_sentence_embeddings.html", width='100%', height=600)

In [18]:
chunk_overlap = 20

#instantiate tokenizer for use with ChatGPT-3.5-Turbo
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
# tokens = encoding.encode_batch(contents)
splitter = SentenceSplitter(chunk_size=256, # limit for 'all-mpnet-base-v2'
                            tokenizer=encoding.encode, 
                            chunk_overlap=chunk_overlap)
pprint(splitter)

In [None]:
sys.path.append(os.path.join(os.curdir, '..'))
from settings import pdf_content
pdf_content = os.path.join('../', pdf_content)
contents = pickle.load(open(pdf_content, 'rb'))
contents

{'AMZN_Moodys_CreditRating_2023.pdf': ["CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdate\nRATINGS\nAmazon.com, Inc.\nDomicile Seattle, Washington,\nUnited States\nLong Term Rating A1\nType Senior Unsecured -\nDom Curr\nOutlook Stable\nPlease see the ratings section  at the end of this report\nfor more information. The ratings and outlook shown\nreflect information as of the publication date.\nContacts\nChristina Boni +1.212.553.0514\nSenior Vice President\nchristina.boni@moodys.com\nJack Myers +1.212.553.5116\nAssociate Analyst\njack.myers@moodys.com\nMargaret Taylor +1.212.553.0424\nAssociate Managing Director\nmargaret.taylor@moodys.comAmazon.com, Inc.\nUpdate to credit analysis\nSummary\nAmazon.com, Inc. 's (A1/Prime-1 stable) credit profile reflects its powerful global brand, which\nis synonymous with online retail, as well as the strength and profitability of Amazon Web\nServices (“AWS”), the market leader in the cloud computing market. The company is reliant\non the operating inco

In [14]:
pages = contents['AMZN_Moodys_CreditRating_2023.pdf']

In [24]:
splits=splitter.split_text(pages[0])
pprint(splits)

In [8]:
# now let's create the splits for every document
contents_splits = {}
for fname, content in contents.items():
    splits = [splitter.split_text(page) for page in content]
    contents_splits[fname] = [split for sublist in splits for split in sublist]

In [9]:
contents_splits

{'AMZN_Moodys_CreditRating_2023.pdf': ["CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdate\nRATINGS\nAmazon.com, Inc.\nDomicile Seattle, Washington,\nUnited States\nLong Term Rating A1\nType Senior Unsecured -\nDom Curr\nOutlook Stable\nPlease see the ratings section  at the end of this report\nfor more information. The ratings and outlook shown\nreflect information as of the publication date.\nContacts\nChristina Boni +1.212.553.0514\nSenior Vice President\nchristina.boni@moodys.com\nJack Myers +1.212.553.5116\nAssociate Analyst\njack.myers@moodys.com\nMargaret Taylor +1.212.553.0424\nAssociate Managing Director\nmargaret.taylor@moodys.comAmazon.com, Inc.\nUpdate to credit analysis\nSummary\nAmazon.com, Inc. 's (A1/Prime-1 stable) credit profile reflects its powerful global brand, which\nis synonymous with online retail, as well as the strength and profitability of Amazon Web\nServices (“AWS”), the market leader in the cloud computing market.",
  "The company is reliant\non the operating

In [10]:
len(contents_splits['AMZN_Moodys_CreditRating_2023.pdf'])

41

In [11]:
contents_splits['AMZN_Moodys_CreditRating_2023.pdf'][2]

"Exhibit 1\nAmazon's debt has continued to rise as operating income remains below 2019\n$0$20,000$40,000$60,000$80,000$100,000$120,000$140,000$160,000$180,000\n$0$5,000$10,000$15,000$20,000$25,000$30,000\n2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 Q1 -23 LTM\nMoody's Adjusted Debt (USD Millions)Moody's Adj. Operating Income (USD Millions)Moody's adjusted operating income Moody's adjusted debt\nDebt includes lease\nSource: Moody’s Financial Metrics™"

In [12]:
from sentence_transformers import SentenceTransformer
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/all-mpnet-base-v2'  #  models/bge-base-en-v1.5-finetuned-300" # @param ["sentence-transformers/all-MiniLM-L6-v2", "BAAI/bge-base-en-v1.5", "BAAI/bge-large-en-v1.5", "models/bge-base-en-v1.5-finetuned-300"]
model = SentenceTransformer(model_name)
model

  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [23]:
model.save('models/all-mpnet-base-v2')

In [40]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
pool = model.start_multi_process_pool()

content_embeddings = {}
for fname, splits in contents_splits.items():
    content_embeddings[fname] = model.encode_multi_process(tqdm(splits), pool)

model.stop_multi_process_pool(pool)
pprint(content_embeddings, max_length = 3)

100%|██████████| 10/10 [00:00<00:00, 2275.31it/s]


In [69]:
content_embeddings = {}
for fname, splits in contents_splits.items():
    content_embeddings[fname] = [(split, model.encode(split)) for split in tqdm(splits)]

100%|██████████| 41/41 [00:07<00:00,  5.77it/s]


In [75]:
content_embeddings['AMZN_Moodys_CreditRating_2023.pdf'][2]

("Exhibit 1\nAmazon's debt has continued to rise as operating income remains below 2019\n$0$20,000$40,000$60,000$80,000$100,000$120,000$140,000$160,000$180,000\n$0$5,000$10,000$15,000$20,000$25,000$30,000\n2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 Q1 -23 LTM\nMoody's Adjusted Debt (USD Millions)Moody's Adj. Operating Income (USD Millions)Moody's adjusted operating income Moody's adjusted debt\nDebt includes lease\nSource: Moody’s Financial Metrics™",
 array([-1.30822426e-02,  6.86794445e-02, -2.04694420e-02, -1.38254010e-03,
         3.70225944e-02,  3.41216698e-02,  2.21263207e-02, -7.08647957e-03,
        -1.05572930e-02, -6.59325495e-02, -1.74689535e-02,  6.06168024e-02,
         2.91264486e-02, -5.47851063e-03, -2.59431135e-02,  2.88099833e-02,
        -2.17851647e-03,  3.94609245e-03, -2.27084663e-02,  6.04690704e-03,
        -1.23097841e-02,  3.90404537e-02, -3.76035273e-02, -8.56765434e-02,
        -1.37383323e-02, -4.62083854e-02,  1.65111329e-02, -3.10253035e-02,
     

In [28]:
# sys.path.append(os.path.join(os.curdir, '..'))
from engine.fileIO import FileIO
io = FileIO()

In [83]:
text_vector_tuples = [(fname, split, emb.tolist()) for fname, splits_emb in content_embeddings.items() for split, emb in splits_emb]

pfile = os.path.join(datadir, 'text_vectors.parquet')
text_vector_tuples[2]

('AMZN_Moodys_CreditRating_2023.pdf',
 "Exhibit 1\nAmazon's debt has continued to rise as operating income remains below 2019\n$0$20,000$40,000$60,000$80,000$100,000$120,000$140,000$160,000$180,000\n$0$5,000$10,000$15,000$20,000$25,000$30,000\n2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 Q1 -23 LTM\nMoody's Adjusted Debt (USD Millions)Moody's Adj. Operating Income (USD Millions)Moody's adjusted operating income Moody's adjusted debt\nDebt includes lease\nSource: Moody’s Financial Metrics™",
 [-0.01308224257081747,
  0.06867944449186325,
  -0.020469442009925842,
  -0.0013825400965288281,
  0.03702259436249733,
  0.034121669828891754,
  0.022126320749521255,
  -0.0070864795707166195,
  -0.010557292960584164,
  -0.06593254953622818,
  -0.017468953505158424,
  0.06061680242419243,
  0.029126448556780815,
  -0.005478510633111,
  -0.02594311349093914,
  0.028809983283281326,
  -0.002178516471758485,
  0.0039460924454033375,
  -0.02270846627652645,
  0.006046907044947147,
  -0.0123097840

In [93]:
pd.DataFrame(text_vector_tuples, columns=['file', 'content', 'content_embedding']).to_parquet(pfile, index=False)

In [94]:
pd.read_parquet(pfile)

Unnamed: 0,file,content,content_embedding
0,AMZN_Moodys_CreditRating_2023.pdf,CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdat...,"[-0.009635023772716522, 0.01748381368815899, -..."
1,AMZN_Moodys_CreditRating_2023.pdf,The company is reliant\non the operating incom...,"[0.0033761824015527964, 0.026259412989020348, ..."
2,AMZN_Moodys_CreditRating_2023.pdf,Exhibit 1\nAmazon's debt has continued to rise...,"[-0.01308224257081747, 0.06867944449186325, -0..."
3,AMZN_Moodys_CreditRating_2023.pdf,MOODY'S INVESTORS SERVICE CORPORATES\nCredit s...,"[-0.009228247217833996, 0.023773541674017906, ..."
4,AMZN_Moodys_CreditRating_2023.pdf,Factors that could lead to a downgrade\nRating...,"[-0.027790367603302002, 0.01033176202327013, -..."
5,AMZN_Moodys_CreditRating_2023.pdf,MOODY'S INVESTORS SERVICE CORPORATES\nKey indi...,"[0.003038279013708234, 0.024112433195114136, -..."
6,AMZN_Moodys_CreditRating_2023.pdf,Periods are Financial Year-End unless indicate...,"[-0.02161526307463646, 0.017826249822974205, -..."
7,AMZN_Moodys_CreditRating_2023.pdf,"541 $22,899 $24,879 \n$12,248 \n$13,353 \n $(1...","[-0.019557256251573563, 0.046015415340662, -0...."
8,AMZN_Moodys_CreditRating_2023.pdf,Revenue growth in Q1 2023 has decelerated to 1...,"[0.017024273052811623, 0.09144710004329681, -0..."
9,AMZN_Moodys_CreditRating_2023.pdf,Operating margin expansion for AWS could be ch...,"[-0.0007624945719726384, 0.019459251314401627,..."


In [92]:
vect = pd.read_parquet(pfile)
vect.file.value_counts()

file
ATT_SEC_AnnualReport_2022.pdf            421
ATT_CompanyReport_Annual_20230126.pdf    172
AMZN_Moodys_CreditRating_2023.pdf         41
Name: count, dtype: int64