In [74]:
import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv('env'), override=True)

from engine.weaviate_interface_v4 import WeaviateIndexer, WeaviateWCS, get_weaviate_client

#standards
import os
import time
import json
from typing import List
from tqdm import tqdm
from rich import print  # nice library that provides improved printing output (overrides default print function)
from rich.pretty import pprint
import pandas as pd


In [3]:
datadir = "../../data"
pfile = os.path.join(datadir, 'text_vectors.parquet')

In [75]:
#read env vars from local .env file
api_key = os.environ['FINRAG_WEAVIATE_API_KEY']
url = os.environ['FINRAG_WEAVIATE_ENDPOINT']
model_path = 'sentence-transformers/all-mpnet-base-v2'
# careful this one is 768 bits long, all_Mini-L6-v2 is 384 bits
#instantiate client, 
try:
    client = WeaviateWCS(endpoint=url, api_key=api_key, model_name_or_path=model_path)
    client._client.is_live()
    print(f"Client is ready {client._client.is_ready()}")
except Exception as e:
    print(e)
    
# Alternatively we can simply use the get_weaviate_client 
# convenice function, which assumes a default config
# client = get_weaviate_client()



In [71]:
client._client.is_live()

False

In [67]:
client._connect()

In [45]:
from weaviate.classes.config import Property, DataType

finrag_properties = [  
                Property(name='filename',
                         data_type=DataType.TEXT,
                         description='Name of the file',
                         index_filterable=True,
                         index_searchable=True),
                # Property(name='keywords',
                #          data_type=DataType.TEXT_ARRAY,
                #          description='Keywords associated with the file',
                #          index_filterable=True,
                #          index_searchable=True),
                # Property(name='summary',
                #          data_type=DataType.TEXT,
                #          description='Short summary of the article',
                #          index_filterable=True,
                #          index_searchable=True),
                Property(name='content',
                         data_type=DataType.TEXT,
                          description='Splits of the article',
                         index_filterable=True,
                         index_searchable=True),
              ]

In [46]:
class_name = "FinRag_mpnet-base-v2"

class_config = {'classes': [

                      {"class": class_name,        
                       
                       "description": "Financial reports", 
                       
                       "vectorIndexType": "hnsw", 
                       
                       # Vector index specific settings
                       "vectorIndexConfig": {                   
                          
                            "ef": 64,
                            "efConstruction": 128, 
                            "maxConnections": 32,    
                                            },
                       
                       "vectorizer": "none",            
                       
                       # pre-defined property mappings
                       "properties": finrag_properties }         
                      ]
               }
print(class_config)

In [None]:
collection_name='finrag'
client.create_collection(collection_name=collection_name, 
                         properties=finrag_properties, 
                         description='Financial reports')

Collection "finrag" created


In [48]:
client.delete_collection(collection_name='finrag')

Collection "finrag" deleted


In [76]:
client.show_all_collections()

['Finrag']

In [77]:
client.show_collection_config('Finrag')

_CollectionConfig(name='Finrag', description='Financial reports', generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False), properties=[_Property(name='filename', description='Name of the file', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='content', description='Splits of the article', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none')], references=[], replication_config=_ReplicationConfig(facto

In [78]:
client.show_collection_properties('Finrag')

[_Property(name='filename', description='Name of the file', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'),
 _Property(name='content', description='Splits of the article', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none')]

In [51]:
indexer = WeaviateIndexer(client)

In [79]:
data = pd.read_parquet(pfile)
data.file.value_counts()

FileNotFoundError: [Errno 2] No such file or directory: '../../data/text_vectors.parquet'

In [53]:
batch_object = indexer.batch_index_data(data.to_dict('records'), collection_name, 256)

100%|██████████| 41/41 [00:00<00:00, 853.70it/s]


Processing finished in 0.01 minutes.
Batch job completed with zero errors.


In [18]:
type(batch_object)

dict

In [54]:
list(batch_object.keys())

['num_errors', 'error_messages', 'doc_ids']

In [55]:
data.head()

Unnamed: 0,file,content,content_embedding
0,AMZN_Moodys_CreditRating_2023.pdf,CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdat...,"[-0.009635023772716522, 0.01748381368815899, -..."
1,AMZN_Moodys_CreditRating_2023.pdf,The company is reliant\non the operating incom...,"[0.0033761824015527964, 0.026259412989020348, ..."
2,AMZN_Moodys_CreditRating_2023.pdf,Exhibit 1\nAmazon's debt has continued to rise...,"[-0.01308224257081747, 0.06867944449186325, -0..."
3,AMZN_Moodys_CreditRating_2023.pdf,MOODY'S INVESTORS SERVICE CORPORATES\nCredit s...,"[-0.009228247217833996, 0.023773541674017906, ..."
4,AMZN_Moodys_CreditRating_2023.pdf,Factors that could lead to a downgrade\nRating...,"[-0.027790367603302002, 0.01033176202327013, -..."


In [56]:
data.to_dict('records')

[{'file': 'AMZN_Moodys_CreditRating_2023.pdf',
  'content': "CORPORATES\nCREDIT OPINION\n23 May 2023\nUpdate\nRATINGS\nAmazon.com, Inc.\nDomicile Seattle, Washington,\nUnited States\nLong Term Rating A1\nType Senior Unsecured -\nDom Curr\nOutlook Stable\nPlease see the ratings section  at the end of this report\nfor more information. The ratings and outlook shown\nreflect information as of the publication date.\nContacts\nChristina Boni +1.212.553.0514\nSenior Vice President\nchristina.boni@moodys.com\nJack Myers +1.212.553.5116\nAssociate Analyst\njack.myers@moodys.com\nMargaret Taylor +1.212.553.0424\nAssociate Managing Director\nmargaret.taylor@moodys.comAmazon.com, Inc.\nUpdate to credit analysis\nSummary\nAmazon.com, Inc. 's (A1/Prime-1 stable) credit profile reflects its powerful global brand, which\nis synonymous with online retail, as well as the strength and profitability of Amazon Web\nServices (“AWS”), the market leader in the cloud computing market.",
  'content_embedding':

In [57]:
query = "What is Amazon debt?"
response = client.keyword_search(request=query,
                                 collection_name=collection_name,
                                 query_properties=['content'], 
                                 limit=5,
                                 filter=None,  
                                 return_properties=['filename', 'content'],
                                 return_raw=False)  # true gives a lot of additional info (not useful for RAG)

print(response)

In [58]:
vector_response = client.vector_search(request=query,
                                       collection_name=collection_name,
                                       limit=5, 
                                       return_properties=['filename', 'content'],
                                       filter=None,
                                       return_raw=False,
                                       device='cpu')
print(vector_response)

In [80]:
resp = client.hybrid_search(
            request=query,
            collection_name=collection_name,
            alpha=0.5,  # higher = more like keyword search 
            limit=5, 
            return_properties=['filename', 'content'],
            filter=None,
            return_raw=False,
            device='cpu')

pprint(resp)