mr

Build error

App Files Files Community

mr / app /engine /vectorstore.py

JPBianchi

endpoint only, no UI

ae92cb7 5 months ago

raw

history blame

11.3 kB

	import os, logging
	from app.engine.logger import logger

	from typing import List, Any
	import pandas as pd
	from weaviate.classes.config import Property, DataType

	from .weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer

	from ..settings import parquet_file
	from weaviate.classes.query import Filter
	from torch import cuda

	# Name of the Weaviate collection used as the default throughout this module.
	# A '.we_are_local' marker file in the current working directory selects the
	# local collection name; otherwise the regular one is used.
	COLLECTION = 'MultiRAG_local_mr' if os.path.exists('.we_are_local') else 'MultiRAG'

	class dummyWeaviate:
	    """No-op stand-in for the Weaviate client wrapper.

	    Created to pass on HF, where the real client creation kept failing.
	    Temporary solution: ``VectorStore`` falls back to an instance of this
	    class when the real client cannot be created, so every method here
	    accepts its arguments and does nothing.

	    NOTE(review): the signatures presumably mirror the real wrapper in
	    ``weaviate_interface_v4`` -- confirm when that wrapper changes.

	    Annotations that name weaviate types are written as strings so that
	    defining this stub never needs those classes at runtime.
	    """

	    def __init__(self,
	                 endpoint: str = None,
	                 api_key: str = None,
	                 model_name_or_path: str = 'sentence-transformers/all-MiniLM-L6-v2',
	                 embedded: bool = False,
	                 openai_api_key: str = None,
	                 skip_init_checks: bool = False,
	                 **kwargs
	                 ):
	        """Accept the client arguments and ignore them all."""
	        return

	    def _connect(self) -> None:
	        """Do nothing; there is no connection to open."""
	        return

	    def _client(self):
	        """Return None; there is no underlying client."""
	        return

	    def create_collection(self,
	                          collection_name: str,
	                          properties: 'list[Property]',
	                          description: str = None,
	                          **kwargs
	                          ) -> None:
	        """Do nothing; no collection is created."""
	        return

	    def show_all_collections(self,
	                             detailed: bool = False,
	                             max_details: bool = False
	                             ) -> 'list[str] | dict':
	        """Return a fixed list of placeholder collection names."""
	        return ['abc', 'def']

	    def show_collection_config(self, collection_name: str):
	        """Return None; there is no configuration to show."""
	        return

	    def show_collection_properties(self, collection_name: str):
	        """Return None; there are no properties to show."""
	        return

	    def delete_collection(self, collection_name: str):
	        """Do nothing; there is no collection to delete."""
	        return

	    def get_doc_count(self, collection_name: str):
	        """Return None; there are no documents to count."""
	        return

	    def keyword_search(self,
	                       request: str,
	                       collection_name: str,
	                       query_properties: list[str] = ['content'],
	                       limit: int = 10,
	                       filter: 'Filter' = None,
	                       return_properties: list[str] = None,
	                       return_raw: bool = False
	                       ) -> list:
	        """Return an empty result list.

	        An empty list (rather than None) lets callers that iterate over the
	        hits, such as ``VectorStore.keyword_search``, get "no results"
	        instead of a TypeError.
	        """
	        return []

	    def vector_search(self,
	                      request: str,
	                      collection_name: str,
	                      limit: int = 10,
	                      return_properties: list[str] = None,
	                      filter: 'Filter' = None,
	                      return_raw: bool = False,
	                      device: str = 'cuda:0' if cuda.is_available() else 'cpu'
	                      ) -> list:
	        """Return an empty result list (see ``keyword_search`` for why)."""
	        return []

	    def hybrid_search(self,
	                      request: str,
	                      collection_name: str,
	                      query_properties: list[str] = ['content'],
	                      alpha: float = 0.5,
	                      limit: int = 10,
	                      filter: 'Filter' = None,
	                      return_properties: list[str] = None,
	                      return_raw: bool = False,
	                      device: str = 'cuda:0' if cuda.is_available() else 'cpu'
	                      ) -> list:
	        """Return an empty result list (see ``keyword_search`` for why)."""
	        return []

	class VectorStore:
	    """Facade over one Weaviate collection of document splits.

	    On construction it tries to build a ``WeaviateWCS`` client from the
	    ``FINRAG_WEAVIATE_API_KEY`` / ``FINRAG_WEAVIATE_ENDPOINT`` environment
	    variables, falls back to ``dummyWeaviate`` if that fails, and makes sure
	    the default collection exists. It then exposes indexing plus keyword,
	    vector and hybrid search on the active collection.
	    """

	    def __init__(self, model_path: str = 'sentence-transformers/all-mpnet-base-v2'):
	        """Create the client and ensure the default collection exists.

	        Args:
	            model_path: name or path of the embedding model handed to the
	                Weaviate client wrapper.
	        """
	        # we can create several instances to test various models, especially if we finetune one

	        self.MultiRAG_properties = [
	            Property(name='file',
	                     data_type=DataType.TEXT,
	                     description='Name of the file',
	                     index_filterable=True,
	                     index_searchable=True),
	            # Property(name='keywords',
	            #          data_type=DataType.TEXT_ARRAY,
	            #          description='Keywords associated with the file',
	            #          index_filterable=True,
	            #          index_searchable=True),
	            Property(name='content',
	                     data_type=DataType.TEXT,
	                     description='Splits of the article',
	                     index_filterable=True,
	                     index_searchable=True),
	        ]

	        self.class_name = "MultiRAG_all-mpnet-base-v2"

	        self.class_config = {'classes': [

	            {"class": self.class_name,

	             "description": "multiple types of docs",

	             "vectorIndexType": "hnsw",

	             # Vector index specific app.settings for HNSW
	             "vectorIndexConfig": {

	                 "ef": 64,  # higher is better quality vs slower search
	                 "efConstruction": 128,  # higher = better index but slower build
	                 "maxConnections": 32,  # max conn per layer - higher = more memory
	             },

	             "vectorizer": "none",

	             "properties": self.MultiRAG_properties}
	        ]
	        }

	        self.model_path = model_path

	        try:
	            self.api_key = os.environ.get('FINRAG_WEAVIATE_API_KEY')
	            self.url = os.environ.get('FINRAG_WEAVIATE_ENDPOINT')

	            # Fail with a readable message instead of the
	            # "'NoneType' object is not subscriptable" that slicing None gives.
	            missing = [name for name, value in (
	                ('FINRAG_WEAVIATE_API_KEY', self.api_key),
	                ('FINRAG_WEAVIATE_ENDPOINT', self.url),
	            ) if not value]
	            if missing:
	                raise ValueError(f"missing environment variable(s): {', '.join(missing)}")

	            logger(f"API key: {self.api_key[:5]}")
	            logger(f"URL: {self.url[8:15]}")
	            self.client = WeaviateWCS(
	                endpoint=self.url,
	                api_key=self.api_key,
	                model_name_or_path=self.model_path,
	            )
	            # Explicit raises rather than assert: asserts are removed under
	            # `python -O`, which would silently skip these checks.
	            if not self.client._client.is_live():
	                raise ConnectionError("Weaviate is not live")
	            if not self.client._client.is_ready():
	                raise ConnectionError("Weaviate is not ready")
	            logger("Weaviate client created")
	        except Exception as e:
	            # raise Exception(f"Could not create Weaviate client: {e}")
	            self.client = dummyWeaviate()  # used when issue with HF client creation, to continue on HF
	            logger(f"Could not create Weaviate client: {e}")

	        # if we fail these tests 'VectorStore' object has no attribute 'client'
	        # it's prob not the env var but the model missing
	        # careful with accessing '_client' since the weaviate helper usually closes the connection every time

	        self.indexer = None  # created lazily by index_data

	        self.create_collection()

	    @property
	    def collections(self):
	        """Collections currently reported by the client."""
	        return self.client.show_all_collections()

	    def create_collection(self,
	                          collection_name: str = COLLECTION,
	                          description: str = 'Documents'):
	        """Make ``collection_name`` the active collection, creating it if absent.

	        Args:
	            collection_name: collection to select (and create when missing).
	            description: description passed to the client on creation.
	        """
	        self.collection_name = collection_name
	        if collection_name not in self.collections:
	            self.client.create_collection(collection_name=collection_name,
	                                          properties=self.MultiRAG_properties,
	                                          description=description)
	        else:
	            logger(f"Collection {collection_name} already exists")

	    def empty_collection(self, collection_name: str = COLLECTION) -> bool:
	        """Empty a collection by deleting and recreating it.

	        Args:
	            collection_name: collection to empty.

	        Returns:
	            True if the collection existed and was recreated, False otherwise.
	        """
	        # not in the library yet, so I simply delete and recreate it
	        if collection_name in self.collections:
	            self.client.delete_collection(collection_name=collection_name)
	            # Recreate the collection that was just deleted (not the default
	            # one); create_collection also makes it the active collection.
	            self.create_collection(collection_name)
	            return True
	        else:
	            logger(f"Collection {collection_name} doesn't exist")
	            return False

	    def index_data(self, data: List[dict] = None, collection_name: str = COLLECTION):
	        """Index records into ``collection_name``.

	        Stores the indexer's result in ``self.status`` and unpacks it into
	        ``self.num_errors``, ``self.error_messages`` and ``self.doc_ids``.

	        Args:
	            data: records to index; when None, the records are read from the
	                parquet file at ``parquet_file``.
	            collection_name: target collection.
	        """
	        if self.indexer is None:
	            self.indexer = WeaviateIndexer(self.client)

	        if data is None:
	            # use the parquet file, otherwise use the data passed
	            data = pd.read_parquet(parquet_file).to_dict('records')
	            # the parquet file was created/incremented when a new article was uploaded
	            # it is a dataframe with columns: file, content, content_embedding
	            # and reflects exactly the data that we want to index at all times
	        self.status = self.indexer.batch_index_data(data, collection_name, 256)

	        self.num_errors, self.error_messages, self.doc_ids = self.status

	        # in this case with few articles, we don't tolerate errors
	        # batch_index_data already tests errors against a threshold
	        # assert self.num_errors == 0, f"Errors: {self.num_errors}"

	    @staticmethod
	    def _as_tuples(response) -> List[tuple]:
	        """Turn client hits into ``(file, content, score)`` tuples.

	        A None response (what the fallback client may return) is treated as
	        "no hits". Each hit must provide the 'file', 'content' and 'score' keys.
	        """
	        return [(res['file'], res['content'], res['score']) for res in (response or [])]

	    def keyword_search(self,
	                       query: str,
	                       limit: int = 5,
	                       return_properties: List[str] = ['file', 'content'],
	                       alpha=None  # dummy parameter to match the hybrid_search signature
	                       ) -> List[tuple]:
	        """Keyword search on the active collection.

	        Args:
	            query: search text.
	            limit: maximum number of hits requested.
	            return_properties: properties requested for each hit; must include
	                'file' and 'content', which the result tuples are built from.
	            alpha: ignored.

	        Returns:
	            List of ``(file, content, score)`` tuples.
	        """
	        response = self.client.keyword_search(
	            request=query,
	            collection_name=self.collection_name,
	            query_properties=['file', 'content'],
	            limit=limit,
	            filter=None,
	            return_properties=return_properties,
	            return_raw=False)

	        return self._as_tuples(response)

	    def vector_search(self,
	                      query: str,
	                      limit: int = 5,
	                      return_properties: List[str] = ['file', 'content'],
	                      alpha=None  # dummy parameter to match the hybrid_search signature
	                      ) -> List[tuple]:
	        """Vector search on the active collection.

	        Args:
	            query: search text.
	            limit: maximum number of hits requested.
	            return_properties: properties requested for each hit; must include
	                'file' and 'content', which the result tuples are built from.
	            alpha: ignored.

	        Returns:
	            List of ``(file, content, score)`` tuples.
	        """
	        response = self.client.vector_search(
	            request=query,
	            collection_name=self.collection_name,
	            limit=limit,
	            filter=None,
	            return_properties=return_properties,
	            return_raw=False)

	        return self._as_tuples(response)

	    def hybrid_search(self,
	                      query: str,
	                      limit: int = 10,
	                      alpha=0.5,  # higher = more vector search
	                      return_properties: List[str] = ['file', 'content']
	                      ) -> List[tuple]:
	        """Hybrid (keyword + vector) search on the active collection.

	        Args:
	            query: search text.
	            limit: maximum number of hits requested.
	            alpha: keyword/vector balance; higher = more vector search.
	            return_properties: properties requested for each hit; must include
	                'file' and 'content', which the result tuples are built from.

	        Returns:
	            List of ``(file, content, score)`` tuples.
	        """
	        response = self.client.hybrid_search(
	            request=query,
	            collection_name=self.collection_name,
	            query_properties=['file', 'content'],
	            alpha=alpha,
	            limit=limit,
	            filter=None,
	            return_properties=return_properties,
	            return_raw=False)

	        return self._as_tuples(response)