import json
from typing import List, Tuple

import numpy as np

# from fastapi.responses import JSONResponse
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
from app.db_local_storage.in_memory_db import query_response_storage

# NOTE: assumed import path — the original file references `vector_files_db`
# and `VECTOR_FILES_DIRECTORY` without importing them; adjust this to wherever
# they are actually defined in the project.
from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY, vector_files_db


class QuerySearchFeature:
    def __init__(self, model, qa_pipeline):
        self.model = model
        self.qa_pipeline = qa_pipeline

    async def query_search(self, query: str) -> dict:
        # Record the user's query in the in-memory conversation log.
        user_query = {
            "text": query,
            "isSender": True,
        }
        query_response_storage.append(user_query)

        # dataBase = await QuerySearchFeature.load_data()
        dataBase = vector_files_db

        text_data, embeddings = await QuerySearchFeature.split_dataBase(dataBase)

        # Retrieve candidate chunks two ways: exact substring (lexical) and
        # embedding similarity (semantic).
        lexical_results = await QuerySearchFeature.lexical_search(query, text_data)
        semantic_results = await QuerySearchFeature.semantic_search(
            query, text_data, embeddings, self.model
        )

        # De-duplicate the two result sets; note that set() does not preserve
        # the ranking order of the chunks.
        combined_results = list(set(lexical_results + semantic_results))
        context = await QuerySearchFeature.get_context(combined_results)

        # Extract an answer span from the combined context.
        response = self.qa_pipeline(question=query, context=context)

        response_query = {
            "text": response["answer"],
            "isSender": False,
        }
        query_response_storage.append(response_query)

        return {
            "message": response["answer"],
            "context_used": context,
            "chunks": combined_results,
        }

    @staticmethod
    async def semantic_search(
        query: str, chunks: List[str], embeddings: np.ndarray, model
    ) -> List[str]:
        # Rank chunks by dot-product similarity between the query embedding and
        # the precomputed chunk embeddings (equivalent to cosine similarity when
        # the embeddings are L2-normalized), then keep the top 3.
        query_embedding = model.encode([query])
        similarities = np.dot(embeddings, query_embedding.T).flatten()
        top_indices = np.argsort(-similarities)[:3]
        return [chunks[i] for i in top_indices]

    @staticmethod
    async def lexical_search(query: str, chunks: List[str]) -> List[str]:
        # Case-insensitive substring match.
        return [chunk for chunk in chunks if query.lower() in chunk.lower()]

    @staticmethod
    async def load_data():
        # Despite the name, VECTOR_FILES_DIRECTORY is used here as the path to
        # a single JSON file holding the vector store.
        with open(VECTOR_FILES_DIRECTORY, "r") as file:
            dataBase = json.load(file)
        return dataBase

    @staticmethod
    async def split_dataBase(db) -> Tuple[List[str], np.ndarray]:
        # Flatten every page of every document into parallel lists of raw text
        # and embeddings, then stack the embeddings into a single ndarray so
        # the return type matches the annotation and np.dot gets a 2-D array.
        text_data = []
        embeddings = []
        for document in db.values():
            for page in document["data"]:
                text_data.append(page["metadata"]["original_text"])
                embeddings.append(page["embedding"])
        return text_data, np.asarray(embeddings)

    @staticmethod
    async def get_context(chunks: List[str]) -> str:
        return " ".join(chunks)
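

if __name__ == "__main__":
    # Minimal usage sketch, assuming the commented-out imports above: a
    # SentenceTransformer encoder and a Hugging Face question-answering
    # pipeline. The model names and the example query are illustrative
    # assumptions, not choices confirmed by this project, and the sketch
    # presumes `vector_files_db` has already been populated with documents.
    import asyncio

    from sentence_transformers import SentenceTransformer
    from transformers import pipeline

    model = SentenceTransformer("all-MiniLM-L6-v2")
    qa_pipeline = pipeline(
        "question-answering", model="distilbert-base-cased-distilled-squad"
    )

    feature = QuerySearchFeature(model=model, qa_pipeline=qa_pipeline)
    result = asyncio.run(feature.query_search("What is this document about?"))
    print(result["message"])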