import json
from typing import List, Tuple

import numpy as np

# from fastapi.responses import JSONResponse
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline

from app.db_local_storage.vector_files_db import vector_files_db
from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY
from app.db_local_storage.in_memory_db import query_response_storage

class QuerySearchFeature:
    """Hybrid (lexical + semantic) retrieval with extractive QA over stored embeddings."""

    def __init__(self, model, qa_pipeline):
        self.model = model
        self.qa_pipeline = qa_pipeline

    async def query_search(self, query: str) -> dict:
        # Record the user's message in the in-memory conversation log.
        user_query = {
            "text": query,
            "isSender": True,
        }
        query_response_storage.append(user_query)

        # dataBase = await QuerySearchFeature.load_data()
        dataBase = vector_files_db
        text_data, embeddings = await QuerySearchFeature.split_dataBase(dataBase)

        # Combine exact-substring (lexical) hits with embedding (semantic) hits.
        lexical_results = await QuerySearchFeature.lexical_search(query, text_data)
        semantic_results = await QuerySearchFeature.semantic_search(
            query, text_data, embeddings, self.model
        )
        # dict.fromkeys de-duplicates while preserving retrieval order;
        # list(set(...)) would reorder the chunks nondeterministically.
        combined_results = list(dict.fromkeys(lexical_results + semantic_results))

        context = await QuerySearchFeature.get_context(combined_results)
        response = self.qa_pipeline(question=query, context=context)

        # Record the model's answer in the conversation log.
        response_query = {
            "text": response["answer"],
            "isSender": False,
        }
        query_response_storage.append(response_query)

        return {
            "message": response["answer"],
            "context_used": context,
            # Return the individual retrieved chunks rather than duplicating
            # the joined context string under a second key.
            "chunks": combined_results,
        }

    @staticmethod
    async def semantic_search(
        query: str, chunks: List[str], embeddings: np.ndarray, model
    ) -> List[str]:
        """Return the top-3 chunks ranked by dot-product similarity to the query.

        Note: the dot product equals cosine similarity only if the stored
        embeddings (and the query embedding) are L2-normalized.
        """
        query_embedding = model.encode([query])
        similarities = np.dot(embeddings, query_embedding.T).flatten()
        top_indices = np.argsort(-similarities)[:3]
        return [chunks[i] for i in top_indices]

    @staticmethod
    async def lexical_search(query: str, chunks: List[str]) -> List[str]:
        # Case-insensitive substring match.
        return [chunk for chunk in chunks if query.lower() in chunk.lower()]

    @staticmethod
    async def load_data():
        with open(VECTOR_FILES_DIRECTORY, "r") as file:
            dataBase = json.load(file)
        return dataBase

    @staticmethod
    async def split_dataBase(db) -> Tuple[List[str], np.ndarray]:
        """Flatten the DB into parallel lists of chunk texts and embeddings.

        Expects each document to look like:
        {"data": [{"metadata": {"original_text": str}, "embedding": [float, ...]}, ...]}
        """
        text_data = []
        embeddings = []
        for document in db.values():
            for page in document["data"]:
                text_data.append(page["metadata"]["original_text"])
                embeddings.append(page["embedding"])
        # Stack into an (n_chunks, dim) array to match the declared return type.
        return text_data, np.array(embeddings)

    @staticmethod
    async def get_context(chunks: List[str]) -> str:
        return " ".join(chunks)