import json
from typing import List

from sentence_transformers import SentenceTransformer

from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY
from app.db_local_storage.vector_files_db import vector_files_db as EMBEDDING_DATA


class CreateEmbeddingsFeature:

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512) -> List[str]:
        # Split the text into fixed-size character chunks.
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

    @staticmethod
    async def create_embeddings(text: str, filename: str) -> List[dict]:
        # TODO: Check model
        model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

        chunks = CreateEmbeddingsFeature.chunk_text(text)

        # Persisted storage is currently disabled; embeddings are kept in the
        # in-memory EMBEDDING_DATA dict instead.
        # with open(VECTOR_FILES_DIRECTORY, "r") as file:
        #     EMBEDDING_DATA = json.load(file)

        document_id = len(EMBEDDING_DATA) + 1
        document_index = f"document_{document_id}"

        EMBEDDING_DATA[document_index] = {
            "metadata": {
                "id": document_id,
                "filename": filename,
                "chunks": len(chunks),
            },
            "data": [],
        }

        # Embed each chunk and store it together with its positional metadata.
        for i, chunk in enumerate(chunks):
            embedding = model.encode(chunk).tolist()
            embedding_entry = {
                "embedding": embedding,
                "metadata": {
                    "chunk_index": i,
                    "original_text": chunk,
                    "document_id": document_index,
                },
            }
            EMBEDDING_DATA[document_index]["data"].append(embedding_entry)

        # with open(VECTOR_FILES_DIRECTORY, "w") as f:
        #     json.dump(EMBEDDING_DATA, f)

        return EMBEDDING_DATA[document_index]["data"]
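

# Usage sketch (illustrative, not part of the original module): running this
# file directly embeds a small sample text and reports how many chunk entries
# were stored. It assumes sentence-transformers is installed and that the
# app.db_local_storage modules are importable; the sample text and filename
# below are hypothetical.
if __name__ == "__main__":
    import asyncio

    sample_text = "Example document text for embedding. " * 50
    entries = asyncio.run(
        CreateEmbeddingsFeature.create_embeddings(sample_text, "example.txt")
    )
    print(f"Stored {len(entries)} chunk embeddings for 'example.txt'")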