Spaces:
Sleeping
Sleeping
import json | |
from typing import List | |
from sentence_transformers import SentenceTransformer | |
from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY | |
from app.db_local_storage.vector_files_db import vector_files_db as EMBEDDING_DATA | |
class CreateEmbeddingsFeature:
    """Chunk a text and store one embedding per chunk in ``EMBEDDING_DATA``.

    The class is used as a namespace: both public methods are static and are
    called as ``CreateEmbeddingsFeature.<method>(...)``.
    """

    # Name of the sentence-transformers model used to encode chunks.
    # TODO: Check model
    _MODEL_NAME = "paraphrase-MiniLM-L6-v2"

    # Lazily created model instance, shared by every call so the model is not
    # re-instantiated for each document.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the shared ``SentenceTransformer``, creating it on first use."""
        if cls._model is None:
            cls._model = SentenceTransformer(cls._MODEL_NAME)
        return cls._model

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512) -> List[str]:
        """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk.

        Returns:
            The chunks in order. Only the last chunk may be shorter than
            ``chunk_size``; an empty ``text`` yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        # A zero step makes range() fail with an unrelated message, and a
        # negative step silently produces no chunks at all.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}"
            )
        return [
            text[start : start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    @staticmethod
    async def create_embeddings(text: str, filename: str) -> None:
        """Embed ``text`` chunk by chunk and register it as a new document.

        The document is stored in ``EMBEDDING_DATA`` under the key
        ``"document_<id>"`` with two fields: ``"metadata"`` (id, filename and
        number of chunks) and ``"data"`` (one entry per chunk holding the
        embedding as a plain list plus the chunk's index, original text and
        document key).

        Args:
            text: Full text of the document.
            filename: Name recorded in the document metadata.

        Returns:
            None. The result is only available through ``EMBEDDING_DATA``.
        """
        # NOTE: this function does not write anything to disk itself; it only
        # updates the imported EMBEDDING_DATA object.
        # NOTE(review): encoding is a synchronous call inside an async
        # function, so nothing else in the event loop runs while it executes.
        model = CreateEmbeddingsFeature._get_model()
        chunks = CreateEmbeddingsFeature.chunk_text(text)

        # Start from the next sequential id, but never reuse a key that is
        # already present, so an existing document cannot be overwritten.
        # Assumes EMBEDDING_DATA is a dict-like mapping keyed by document key
        # - TODO confirm.
        document_id = len(EMBEDDING_DATA) + 1
        while f"document_{document_id}" in EMBEDDING_DATA:
            document_id += 1
        document_key = f"document_{document_id}"

        # Encode everything before touching the store, so a failure while
        # encoding does not leave a half-populated document behind.
        entries = [
            {
                "embedding": model.encode(chunk).tolist(),
                "metadata": {
                    "chunk_index": chunk_index,
                    "original_text": chunk,
                    "document_id": document_key,
                },
            }
            for chunk_index, chunk in enumerate(chunks)
        ]

        EMBEDDING_DATA[document_key] = {
            "metadata": {
                "id": document_id,
                "filename": filename,
                "chunks": len(chunks),
            },
            "data": entries,
        }