File size: 2,178 Bytes
47b5f0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from typing import List
import uuid

from qdrant_client.models import PointStruct

from app.infrastructure.models.my_models import EmbeddingCreation
from app.infrastructure.repository.document_handeler_repository import (
    DocumentHandelerRepository,
)
from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings


class CreateEmbeddingsFeature:
    """
    Split a text into fixed-size chunks, build one point per chunk with a
    dense and a sparse vector, and hand the points to the document repository.
    """

    def __init__(
        self,
        dense_embeddings: DenseEmbeddings,
        document_handeler_repository: DocumentHandelerRepository,
    ):
        """
        Store the collaborators used by the feature

        :param dense_embeddings: object whose get_dense_vector / get_sparse_vector
            methods are called once per chunk
        :param document_handeler_repository: object whose insert_points method
            receives the list of built points
        """
        self.dense_embeddings = dense_embeddings
        self.document_handeler_repository = document_handeler_repository

    def chunk_text(self, text: str, chunk_size: int = 512) -> List[str]:
        """
        Chunk text into consecutive, non-overlapping pieces of at most
        chunk_size characters (the last piece may be shorter)

        :param text: str
        :param chunk_size: int, must be positive
        :return: List[str], empty when text is empty
        :raises ValueError: if chunk_size is not positive
        """
        # A negative step would make range() empty and silently drop the whole
        # text; a zero step raises an unhelpful range() error. Reject both here.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}"
            )
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

    async def create_embeddings(self, text: str, filename: str) -> EmbeddingCreation:
        """
        Create embeddings for the text and insert them through the repository

        Each chunk becomes one point with a random UUID4 id, the named vectors
        "text-dense" and "text-sparse", and a payload holding the document id,
        the chunk index, the filename and the chunk text.

        :param text: str
        :param filename: str
        :return: EmbeddingCreation, success=True only when the repository
            result has a truthy status; success=False when there is nothing
            to embed or the insert did not succeed
        """

        chunks = self.chunk_text(text)

        # Nothing to embed: do not send an empty batch to the repository.
        if not chunks:
            return EmbeddingCreation(
                success=False, message="No text to create embeddings from"
            )

        # NOTE(review): only the part before the FIRST dot is kept, so
        # "a.b.pdf" and "a.c.pdf" both map to document_id "a" - confirm this
        # is intended before relying on document_id being unique per file.
        document_id = filename.split(".")[0]

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "text-dense": self.dense_embeddings.get_dense_vector(chunk).vector,
                    "text-sparse": self.dense_embeddings.get_sparse_vector(chunk),
                },
                payload={
                    "document_id": document_id,
                    "chunk_index": i,
                    "filename": filename,
                    "chunk-text": chunk,
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        result = self.document_handeler_repository.insert_points(points)
        if result.status:
            return EmbeddingCreation(
                success=True, message="Embeddings created successfully"
            )
        return EmbeddingCreation(success=False, message="Embeddings creation failed")