abadesalex committed on
Commit
47b5f0c
1 Parent(s): dcb6c5f

Update to Qdrant db

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +4 -1
  2. Api/app/__pycache__/main.cpython-310.pyc +0 -0
  3. Api/app/__pycache__/qdrant.cpython-310.pyc +0 -0
  4. Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc +0 -0
  5. Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc +0 -0
  6. Api/app/db_local_storage/documents_db.py +0 -2
  7. Api/app/db_local_storage/files_db.py +0 -4
  8. Api/app/db_local_storage/vector_files_db.py +0 -1
  9. Api/app/infrastructure/models/__pycache__/my_models.cpython-310.pyc +0 -0
  10. Api/app/infrastructure/models/my_models.py +28 -0
  11. Api/app/infrastructure/repository/__pycache__/document_handeler_repository.cpython-310.pyc +0 -0
  12. Api/app/infrastructure/repository/__pycache__/query_search_repository.cpython-310.pyc +0 -0
  13. Api/app/infrastructure/repository/__pycache__/updateDocument_repository.cpython-310.pyc +0 -0
  14. Api/app/infrastructure/repository/document_handeler_repository.py +48 -0
  15. Api/app/infrastructure/repository/query_search_repository.py +20 -0
  16. Api/app/main.py +34 -11
  17. Api/app/modules/__pycache__/model.cpython-310.pyc +0 -0
  18. Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc +0 -0
  19. Api/app/modules/clearVariables/routes/clearVariables_route.py +4 -12
  20. Api/app/modules/denseEmbeddings/__pycache__/denseEmbeddings.cpython-310.pyc +0 -0
  21. Api/app/modules/denseEmbeddings/denseEmbeddings.py +69 -0
  22. Api/app/modules/{uploadDocument → documentHandeler}/controllers/__pycache__/FileUploadController.cpython-310.pyc +0 -0
  23. Api/app/modules/documentHandeler/controllers/__pycache__/document_handeler_controller.cpython-310.pyc +0 -0
  24. Api/app/modules/documentHandeler/controllers/__pycache__/file_upload_controller.cpython-310.pyc +0 -0
  25. Api/app/modules/documentHandeler/controllers/document_handeler_controller.py +66 -0
  26. Api/app/modules/documentHandeler/dependencies/__pycache__/dependencies.cpython-310.pyc +0 -0
  27. Api/app/modules/documentHandeler/dependencies/dependencies.py +112 -0
  28. Api/app/modules/documentHandeler/features/__pycache__/createEmbeddings_feature.cpython-310.pyc +0 -0
  29. Api/app/modules/documentHandeler/features/__pycache__/deleteDocument_feature.cpython-310.pyc +0 -0
  30. Api/app/modules/documentHandeler/features/__pycache__/extractText_feature.cpython-310.pyc +0 -0
  31. Api/app/modules/documentHandeler/features/__pycache__/getAllChunkedText_feature.cpython-310.pyc +0 -0
  32. Api/app/modules/{uploadDocument → documentHandeler}/features/__pycache__/uploadDocument_feature.cpython-310.pyc +0 -0
  33. Api/app/modules/documentHandeler/features/createEmbeddings_feature.py +69 -0
  34. Api/app/modules/documentHandeler/features/deleteDocument_feature.py +18 -0
  35. Api/app/modules/documentHandeler/features/extractText_feature.py +19 -0
  36. Api/app/modules/documentHandeler/features/getAllChunkedText_feature.py +32 -0
  37. Api/app/modules/documentHandeler/features/uploadDocument_feature.py +26 -0
  38. Api/app/modules/documentHandeler/routes/__pycache__/document_handeler_route.cpython-310.pyc +0 -0
  39. Api/app/modules/documentHandeler/routes/__pycache__/uploadDocument_route.cpython-310.pyc +0 -0
  40. Api/app/modules/documentHandeler/routes/document_handeler_route.py +31 -0
  41. Api/app/modules/{uploadDocument → documentHandeler}/schemas/uploadDocument_schema.py +0 -0
  42. Api/app/modules/hybridSearcher/__pycache__/hybridSearcher.cpython-310.pyc +0 -0
  43. Api/app/modules/hybridSearcher/hybridSearcher.py +71 -0
  44. Api/app/modules/model.py +0 -6
  45. Api/app/modules/querySearch/__pycache__/dependecies.cpython-310.pyc +0 -0
  46. Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc +0 -0
  47. Api/app/modules/querySearch/controllers/querySearch_controller.py +3 -5
  48. Api/app/modules/querySearch/dependecies.py +85 -0
  49. Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc +0 -0
  50. Api/app/modules/querySearch/features/querySearch_feature.py +37 -75
.gitignore CHANGED
@@ -2,4 +2,7 @@
2
  Api/venv
3
 
4
  # Other versions
5
- Api/out1
 
 
 
 
2
  Api/venv
3
 
4
  # Other versions
5
+ Api/out1
6
+
7
+ # env
8
+ .env
Api/app/__pycache__/main.cpython-310.pyc CHANGED
Binary files a/Api/app/__pycache__/main.cpython-310.pyc and b/Api/app/__pycache__/main.cpython-310.pyc differ
 
Api/app/__pycache__/qdrant.cpython-310.pyc ADDED
Binary file (3.09 kB). View file
 
Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc CHANGED
Binary files a/Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc and b/Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc differ
 
Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc CHANGED
Binary files a/Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc and b/Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc differ
 
Api/app/db_local_storage/documents_db.py DELETED
@@ -1,2 +0,0 @@
1
- documents_db = []
2
- documents_text = []
 
 
 
Api/app/db_local_storage/files_db.py DELETED
@@ -1,4 +0,0 @@
1
- FILES_NAMES_DATABASE = {}
2
- FILES_DIRECTORY = "src/db_local_storage/files"
3
- TEXT_FILES_DIRECTORY = "src/db_local_storage/text_files"
4
- VECTOR_FILES_DIRECTORY = "src/db_local_storage/vector_files/vec_db.json"
 
 
 
 
 
Api/app/db_local_storage/vector_files_db.py DELETED
@@ -1 +0,0 @@
1
- vector_files_db = {}
 
 
Api/app/infrastructure/models/__pycache__/my_models.cpython-310.pyc ADDED
Binary file (1.41 kB). View file
 
Api/app/infrastructure/models/my_models.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import Dict, List, Optional
3
+
4
+
5
class ModelResponse(BaseModel, extra="forbid"):
    """Response model holding a text, a sender flag and an optional message.

    Declared with ``extra="forbid"``, so unknown fields are rejected.
    """

    # Required text content.
    text: str
    # Required boolean flag (presumably marks who authored the text -- confirm with callers).
    isSender: bool
    # Optional accompanying message; defaults to None.
    message: Optional[str] = None
9
+
10
+
11
class EmbeddingCreation(BaseModel, extra="forbid"):
    """Outcome of an embedding-creation request.

    Declared with ``extra="forbid"``, so unknown fields are rejected.
    """

    # True when the operation succeeded.
    success: bool
    # Optional human-readable detail; defaults to None.
    message: Optional[str] = None
14
+
15
+
16
class HybridSearchResponse(BaseModel):
    """Envelope for a hybrid-search result: status flag, optional rows, optional message."""

    # True when the search succeeded.
    success: bool
    # Optional list of result dictionaries; defaults to None.
    data: Optional[List[Dict]] = None
    # Optional human-readable detail; defaults to None.
    message: Optional[str] = None
20
+
21
+
22
class Chunk(BaseModel):
    """A single text chunk together with its integer index."""

    # Position of the chunk (presumably within its source document -- confirm with producers).
    index: int
    # The chunk's text.
    text: str
25
+
26
+
27
class ChunksResponse(BaseModel):
    """Response mapping a string key to the list of ``Chunk`` objects stored under it."""

    # Mapping of string keys to their chunks.
    data: Dict[str, List[Chunk]]
Api/app/infrastructure/repository/__pycache__/document_handeler_repository.cpython-310.pyc ADDED
Binary file (2.23 kB). View file
 
Api/app/infrastructure/repository/__pycache__/query_search_repository.cpython-310.pyc ADDED
Binary file (1.14 kB). View file
 
Api/app/infrastructure/repository/__pycache__/updateDocument_repository.cpython-310.pyc ADDED
Binary file (1.6 kB). View file
 
Api/app/infrastructure/repository/document_handeler_repository.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Tuple
2
+ from app.qdrant import QdrantConnectionDb
3
+ from qdrant_client import models
4
+
5
+
6
class DocumentHandelerRepository:
    """Data-access helper around one Qdrant collection.

    The client and the collection name are both obtained from the injected
    ``QdrantConnectionDb``.
    """

    # Points requested per scroll request while paginating through results.
    _SCROLL_PAGE_SIZE = 256

    def __init__(self, qdrant_connection_db: QdrantConnectionDb):
        self.client = qdrant_connection_db.get_client()
        self.collection_name = qdrant_connection_db.get_collection_name()

    def _scroll_all(self, **scroll_kwargs) -> List[models.Record]:
        """Collect every record matching ``scroll_kwargs`` by following scroll offsets.

        A single ``scroll`` call only returns one page of results, so the
        returned next-page offset is fed back until the client reports ``None``.
        """
        records: List[models.Record] = []
        offset = None
        while True:
            page, offset = self.client.scroll(
                collection_name=self.collection_name,
                limit=self._SCROLL_PAGE_SIZE,
                offset=offset,
                **scroll_kwargs,
            )
            records.extend(page)
            if offset is None:
                return records

    def find_points_by_document_name(self, document_name) -> "List[Any] | None":
        """Return the ids of all points whose ``document_id`` payload equals ``document_name``.

        :param document_name: value matched against the ``document_id`` payload key
        :return: list of point ids, or ``None`` when no point matches
        """
        matches = self._scroll_all(
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="document_id", match=models.MatchValue(value=document_name)
                    )
                ]
            ),
            # Only the ids are needed, so skip payloads and vectors.
            with_payload=False,
            with_vectors=False,
        )

        if not matches:
            # Callers test for ``None`` to detect "document not found".
            return None

        return [point.id for point in matches]

    def delete_document_by_id(self, documents_id: List[Any]) -> models.UpdateResult:
        """Delete the points with the given ids and return the client's result.

        :param documents_id: point ids to delete
        :return: whatever the client's ``delete`` call returns
        """
        return self.client.delete(
            collection_name=self.collection_name,
            points_selector=models.PointIdsList(points=documents_id),
        )

    def insert_points(self, points: List[models.PointStruct]) -> models.UpdateResult:
        """Upsert ``points`` into the collection, waiting for the operation to finish.

        :param points: points to insert or overwrite
        :return: the client's upsert result
        """
        return self.client.upsert(
            collection_name=self.collection_name,
            wait=True,
            points=points,
        )

    def get_all_documents(
        self,
    ) -> Tuple[List[models.Record], Any]:  # same shape as a scroll result
        """Return every record of the collection with payloads and without vectors.

        :return: ``(records, None)`` -- the tuple shape of a scroll result is
            kept for existing callers; the offset is ``None`` because all
            pages have already been consumed
        """
        records = self._scroll_all(with_payload=True, with_vectors=False)
        return records, None
Api/app/infrastructure/repository/query_search_repository.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.qdrant import QdrantConnectionDb
2
+ from qdrant_client.models import QueryResponse, Prefetch, NamedVector
3
+
4
+
5
class QuerySearchRepository:
    """Read-side helper that runs point queries against one Qdrant collection.

    The client and the collection name are both obtained from the injected
    ``QdrantConnectionDb``.
    """

    def __init__(self, qdrant_connection_db: QdrantConnectionDb):
        self.client = qdrant_connection_db.get_client()
        self.collection_name = qdrant_connection_db.get_collection_name()

    def find_text_by_hybrid_search(
        self, prefetch_context: Prefetch, dense_vector: NamedVector, limit: int = 10
    ) -> QueryResponse:
        """Query the collection using a prefetch stage and a dense query vector.

        :param prefetch_context: prefetch definition forwarded to ``query_points``
        :param dense_vector: named vector whose ``vector`` is used as the query
            against the ``"text-dense"`` vector
        :param limit: maximum number of points to return (defaults to 10, the
            previously hard-coded value)
        :return: the client's ``query_points`` response, payloads included
        """
        return self.client.query_points(
            collection_name=self.collection_name,
            prefetch=prefetch_context,
            query=dense_vector.vector,
            using="text-dense",
            with_payload=True,
            limit=limit,
        )
Api/app/main.py CHANGED
@@ -1,22 +1,45 @@
1
  import logging
 
2
 
3
  import uvicorn
4
  from fastapi import APIRouter, FastAPI
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from fastapi.responses import FileResponse
7
  from fastapi.staticfiles import StaticFiles
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- from app.modules.querySearch.routes.querySearch_route import (
10
- router as query_search_routes,
11
- )
12
- from app.modules.uploadDocument.routes.uploadDocument_route import (
13
- router as upload_file_routes,
14
- )
15
- from app.modules.clearVariables.routes.clearVariables_route import (
16
- router as clear_variables_routes,
17
- )
18
 
19
- app = FastAPI()
20
 
21
  origins = [
22
  "http://localhost:8000",
@@ -36,7 +59,7 @@ app.add_middleware(
36
  )
37
 
38
  app_router = APIRouter(prefix="/api")
39
- app_router.include_router(upload_file_routes, prefix="/upload", tags=["upload"])
40
  app_router.include_router(query_search_routes, prefix="/query", tags=["query"])
41
  app_router.include_router(clear_variables_routes, prefix="/clear", tags=["clear"])
42
 
 
1
  import logging
2
+ from contextlib import asynccontextmanager
3
 
4
  import uvicorn
5
  from fastapi import APIRouter, FastAPI
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from fastapi.responses import FileResponse
8
  from fastapi.staticfiles import StaticFiles
9
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, pipeline
10
+
11
+ from app.modules.clearVariables.routes.clearVariables_route import \
12
+ router as clear_variables_routes
13
+ from app.modules.documentHandeler.routes.document_handeler_route import \
14
+ router as upload_file_routes
15
+ from app.modules.querySearch.routes.querySearch_route import \
16
+ router as query_search_routes
17
+
18
+
19
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the models once and yield them as the application's lifespan state.

    The yielded dictionary exposes the dense tokenizer/model, the sparse
    tokenizer/model and the question-answering pipeline under fixed keys.
    """
    dense_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
    sparse_checkpoint = "prithivida/Splade_PP_en_v1"
    qa_checkpoint = "deepset/roberta-base-squad2"

    # Dict entries are evaluated top to bottom, i.e. in the original load order.
    loaded_state = {
        "dense_tokenizer": AutoTokenizer.from_pretrained(dense_checkpoint),
        "dense_model": AutoModel.from_pretrained(dense_checkpoint),
        "sparse_tokenizer": AutoTokenizer.from_pretrained(sparse_checkpoint),
        "sparse_model": AutoModelForMaskedLM.from_pretrained(sparse_checkpoint),
        "qa_pipeline": pipeline("question-answering", model=qa_checkpoint),
    }

    yield loaded_state
40
 
 
 
 
 
 
 
 
 
 
41
 
42
+ app = FastAPI(lifespan=lifespan)
43
 
44
  origins = [
45
  "http://localhost:8000",
 
59
  )
60
 
61
  app_router = APIRouter(prefix="/api")
62
+ app_router.include_router(upload_file_routes, prefix="/document", tags=["document"])
63
  app_router.include_router(query_search_routes, prefix="/query", tags=["query"])
64
  app_router.include_router(clear_variables_routes, prefix="/clear", tags=["clear"])
65
 
Api/app/modules/__pycache__/model.cpython-310.pyc CHANGED
Binary files a/Api/app/modules/__pycache__/model.cpython-310.pyc and b/Api/app/modules/__pycache__/model.cpython-310.pyc differ
 
Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc CHANGED
Binary files a/Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc and b/Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc differ
 
Api/app/modules/clearVariables/routes/clearVariables_route.py CHANGED
@@ -1,20 +1,12 @@
1
- from app.db_local_storage.documents_db import documents_db, documents_text
2
  from app.db_local_storage.in_memory_db import query_response_storage
3
- from app.db_local_storage.vector_files_db import vector_files_db
4
- from app.modules.uploadDocument.controllers.file_upload_controller import \
5
- FileUploadController
6
- from fastapi import APIRouter, File, HTTPException, UploadFile
7
  from fastapi.responses import JSONResponse
8
 
9
  router = APIRouter()
10
- fileUploadController = FileUploadController()
11
 
12
  @router.delete("/clear_variables/")
13
  async def clear_variables():
14
- vector_files_db.clear()
15
- documents_db.clear()
16
  query_response_storage.clear()
17
- documents_db.clear()
18
- documents_text.clear()
19
-
20
- return JSONResponse(status_code=200, content={"message": "All variables cleared"})
 
 
1
  from app.db_local_storage.in_memory_db import query_response_storage
2
+ from fastapi import APIRouter
 
 
 
3
  from fastapi.responses import JSONResponse
4
 
5
  router = APIRouter()
6
+
7
 
8
@router.delete("/clear_variables/")
async def clear_variables():
    """Empty ``query_response_storage`` and acknowledge with a 200 JSON response."""
    query_response_storage.clear()

    confirmation = {"message": "All variables cleared"}
    return JSONResponse(status_code=200, content=confirmation)
 
 
Api/app/modules/denseEmbeddings/__pycache__/denseEmbeddings.cpython-310.pyc ADDED
Binary file (2.33 kB). View file
 
Api/app/modules/denseEmbeddings/denseEmbeddings.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from qdrant_client import models
3
+ from qdrant_client.models import NamedVector
4
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
5
+
6
+
7
class DenseEmbeddings:
    """Compute dense and sparse embeddings for a piece of text.

    Despite its name, the class wraps both a dense encoder and a sparse
    (masked-LM based) encoder, each with its own tokenizer.
    """

    def __init__(
        self,
        dense_model: AutoModel,
        dense_tokenizer: AutoTokenizer,
        sparse_model: AutoModelForMaskedLM,
        sparse_tokenizer: AutoTokenizer,
    ):
        self.dense_model = dense_model
        self.dense_tokenizer = dense_tokenizer
        self.sparse_model = sparse_model
        self.sparse_tokenizer = sparse_tokenizer

    def get_dense_vector(self, text: str) -> NamedVector:
        """
        Get dense vector from the dense model

        The vector is the mean of the model's last hidden states over the
        token axis, returned under the name ``"text-dense"``.

        :param text: str
        :return: NamedVector
        """
        inputs = self.dense_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.dense_model(**inputs)

        pooled = torch.mean(outputs.last_hidden_state, dim=1).squeeze()

        # Hand over plain Python floats rather than a numpy array so the value
        # is a real list, which is what the vector field is typed as.
        return NamedVector(name="text-dense", vector=pooled.tolist())

    def get_sparse_vector(self, text: str) -> models.SparseVector:
        """
        Get sparse vector from the sparse model

        Term weights follow the SPLADE formulation: ``log(1 + relu(logits))``
        masked by the attention mask and max-pooled over the token axis. The
        resulting vector lives in vocabulary space, so the indices of its
        non-zero entries are vocabulary ids.

        NOTE(review): earlier revisions paired the *input token ids* with
        scores taken from the first ``seq_len`` vocabulary positions, so
        vectors stored by that code are not comparable with these ones --
        existing collections need re-indexing.

        :param text: str
        :return: SparseVector
        """
        inputs = self.sparse_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            logits = self.sparse_model(**inputs).logits

        # Zero out padded positions before pooling over the token axis.
        attention_mask = inputs["attention_mask"].unsqueeze(-1)
        activations = torch.log1p(torch.relu(logits)) * attention_mask

        # Max over tokens; [0] selects the single text of the batch.
        term_weights = activations.max(dim=1).values[0]
        active_terms = torch.nonzero(term_weights, as_tuple=True)[0]

        return models.SparseVector(
            indices=active_terms.tolist(),
            values=term_weights[active_terms].tolist(),
        )
Api/app/modules/{uploadDocument → documentHandeler}/controllers/__pycache__/FileUploadController.cpython-310.pyc RENAMED
File without changes
Api/app/modules/documentHandeler/controllers/__pycache__/document_handeler_controller.cpython-310.pyc ADDED
Binary file (2.59 kB). View file
 
Api/app/modules/documentHandeler/controllers/__pycache__/file_upload_controller.cpython-310.pyc ADDED
Binary file (1.97 kB). View file
 
Api/app/modules/documentHandeler/controllers/document_handeler_controller.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import HTTPException, UploadFile
2
+ from fastapi.responses import JSONResponse
3
+
4
+ from app.infrastructure.repository.document_handeler_repository import (
5
+ DocumentHandelerRepository,
6
+ )
7
+ from app.modules.documentHandeler.features.createEmbeddings_feature import (
8
+ CreateEmbeddingsFeature,
9
+ )
10
+ from app.modules.documentHandeler.features.deleteDocument_feature import (
11
+ DeleteDocumentFeature,
12
+ )
13
+ from app.modules.documentHandeler.features.extractText_feature import ExtractTextFeature
14
+ from app.modules.documentHandeler.features.getAllChunkedText_feature import (
15
+ GetAllChunkedTextFeature,
16
+ )
17
+
18
+
19
class DocumentHandelerController:
    """Translate document operations into HTTP responses.

    Each handler delegates to one injected feature object and converts
    unexpected failures into an HTTP 500 error.
    """

    def __init__(
        self,
        delete_document_feature: DeleteDocumentFeature,
        create_embeddings_feature: CreateEmbeddingsFeature,
        get_all_chunked_text_feature: GetAllChunkedTextFeature,
    ):
        self.create_embeddings_feature = create_embeddings_feature
        self.delete_document_feature = delete_document_feature
        self.get_all_chunked_text_feature = get_all_chunked_text_feature

    async def handle_file_upload(self, file: UploadFile) -> JSONResponse:
        """Extract the text of an uploaded PDF and create embeddings for it.

        :param file: the uploaded file
        :return: 200 response carrying the dumped embedding-creation result
        :raises HTTPException: 500 when extraction or embedding fails
        """
        try:
            text_file = await ExtractTextFeature.extract_text_from_pdf(file)
            result = await self.create_embeddings_feature.create_embeddings(
                text_file, file.filename
            )
            return JSONResponse(status_code=200, content=result.model_dump())

        except HTTPException:
            # Already an HTTP error: do not rewrap it as a generic 500.
            raise
        except Exception as e:
            # Keep the client-facing detail generic but preserve the cause
            # for server-side tracebacks.
            raise HTTPException(
                status_code=500, detail="Problem on controller"
            ) from e

    async def delete_document(self, text: str) -> JSONResponse:
        """Delete the document identified by ``text``.

        :param text: document name forwarded to the delete feature
        :return: 200 when the feature reports a deletion, otherwise 404
        :raises HTTPException: 500 carrying the error text on failure
        """
        try:
            result = await self.delete_document_feature.delete_document_by_filename(
                text
            )
            if result:
                return JSONResponse(
                    status_code=200, content={"message": "Document deleted"}
                )
            return JSONResponse(
                status_code=404, content={"message": "Document not found"}
            )

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    async def get_all_chunks(self) -> JSONResponse:
        """Return every stored chunk as a JSON response.

        :return: 200 response carrying the dumped chunks result
        :raises HTTPException: 500 carrying the error text on failure
        """
        try:
            result = await self.get_all_chunked_text_feature.get_all_chunked_text()
            return JSONResponse(status_code=200, content=result.model_dump())

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e
Api/app/modules/documentHandeler/dependencies/__pycache__/dependencies.cpython-310.pyc ADDED
Binary file (3.09 kB). View file
 
Api/app/modules/documentHandeler/dependencies/dependencies.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import Depends, Request
2
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
3
+
4
+ from app.infrastructure.repository.document_handeler_repository import (
5
+ DocumentHandelerRepository,
6
+ )
7
+ from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
8
+ from app.modules.documentHandeler.controllers.document_handeler_controller import (
9
+ DocumentHandelerController,
10
+ )
11
+ from app.modules.documentHandeler.features.createEmbeddings_feature import (
12
+ CreateEmbeddingsFeature,
13
+ )
14
+ from app.modules.documentHandeler.features.deleteDocument_feature import (
15
+ DeleteDocumentFeature,
16
+ )
17
+ from app.modules.documentHandeler.features.getAllChunkedText_feature import (
18
+ GetAllChunkedTextFeature,
19
+ )
20
+ from app.qdrant import QdrantConnectionDb
21
+
22
+
23
def get_qdrant_connection_db() -> QdrantConnectionDb:
    """Dependency provider: construct and return a ``QdrantConnectionDb``."""
    connection = QdrantConnectionDb()
    return connection
25
+
26
+
27
def get_document_handeler_repository(
    qdrant_connection_db: QdrantConnectionDb = Depends(get_qdrant_connection_db),
):
    """Dependency provider: wrap the injected connection in a ``DocumentHandelerRepository``."""
    repository = DocumentHandelerRepository(qdrant_connection_db)
    return repository
31
+
32
+
33
def get_dense_model(request: Request) -> AutoModel:
    """Return the object stored under ``"dense_model"`` in the request scope's state."""
    scope_state = request.scope["state"]
    return scope_state["dense_model"]
35
+
36
+
37
def get_sparse_model(request: Request) -> AutoModelForMaskedLM:
    """Return the object stored under ``"sparse_model"`` in the request scope's state."""
    scope_state = request.scope["state"]
    return scope_state["sparse_model"]
39
+
40
+
41
def get_dense_tokenizer(request: Request) -> AutoTokenizer:
    """Return the object stored under ``"dense_tokenizer"`` in the request scope's state."""
    scope_state = request.scope["state"]
    return scope_state["dense_tokenizer"]
43
+
44
+
45
def get_sparse_tokenizer(request: Request) -> AutoTokenizer:
    """Return the object stored under ``"sparse_tokenizer"`` in the request scope's state."""
    scope_state = request.scope["state"]
    return scope_state["sparse_tokenizer"]
47
+
48
+
49
def get_dense_embeddings(
    dense_model: AutoModel = Depends(get_dense_model),
    dense_tokenizer: AutoTokenizer = Depends(get_dense_tokenizer),
    sparse_model: AutoModelForMaskedLM = Depends(get_sparse_model),
    sparse_tokenizer: AutoTokenizer = Depends(get_sparse_tokenizer),
):
    """Dependency provider: assemble a ``DenseEmbeddings`` from the injected models and tokenizers."""
    components = {
        "dense_model": dense_model,
        "dense_tokenizer": dense_tokenizer,
        "sparse_model": sparse_model,
        "sparse_tokenizer": sparse_tokenizer,
    }
    return DenseEmbeddings(**components)
61
+
62
+
63
def get_all_chunked_text_feature(
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Dependency provider: build a ``GetAllChunkedTextFeature`` on the injected repository."""
    feature = GetAllChunkedTextFeature(document_handeler_repository)
    return feature
69
+
70
+
71
def get_create_embeddings_feature(
    dense_embeddings: DenseEmbeddings = Depends(get_dense_embeddings),
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Dependency provider: build a ``CreateEmbeddingsFeature`` from the injected embedder and repository."""
    feature = CreateEmbeddingsFeature(dense_embeddings, document_handeler_repository)
    return feature
78
+
79
+
80
def get_delete_document_feature(
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Dependency provider: build a ``DeleteDocumentFeature`` on the injected repository."""
    feature = DeleteDocumentFeature(document_handeler_repository)
    return feature
86
+
87
+
88
def get_document_handeler_controller(
    delete_document_feature: DeleteDocumentFeature = Depends(
        get_delete_document_feature
    ),
    create_embeddings_feature: CreateEmbeddingsFeature = Depends(
        get_create_embeddings_feature
    ),
    get_all_chunked_text_feature: GetAllChunkedTextFeature = Depends(
        get_all_chunked_text_feature
    ),
):
    """Dependency provider: assemble the ``DocumentHandelerController`` from its three features.

    The last parameter shares its name with the module-level provider; the
    ``Depends(...)`` default is evaluated when the function is defined, so it
    still refers to that provider.
    """
    features = {
        "delete_document_feature": delete_document_feature,
        "create_embeddings_feature": create_embeddings_feature,
        "get_all_chunked_text_feature": get_all_chunked_text_feature,
    }
    return DocumentHandelerController(**features)
104
+
105
+
106
# NOTE(review): this is a second definition of ``get_create_embeddings_feature``
# in the same module; it rebinds the name to an equivalent function. Consider
# removing one of the two definitions.
def get_create_embeddings_feature(
    dense_embeddings: DenseEmbeddings = Depends(get_dense_embeddings),
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Dependency provider: build a ``CreateEmbeddingsFeature`` from the injected embedder and repository."""
    feature = CreateEmbeddingsFeature(dense_embeddings, document_handeler_repository)
    return feature
Api/app/modules/documentHandeler/features/__pycache__/createEmbeddings_feature.cpython-310.pyc ADDED
Binary file (2.59 kB). View file
 
Api/app/modules/documentHandeler/features/__pycache__/deleteDocument_feature.cpython-310.pyc ADDED
Binary file (985 Bytes). View file
 
Api/app/modules/documentHandeler/features/__pycache__/extractText_feature.cpython-310.pyc ADDED
Binary file (839 Bytes). View file
 
Api/app/modules/documentHandeler/features/__pycache__/getAllChunkedText_feature.cpython-310.pyc ADDED
Binary file (1.4 kB). View file
 
Api/app/modules/{uploadDocument → documentHandeler}/features/__pycache__/uploadDocument_feature.cpython-310.pyc RENAMED
File without changes
Api/app/modules/documentHandeler/features/createEmbeddings_feature.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import uuid
3
+
4
+ from qdrant_client.models import PointStruct
5
+
6
+ from app.infrastructure.models.my_models import EmbeddingCreation
7
+ from app.infrastructure.repository.document_handeler_repository import (
8
+ DocumentHandelerRepository,
9
+ )
10
+ from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
11
+
12
+
13
class CreateEmbeddingsFeature:
    """Split a document's text into chunks, embed each chunk and store the points."""

    def __init__(
        self,
        dense_embeddings: DenseEmbeddings,
        document_handeler_repository: DocumentHandelerRepository,
    ):
        self.dense_embeddings = dense_embeddings
        self.document_handeler_repository = document_handeler_repository

    def chunk_text(self, text: str, chunk_size: int = 512) -> List[str]:
        """
        Chunk text into consecutive pieces of at most ``chunk_size`` characters

        :param text: str
        :param chunk_size: int, must be positive
        :return: List[str] (empty when ``text`` is empty)
        :raises ValueError: when ``chunk_size`` is not positive
        """
        if chunk_size <= 0:
            # A zero step makes range() raise an obscure error and a negative
            # one silently yields no chunks; fail clearly instead.
            raise ValueError("chunk_size must be a positive integer")

        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

    async def create_embeddings(self, text: str, filename: str) -> EmbeddingCreation:
        """
        Create embeddings for the text and store one point per chunk

        Each point carries a dense and a sparse vector plus a payload with the
        document id, chunk index, filename and chunk text.

        :param text: str
        :param filename: str
        :return: EmbeddingCreation
        """
        # Local import: the file's import block only brings in PointStruct.
        from qdrant_client.models import UpdateStatus

        chunks = self.chunk_text(text)
        if not chunks:
            # Nothing to embed (e.g. a PDF with no extractable text); do not
            # send an empty upsert to the database.
            return EmbeddingCreation(
                success=False, message="Embeddings creation failed"
            )

        # NOTE(review): everything after the first "." is dropped, so
        # "a.b.pdf" and "a.c.pdf" share the id "a". Left unchanged because
        # clients deleting by document id may rely on this derivation.
        document_id = filename.split(".")[0]

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "text-dense": self.dense_embeddings.get_dense_vector(chunk).vector,
                    "text-sparse": self.dense_embeddings.get_sparse_vector(chunk),
                },
                payload={
                    "document_id": document_id,
                    "chunk_index": i,
                    "filename": filename,
                    "chunk-text": chunk,
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        result = self.document_handeler_repository.insert_points(points)

        # The status is a non-empty enum value, hence always truthy; compare
        # against the completed state so the failure branch is reachable.
        if result.status == UpdateStatus.COMPLETED:
            return EmbeddingCreation(
                success=True, message="Embeddings created successfully"
            )
        return EmbeddingCreation(success=False, message="Embeddings creation failed")
Api/app/modules/documentHandeler/features/deleteDocument_feature.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.infrastructure.repository.document_handeler_repository import (
2
+ DocumentHandelerRepository,
3
+ )
4
+
5
+
6
class DeleteDocumentFeature:
    """Delete every stored point that belongs to a named document."""

    def __init__(self, update_document_repository: DocumentHandelerRepository):
        self.update_document_repository = update_document_repository

    async def delete_document_by_filename(self, document_name: str) -> bool:
        """Remove the points found for ``document_name``.

        :param document_name: name used to look the document's points up
        :return: ``True`` when points were found and a delete was issued,
            ``False`` when the lookup returned ``None``
        """
        repository = self.update_document_repository
        point_ids = repository.find_points_by_document_name(document_name)

        if point_ids is not None:
            repository.delete_document_by_id(point_ids)
            return True

        return False
Api/app/modules/documentHandeler/features/extractText_feature.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+
4
+ from fastapi import UploadFile
5
+ import pdfplumber
6
+
7
+
8
class ExtractTextFeature:
    """Text extraction for uploaded PDF files."""

    @staticmethod
    async def extract_text_from_pdf(file: UploadFile) -> str:
        """Read the uploaded file fully and return the text of all its pages.

        Pages are joined with a newline so the last word of one page does not
        fuse with the first word of the next.

        :param file: uploaded PDF
        :return: extracted text (empty string when no page yields any text)
        """
        content = await file.read()
        with pdfplumber.open(io.BytesIO(content)) as pdf:
            # extract_text() may yield None for a page without a text layer
            # (depending on the pdfplumber version); treat that as empty
            # instead of failing on string concatenation.
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        return "\n".join(page_texts)
Api/app/modules/documentHandeler/features/getAllChunkedText_feature.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.infrastructure.models.my_models import ChunksResponse
2
+ from app.infrastructure.repository.document_handeler_repository import (
3
+ DocumentHandelerRepository,
4
+ )
5
+
6
+
7
class GetAllChunkedTextFeature:
    """Group every stored chunk by document id, ordered by chunk index."""

    def __init__(self, document_handeler_repository: DocumentHandelerRepository):
        self.document_handeler_repository = document_handeler_repository

    async def get_all_chunked_text(self):
        """Fetch all stored records and regroup their payloads per document.

        :return: ``ChunksResponse`` mapping each ``document_id`` to its list of
            ``{"index", "text"}`` entries sorted by index
        """
        scroll_result = self.document_handeler_repository.get_all_documents()

        chunks_by_document = {}
        for record in scroll_result[0]:
            payload = record.payload
            owner = payload["document_id"]
            position = payload["chunk_index"]
            body = payload["chunk-text"]
            chunks_by_document.setdefault(owner, []).append(
                {"index": position, "text": body}
            )

        # Order each document's chunks by their index (stable sort).
        for entries in chunks_by_document.values():
            entries.sort(key=lambda entry: entry["index"])

        return ChunksResponse(data=chunks_by_document)
Api/app/modules/documentHandeler/features/uploadDocument_feature.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # from typing import Dict
3
+
4
+ # from fastapi import UploadFile
5
+ # from app.db_local_storage.files_db import FILES_DIRECTORY, FILES_NAMES_DATABASE
6
+ # from app.db_local_storage.documents_db import documents_db
7
+
8
+
9
+ # class UploadDocumentFeature:
10
+
11
+ # @staticmethod
12
+ # async def uploadFile(document: UploadFile) -> Dict[str, str]:
13
+ # """
14
+ # Upload a file to the server
15
+ # :param document: the file to upload
16
+ # :return: a message to confirm the upload
17
+ # """
18
+
19
+ # data = {
20
+ # "id": len(documents_db) + 1,
21
+ # "filename": document.filename,
22
+ # }
23
+
24
+ # documents_db.append(data)
25
+
26
+ # return {"message": "Document Updated"}
Api/app/modules/documentHandeler/routes/__pycache__/document_handeler_route.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
Api/app/modules/documentHandeler/routes/__pycache__/uploadDocument_route.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
Api/app/modules/documentHandeler/routes/document_handeler_route.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, File, UploadFile
2
+
3
+ from app.modules.documentHandeler.controllers.document_handeler_controller import \
4
+ DocumentHandelerController
5
+ from app.modules.documentHandeler.dependencies.dependencies import \
6
+ get_document_handeler_controller
7
+
8
+ router = APIRouter()
9
+
10
+
11
@router.get("/get_chunks")
async def get_all_documents(
    controller: DocumentHandelerController = Depends(get_document_handeler_controller),
):
    """GET route: return whatever the controller's ``get_all_chunks`` produces."""
    response = await controller.get_all_chunks()
    return response
16
+
17
+
18
@router.delete("/delete_document/{filename}")
async def delete_document(
    filename: str,
    controller: DocumentHandelerController = Depends(get_document_handeler_controller),
):
    """DELETE route: forward the ``filename`` path parameter to the controller's ``delete_document``."""
    response = await controller.delete_document(filename)
    return response
24
+
25
+
26
@router.post("/upload_file")
async def upload_file(
    file: UploadFile = File(...),
    controller: DocumentHandelerController = Depends(get_document_handeler_controller),
):
    """POST route: hand the required uploaded file to the controller's ``handle_file_upload``."""
    response = await controller.handle_file_upload(file)
    return response
Api/app/modules/{uploadDocument → documentHandeler}/schemas/uploadDocument_schema.py RENAMED
File without changes
Api/app/modules/hybridSearcher/__pycache__/hybridSearcher.cpython-310.pyc ADDED
Binary file (2.41 kB). View file
 
Api/app/modules/hybridSearcher/hybridSearcher.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qdrant_client import models
2
+ from qdrant_client.conversions import common_types as types
3
+ from qdrant_client.models import NamedVector, SparseVector
4
+
5
+ from app.infrastructure.models.my_models import HybridSearchResponse
6
+ from app.infrastructure.repository.query_search_repository import QuerySearchRepository
7
+ from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
8
+ from app.qdrant import QdrantConnectionDb
9
+
10
+
11
class HybridSearcher:
    """Run a hybrid (dense + sparse) search and wrap the outcome.

    Query vectors are produced by the injected ``DenseEmbeddings`` helper and
    the lookup itself is delegated to the injected ``QuerySearchRepository``.
    """

    def __init__(
        self,
        dense_embeddings: DenseEmbeddings,
        query_search_repository: QuerySearchRepository,
    ):
        """Store the collaborators used by :meth:`hybrid_search`."""
        self.dense_embeddings = dense_embeddings
        self.query_search_repository = query_search_repository

    def sparse_dense_rrf_prefetch(
        self,
        sparse_vector: SparseVector,
        dense_vector: NamedVector,
        limit: int = 10,
    ) -> models.Prefetch:
        """Build a nested prefetch whose two branches are fused with RRF.

        :param sparse_vector: query vector for the ``"text-sparse"`` branch
        :param dense_vector: named vector; its ``.vector`` attribute is the
            query for the ``"text-dense"`` branch
        :param limit: number of candidates requested from each branch
            (defaults to 10, the value that used to be hard-coded)
        :return: the outer ``models.Prefetch`` carrying the fusion query
        """
        dense_branch = models.Prefetch(
            query=dense_vector.vector,
            using="text-dense",
            limit=limit,
        )
        sparse_branch = models.Prefetch(
            query=sparse_vector,
            using="text-sparse",
            limit=limit,
        )
        return models.Prefetch(
            prefetch=[dense_branch, sparse_branch],
            query=models.FusionQuery(fusion=models.Fusion.RRF),
        )

    def hybrid_search(self, user_query: str) -> HybridSearchResponse:
        """
        Hybrid search

        Embeds the query, runs the repository search and collects the
        ``"chunk-text"`` payload entry of every returned point.

        :param user_query: str
        :return: HybridSearchResponse with ``success=True`` and ``data`` set to
            a list of ``{"chunk-text": ...}`` dicts, or ``success=False`` and
            an explanatory ``message`` if any step raised
        """
        try:
            sparse_vector = self.dense_embeddings.get_sparse_vector(user_query)
            dense_vector = self.dense_embeddings.get_dense_vector(user_query)

            prefetch_context = self.sparse_dense_rrf_prefetch(
                sparse_vector, dense_vector
            )

            result = self.query_search_repository.find_text_by_hybrid_search(
                prefetch_context, dense_vector
            )

            response_data = [
                {"chunk-text": point.payload["chunk-text"]} for point in result.points
            ]
            return HybridSearchResponse(success=True, data=response_data)

        except Exception as e:
            # Deliberately broad: this method reports every failure (embedding,
            # search or payload access) as data instead of raising, so callers
            # must inspect ``success`` on the returned object.
            return HybridSearchResponse(
                success=False, message=f"Database operation failed: {e}"
            )
Api/app/modules/model.py DELETED
@@ -1,6 +0,0 @@
1
- from sentence_transformers import SentenceTransformer
2
- from transformers import pipeline
3
-
4
-
5
- model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
6
- qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
 
 
 
 
 
 
Api/app/modules/querySearch/__pycache__/dependecies.cpython-310.pyc ADDED
Binary file (2.95 kB). View file
 
Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc CHANGED
Binary files a/Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc and b/Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc differ
 
Api/app/modules/querySearch/controllers/querySearch_controller.py CHANGED
@@ -1,4 +1,3 @@
1
- from typing import Any
2
  from fastapi import HTTPException
3
  from fastapi.responses import JSONResponse
4
 
@@ -10,13 +9,12 @@ class QuerySearchController:
10
  def __init__(self, query_search_feature: QuerySearchFeature):
11
  self.query_search_feature = query_search_feature
12
 
13
- async def handle_query_search(self, q: str) -> Any:
14
  try:
15
 
16
- result = await self.query_search_feature.query_search(q)
17
- message = result.get("message", "No message provided")
18
 
19
- return JSONResponse(status_code=200, content={"message": message})
20
 
21
  except Exception as e:
22
  raise HTTPException(status_code=500, detail=str(e))
 
 
1
  from fastapi import HTTPException
2
  from fastapi.responses import JSONResponse
3
 
 
9
  def __init__(self, query_search_feature: QuerySearchFeature):
10
  self.query_search_feature = query_search_feature
11
 
12
async def handle_query_search(self, query: str) -> JSONResponse:
    """Run the query-search feature and return its result as JSON.

    :param query: the user's question, forwarded unchanged to the feature
    :return: a 200 ``JSONResponse`` whose content is ``result.model_dump()``
    :raises HTTPException: status 500, with the original error text as
        ``detail``, when the feature or the serialization raises
    """
    try:
        result = await self.query_search_feature.query_search(query)
        return JSONResponse(status_code=200, content=result.model_dump())
    except Exception as e:
        # Boundary handler: translate any failure into an HTTP 500 while
        # chaining the original exception so the root cause stays in the
        # traceback.
        raise HTTPException(status_code=500, detail=str(e)) from e
Api/app/modules/querySearch/dependecies.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import Depends, Request
2
+ from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer,
3
+ pipeline)
4
+
5
+ from app.infrastructure.repository.query_search_repository import \
6
+ QuerySearchRepository
7
+ from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
8
+ from app.modules.hybridSearcher.hybridSearcher import HybridSearcher
9
+ from app.modules.querySearch.controllers.querySearch_controller import \
10
+ QuerySearchController
11
+ from app.modules.querySearch.features.querySearch_feature import \
12
+ QuerySearchFeature
13
+ from app.modules.questionAnswer.questionAnswer import QuestionAnswering
14
+ from app.qdrant import QdrantConnectionDb
15
+
16
+
17
def get_qdrant_connection_db() -> QdrantConnectionDb:
    """Dependency provider: construct a ``QdrantConnectionDb`` with no arguments."""
    connection = QdrantConnectionDb()
    return connection
19
+
20
+
21
def get_query_search_repository(
    qdrant_connection_db: QdrantConnectionDb = Depends(get_qdrant_connection_db),
):
    """Dependency provider: wrap the injected connection in a ``QuerySearchRepository``."""
    repository = QuerySearchRepository(qdrant_connection_db)
    return repository
25
+
26
+
27
def get_dense_model(request: Request) -> AutoModel:
    """Return the ``"dense_model"`` entry of the request scope's ``state`` mapping."""
    shared_state = request.scope["state"]
    return shared_state["dense_model"]
29
+
30
+
31
def get_sparse_model(request: Request) -> AutoModelForMaskedLM:
    """Return the ``"sparse_model"`` entry of the request scope's ``state`` mapping."""
    shared_state = request.scope["state"]
    return shared_state["sparse_model"]
33
+
34
+
35
def get_dense_tokenizer(request: Request) -> AutoTokenizer:
    """Return the ``"dense_tokenizer"`` entry of the request scope's ``state`` mapping."""
    shared_state = request.scope["state"]
    return shared_state["dense_tokenizer"]
37
+
38
+
39
def get_sparse_tokenizer(request: Request) -> AutoTokenizer:
    """Return the ``"sparse_tokenizer"`` entry of the request scope's ``state`` mapping."""
    shared_state = request.scope["state"]
    return shared_state["sparse_tokenizer"]
41
+
42
+
43
def get_dense_embeddings(
    dense_model: AutoModel = Depends(get_dense_model),
    dense_tokenizer: AutoTokenizer = Depends(get_dense_tokenizer),
    sparse_model: AutoModelForMaskedLM = Depends(get_sparse_model),
    sparse_tokenizer: AutoTokenizer = Depends(get_sparse_tokenizer),
):
    """Dependency provider: bundle both models and tokenizers into ``DenseEmbeddings``."""
    # Collected under the exact keyword names the constructor is called with.
    components = {
        "dense_model": dense_model,
        "dense_tokenizer": dense_tokenizer,
        "sparse_model": sparse_model,
        "sparse_tokenizer": sparse_tokenizer,
    }
    return DenseEmbeddings(**components)
55
+
56
+
57
def get_qa_pipeline(request: Request):
    """Return the ``"qa_pipeline"`` entry of the request scope's ``state`` mapping."""
    shared_state = request.scope["state"]
    return shared_state["qa_pipeline"]
59
+
60
+
61
def get_question_ansering(qa_pipline: pipeline = Depends(get_qa_pipeline)):
    """Dependency provider: build a ``QuestionAnswering`` around the injected pipeline."""
    answerer = QuestionAnswering(qa_pipline)
    return answerer
63
+
64
+
65
def get_hybrid_searcher(
    dense_embeddings: DenseEmbeddings = Depends(get_dense_embeddings),
    query_search_repository: QuerySearchRepository = Depends(
        get_query_search_repository
    ),
):
    """Dependency provider: combine embeddings and repository into a ``HybridSearcher``."""
    searcher = HybridSearcher(dense_embeddings, query_search_repository)
    return searcher
72
+
73
+
74
def get_query_search_feature(
    qa_pipeline: pipeline = Depends(get_qa_pipeline),
    hybrid_searcher: HybridSearcher = Depends(get_hybrid_searcher),
    question_answering: QuestionAnswering = Depends(get_question_ansering),
):
    """Dependency provider: assemble the ``QuerySearchFeature`` from its three parts."""
    # Positional order matters: pipeline, searcher, question-answering helper.
    feature = QuerySearchFeature(qa_pipeline, hybrid_searcher, question_answering)
    return feature
80
+
81
+
82
def get_query_search_controller(
    query_search_feature: QuerySearchFeature = Depends(get_query_search_feature),
):
    """Dependency provider: wrap the injected feature in a ``QuerySearchController``."""
    controller = QuerySearchController(query_search_feature)
    return controller
Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc CHANGED
Binary files a/Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc and b/Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc differ
 
Api/app/modules/querySearch/features/querySearch_feature.py CHANGED
@@ -1,89 +1,51 @@
1
- import json
2
- from typing import List, Tuple
3
 
4
- import numpy as np
5
- # from fastapi.responses import JSONResponse
6
- # from sentence_transformers import SentenceTransformer
7
- # from transformers import pipeline
8
-
9
- from app.db_local_storage.vector_files_db import vector_files_db
10
- from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY
11
  from app.db_local_storage.in_memory_db import query_response_storage
12
-
 
 
 
13
 
14
 
15
  class QuerySearchFeature:
16
 
17
- def __init__(self, model, qa_pipeline):
18
- self.model = model
 
 
 
 
19
  self.qa_pipeline = qa_pipeline
 
 
20
 
21
- async def query_search(self, query: str) -> dict:
22
-
23
- user_query = {
24
- "text": query,
25
- "isSender": True,
26
- }
27
-
28
- query_response_storage.append(user_query)
29
 
30
- # dataBase = await QuerySearchFeature.load_data()
31
- dataBase = vector_files_db
32
- text_data, embeddings = await QuerySearchFeature.split_dataBase(dataBase)
33
-
34
- lexical_results = await QuerySearchFeature.lexical_search(query, text_data)
35
- semantic_results = await QuerySearchFeature.semantic_search(
36
- query, text_data, embeddings, self.model
37
  )
38
 
39
- combined_results = list(set(lexical_results + semantic_results))
40
- context = await QuerySearchFeature.get_context(combined_results)
41
-
42
- response = self.qa_pipeline(question=query, context=context)
43
-
44
- response_query = {
45
- "text": response["answer"],
46
- "isSender": False,
47
- }
48
 
49
- query_response_storage.append(response_query)
50
-
51
- return {
52
- "message": response["answer"],
53
- "context_used": context,
54
- "chunks": context,
55
- }
56
-
57
- @staticmethod
58
- async def semantic_search(
59
- query: str, chunks: List[str], embeddings: np.ndarray, model
60
- ) -> List[str]:
61
- query_embedding = model.encode([query])
62
- similarities = np.dot(embeddings, query_embedding.T).flatten()
63
- top_indices = np.argsort(-similarities)[:3]
64
- return [chunks[i] for i in top_indices]
65
-
66
- @staticmethod
67
- async def lexical_search(query: str, chunks: List[str]) -> List[str]:
68
- return [chunk for chunk in chunks if query.lower() in chunk.lower()]
69
-
70
- @staticmethod
71
- async def load_data():
72
- with open(VECTOR_FILES_DIRECTORY, "r") as file:
73
- dataBase = json.load(file)
74
- return dataBase
75
-
76
- @staticmethod
77
- async def split_dataBase(db) -> Tuple[List[str], np.ndarray]:
78
- text_data = []
79
- embeddings = []
80
 
81
- for document in db.values():
82
- for page in document["data"]:
83
- text_data.append(page["metadata"]["original_text"])
84
- embeddings.append(page["embedding"])
85
- return text_data, embeddings
86
 
87
- @staticmethod
88
- async def get_context(chunks: List[str]) -> str:
89
- return " ".join(chunks)
 
1
+ from qdrant_client.conversions import common_types as types
 
2
 
 
 
 
 
 
 
 
3
  from app.db_local_storage.in_memory_db import query_response_storage
4
+ from app.infrastructure.models.my_models import HybridSearchResponse, ModelResponse
5
+ from app.modules.hybridSearcher.hybridSearcher import HybridSearcher
6
+ from app.modules.questionAnswer.questionAnswer import QuestionAnswering
7
+ from transformers import pipeline
8
 
9
 
10
class QuerySearchFeature:
    """Answer a user query from the chunks returned by hybrid search.

    Every call appends the user query, and then the model answer, to the
    imported ``query_response_storage`` store.
    """

    def __init__(
        self,
        qa_pipeline: pipeline,
        hybrid_searcher: HybridSearcher,
        question_answering: QuestionAnswering,
    ):
        """Store the collaborators; ``qa_pipeline`` is kept but not used here."""
        self.qa_pipeline = qa_pipeline
        self.hybrid_searcher = hybrid_searcher
        self.question_answering = question_answering

    async def query_search(self, query: str) -> ModelResponse:
        """Retrieve context for ``query`` and return the model's answer.

        :param query: the user's question
        :return: ``ModelResponse`` with the answer text, ``isSender=False``
            and ``message="success"``
        :raises RuntimeError: if the hybrid search reports ``success=False``;
            the error text is the search's own ``message``
        """
        query_response_storage.append(
            {
                "text": query,
                "isSender": True,
            }
        )

        result = self.hybrid_searcher.hybrid_search(query)
        if not result.success:
            # The searcher reports failures as data rather than raising, so
            # surface them here instead of failing later on missing ``data``.
            raise RuntimeError(result.message or "Hybrid search failed")

        context = self.get_and_join_context(result)
        model_response = self.question_answering.answer_question(query, context)

        # TODO: Manage memory for display messages
        query_response_storage.append(
            {
                "text": model_response,
                "isSender": False,
            }
        )

        return ModelResponse(
            text=model_response,
            isSender=False,
            message="success",
        )

    def get_and_join_context(self, search_result: HybridSearchResponse) -> str:
        """Join the ``"chunk-text"`` of every entry in ``search_result.data``.

        :param search_result: response whose ``data`` holds dicts with a
            ``"chunk-text"`` key; a missing (``None``) ``data`` is treated as
            empty
        :return: the chunk texts joined with ``", "`` (``""`` when there are none)
        """
        contexts = [point["chunk-text"] for point in search_result.data or []]
        return ", ".join(contexts)