Spaces:
Running
Running
abadesalex
committed on
Commit
•
47b5f0c
1
Parent(s):
dcb6c5f
Update to Qdrant db
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +4 -1
- Api/app/__pycache__/main.cpython-310.pyc +0 -0
- Api/app/__pycache__/qdrant.cpython-310.pyc +0 -0
- Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc +0 -0
- Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc +0 -0
- Api/app/db_local_storage/documents_db.py +0 -2
- Api/app/db_local_storage/files_db.py +0 -4
- Api/app/db_local_storage/vector_files_db.py +0 -1
- Api/app/infrastructure/models/__pycache__/my_models.cpython-310.pyc +0 -0
- Api/app/infrastructure/models/my_models.py +28 -0
- Api/app/infrastructure/repository/__pycache__/document_handeler_repository.cpython-310.pyc +0 -0
- Api/app/infrastructure/repository/__pycache__/query_search_repository.cpython-310.pyc +0 -0
- Api/app/infrastructure/repository/__pycache__/updateDocument_repository.cpython-310.pyc +0 -0
- Api/app/infrastructure/repository/document_handeler_repository.py +48 -0
- Api/app/infrastructure/repository/query_search_repository.py +20 -0
- Api/app/main.py +34 -11
- Api/app/modules/__pycache__/model.cpython-310.pyc +0 -0
- Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc +0 -0
- Api/app/modules/clearVariables/routes/clearVariables_route.py +4 -12
- Api/app/modules/denseEmbeddings/__pycache__/denseEmbeddings.cpython-310.pyc +0 -0
- Api/app/modules/denseEmbeddings/denseEmbeddings.py +69 -0
- Api/app/modules/{uploadDocument → documentHandeler}/controllers/__pycache__/FileUploadController.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/controllers/__pycache__/document_handeler_controller.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/controllers/__pycache__/file_upload_controller.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/controllers/document_handeler_controller.py +66 -0
- Api/app/modules/documentHandeler/dependencies/__pycache__/dependencies.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/dependencies/dependencies.py +112 -0
- Api/app/modules/documentHandeler/features/__pycache__/createEmbeddings_feature.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/features/__pycache__/deleteDocument_feature.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/features/__pycache__/extractText_feature.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/features/__pycache__/getAllChunkedText_feature.cpython-310.pyc +0 -0
- Api/app/modules/{uploadDocument → documentHandeler}/features/__pycache__/uploadDocument_feature.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/features/createEmbeddings_feature.py +69 -0
- Api/app/modules/documentHandeler/features/deleteDocument_feature.py +18 -0
- Api/app/modules/documentHandeler/features/extractText_feature.py +19 -0
- Api/app/modules/documentHandeler/features/getAllChunkedText_feature.py +32 -0
- Api/app/modules/documentHandeler/features/uploadDocument_feature.py +26 -0
- Api/app/modules/documentHandeler/routes/__pycache__/document_handeler_route.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/routes/__pycache__/uploadDocument_route.cpython-310.pyc +0 -0
- Api/app/modules/documentHandeler/routes/document_handeler_route.py +31 -0
- Api/app/modules/{uploadDocument → documentHandeler}/schemas/uploadDocument_schema.py +0 -0
- Api/app/modules/hybridSearcher/__pycache__/hybridSearcher.cpython-310.pyc +0 -0
- Api/app/modules/hybridSearcher/hybridSearcher.py +71 -0
- Api/app/modules/model.py +0 -6
- Api/app/modules/querySearch/__pycache__/dependecies.cpython-310.pyc +0 -0
- Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc +0 -0
- Api/app/modules/querySearch/controllers/querySearch_controller.py +3 -5
- Api/app/modules/querySearch/dependecies.py +85 -0
- Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc +0 -0
- Api/app/modules/querySearch/features/querySearch_feature.py +37 -75
.gitignore
CHANGED
@@ -2,4 +2,7 @@
|
|
2 |
Api/venv
|
3 |
|
4 |
# Other versions
|
5 |
-
Api/out1
|
|
|
|
|
|
|
|
2 |
Api/venv
|
3 |
|
4 |
# Other versions
|
5 |
+
Api/out1
|
6 |
+
|
7 |
+
# env
|
8 |
+
.env
|
Api/app/__pycache__/main.cpython-310.pyc
CHANGED
Binary files a/Api/app/__pycache__/main.cpython-310.pyc and b/Api/app/__pycache__/main.cpython-310.pyc differ
|
|
Api/app/__pycache__/qdrant.cpython-310.pyc
ADDED
Binary file (3.09 kB). View file
|
|
Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc
CHANGED
Binary files a/Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc and b/Api/app/db_local_storage/__pycache__/documents_db.cpython-310.pyc differ
|
|
Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc
CHANGED
Binary files a/Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc and b/Api/app/db_local_storage/__pycache__/vector_files_db.cpython-310.pyc differ
|
|
Api/app/db_local_storage/documents_db.py
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
documents_db = []
|
2 |
-
documents_text = []
|
|
|
|
|
|
Api/app/db_local_storage/files_db.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
FILES_NAMES_DATABASE = {}
|
2 |
-
FILES_DIRECTORY = "src/db_local_storage/files"
|
3 |
-
TEXT_FILES_DIRECTORY = "src/db_local_storage/text_files"
|
4 |
-
VECTOR_FILES_DIRECTORY = "src/db_local_storage/vector_files/vec_db.json"
|
|
|
|
|
|
|
|
|
|
Api/app/db_local_storage/vector_files_db.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
vector_files_db = {}
|
|
|
|
Api/app/infrastructure/models/__pycache__/my_models.cpython-310.pyc
ADDED
Binary file (1.41 kB). View file
|
|
Api/app/infrastructure/models/my_models.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel
|
2 |
+
from typing import Dict, List, Optional
|
3 |
+
|
4 |
+
|
5 |
+
class ModelResponse(BaseModel, extra="forbid"):
    """Response payload with a text body and a sender flag.

    Unknown fields are rejected because of ``extra="forbid"``.
    """

    text: str
    isSender: bool
    message: Optional[str] = None
|
9 |
+
|
10 |
+
|
11 |
+
class EmbeddingCreation(BaseModel, extra="forbid"):
    """Outcome of an embedding-creation request.

    Carries a success flag and an optional human-readable message; unknown
    fields are rejected because of ``extra="forbid"``.
    """

    success: bool
    message: Optional[str] = None
|
14 |
+
|
15 |
+
|
16 |
+
class HybridSearchResponse(BaseModel):
    """Result envelope: success flag, optional list of dict records, optional message."""

    success: bool
    data: Optional[List[Dict]] = None
    message: Optional[str] = None
|
20 |
+
|
21 |
+
|
22 |
+
class Chunk(BaseModel):
    """One piece of text together with its integer position index."""

    index: int
    text: str
|
25 |
+
|
26 |
+
|
27 |
+
class ChunksResponse(BaseModel):
    """Lists of ``Chunk`` objects grouped under string keys."""

    data: Dict[str, List[Chunk]]
|
Api/app/infrastructure/repository/__pycache__/document_handeler_repository.cpython-310.pyc
ADDED
Binary file (2.23 kB). View file
|
|
Api/app/infrastructure/repository/__pycache__/query_search_repository.cpython-310.pyc
ADDED
Binary file (1.14 kB). View file
|
|
Api/app/infrastructure/repository/__pycache__/updateDocument_repository.cpython-310.pyc
ADDED
Binary file (1.6 kB). View file
|
|
Api/app/infrastructure/repository/document_handeler_repository.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, List, Tuple
|
2 |
+
from app.qdrant import QdrantConnectionDb
|
3 |
+
from qdrant_client import models
|
4 |
+
|
5 |
+
|
6 |
+
class DocumentHandelerRepository:
    """Data-access wrapper around a single Qdrant collection.

    The client and the collection name are both taken from the injected
    connection object; every method forwards to that client.
    """

    def __init__(self, qdrant_connection_db: "QdrantConnectionDb"):
        self.client = qdrant_connection_db.get_client()
        self.collection_name = qdrant_connection_db.get_collection_name()

    def _scroll_all(self, **scroll_kwargs) -> List[Any]:
        """Collect the records of every page returned by ``client.scroll``.

        ``scroll`` returns ``(records, next_offset)`` and a single call only
        yields one page, so keep requesting pages until the returned offset
        is ``None``.

        :param scroll_kwargs: extra keyword arguments forwarded to ``scroll``
        :return: all records, in the order the pages were returned
        """
        records: List[Any] = []
        offset = None
        while True:
            page, offset = self.client.scroll(
                collection_name=self.collection_name,
                offset=offset,
                **scroll_kwargs,
            )
            records.extend(page)
            if offset is None:
                return records

    def find_points_by_document_name(self, document_name) -> "List[Any] | None":
        """Return the ids of every point whose ``document_id`` payload matches.

        :param document_name: value compared against the ``document_id`` key
        :return: list of point ids, or ``None`` when nothing matches
        """
        matches = self._scroll_all(
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="document_id",
                        match=models.MatchValue(value=document_name),
                    )
                ]
            ),
        )

        if matches:
            return [point.id for point in matches]

        # Callers test for ``None`` to detect "document not found".
        return None

    def delete_document_by_id(self, documents_id: List[Any]) -> "models.UpdateResult":
        """Delete the points with the given ids and return the client's result.

        :param documents_id: ids of the points to remove
        :return: whatever ``client.delete`` returns
        """
        return self.client.delete(
            collection_name=self.collection_name,
            points_selector=models.PointIdsList(points=documents_id),
        )

    def insert_points(
        self, points: "List[models.PointStruct]"
    ) -> "models.UpdateResult":
        """Upsert ``points`` into the collection, waiting for the operation to finish.

        :param points: points to insert or overwrite
        :return: whatever ``client.upsert`` returns
        """
        return self.client.upsert(
            collection_name=self.collection_name,
            wait=True,
            points=points,
        )

    def get_all_documents(
        self,
    ) -> "Tuple[List[models.Record], Any]":  # models.ScrollResult
        """Return every stored record with its payload and without vectors.

        The result keeps the ``(records, next_offset)`` shape of a single
        ``scroll`` call; because all pages are consumed here, the offset is
        always ``None``.

        :return: tuple of (all records, ``None``)
        """
        records = self._scroll_all(with_payload=True, with_vectors=False)
        return records, None
|
Api/app/infrastructure/repository/query_search_repository.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from app.qdrant import QdrantConnectionDb
|
2 |
+
from qdrant_client.models import QueryResponse, Prefetch, NamedVector
|
3 |
+
|
4 |
+
|
5 |
+
class QuerySearchRepository:
    """Read-only access to one Qdrant collection for hybrid search queries."""

    def __init__(self, qdrant_connection_db: QdrantConnectionDb):
        self.client = qdrant_connection_db.get_client()
        self.collection_name = qdrant_connection_db.get_collection_name()

    def find_text_by_hybrid_search(
        self, prefetch_context: Prefetch, dense_vector: NamedVector, limit: int = 10
    ) -> QueryResponse:
        """Run ``query_points`` with a prefetch stage and a dense query vector.

        :param prefetch_context: prefetch definition forwarded as ``prefetch``
        :param dense_vector: named vector whose ``vector`` is used as the query
        :param limit: maximum number of points to return (defaults to 10)
        :return: the client's ``QueryResponse``, payloads included
        """
        return self.client.query_points(
            collection_name=self.collection_name,
            prefetch=prefetch_context,
            query=dense_vector.vector,
            using="text-dense",
            with_payload=True,
            limit=limit,
        )
|
Api/app/main.py
CHANGED
@@ -1,22 +1,45 @@
|
|
1 |
import logging
|
|
|
2 |
|
3 |
import uvicorn
|
4 |
from fastapi import APIRouter, FastAPI
|
5 |
from fastapi.middleware.cors import CORSMiddleware
|
6 |
from fastapi.responses import FileResponse
|
7 |
from fastapi.staticfiles import StaticFiles
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
-
from app.modules.querySearch.routes.querySearch_route import (
|
10 |
-
router as query_search_routes,
|
11 |
-
)
|
12 |
-
from app.modules.uploadDocument.routes.uploadDocument_route import (
|
13 |
-
router as upload_file_routes,
|
14 |
-
)
|
15 |
-
from app.modules.clearVariables.routes.clearVariables_route import (
|
16 |
-
router as clear_variables_routes,
|
17 |
-
)
|
18 |
|
19 |
-
app = FastAPI()
|
20 |
|
21 |
origins = [
|
22 |
"http://localhost:8000",
|
@@ -36,7 +59,7 @@ app.add_middleware(
|
|
36 |
)
|
37 |
|
38 |
app_router = APIRouter(prefix="/api")
|
39 |
-
app_router.include_router(upload_file_routes, prefix="/
|
40 |
app_router.include_router(query_search_routes, prefix="/query", tags=["query"])
|
41 |
app_router.include_router(clear_variables_routes, prefix="/clear", tags=["clear"])
|
42 |
|
|
|
1 |
import logging
|
2 |
+
from contextlib import asynccontextmanager
|
3 |
|
4 |
import uvicorn
|
5 |
from fastapi import APIRouter, FastAPI
|
6 |
from fastapi.middleware.cors import CORSMiddleware
|
7 |
from fastapi.responses import FileResponse
|
8 |
from fastapi.staticfiles import StaticFiles
|
9 |
+
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, pipeline
|
10 |
+
|
11 |
+
from app.modules.clearVariables.routes.clearVariables_route import \
|
12 |
+
router as clear_variables_routes
|
13 |
+
from app.modules.documentHandeler.routes.document_handeler_route import \
|
14 |
+
router as upload_file_routes
|
15 |
+
from app.modules.querySearch.routes.querySearch_route import \
|
16 |
+
router as query_search_routes
|
17 |
+
|
18 |
+
|
19 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the tokenizers, models and QA pipeline once and yield them as state.

    The yielded mapping holds the dense and sparse tokenizer/model pairs and
    the question-answering pipeline under fixed string keys.
    """
    dense_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    sparse_model_name = "prithivida/Splade_PP_en_v1"
    qa_model_name = "deepset/roberta-base-squad2"

    dense_tokenizer = AutoTokenizer.from_pretrained(dense_model_name)
    dense_model = AutoModel.from_pretrained(dense_model_name)

    sparse_tokenizer = AutoTokenizer.from_pretrained(sparse_model_name)
    sparse_model = AutoModelForMaskedLM.from_pretrained(sparse_model_name)

    qa_pipeline = pipeline("question-answering", model=qa_model_name)

    yield {
        "dense_tokenizer": dense_tokenizer,
        "dense_model": dense_model,
        "sparse_tokenizer": sparse_tokenizer,
        "sparse_model": sparse_model,
        "qa_pipeline": qa_pipeline,
    }
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
+
app = FastAPI(lifespan=lifespan)
|
43 |
|
44 |
origins = [
|
45 |
"http://localhost:8000",
|
|
|
59 |
)
|
60 |
|
61 |
app_router = APIRouter(prefix="/api")
|
62 |
+
app_router.include_router(upload_file_routes, prefix="/document", tags=["document"])
|
63 |
app_router.include_router(query_search_routes, prefix="/query", tags=["query"])
|
64 |
app_router.include_router(clear_variables_routes, prefix="/clear", tags=["clear"])
|
65 |
|
Api/app/modules/__pycache__/model.cpython-310.pyc
CHANGED
Binary files a/Api/app/modules/__pycache__/model.cpython-310.pyc and b/Api/app/modules/__pycache__/model.cpython-310.pyc differ
|
|
Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc
CHANGED
Binary files a/Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc and b/Api/app/modules/clearVariables/routes/__pycache__/clearVariables_route.cpython-310.pyc differ
|
|
Api/app/modules/clearVariables/routes/clearVariables_route.py
CHANGED
@@ -1,20 +1,12 @@
|
|
1 |
-
from app.db_local_storage.documents_db import documents_db, documents_text
|
2 |
from app.db_local_storage.in_memory_db import query_response_storage
|
3 |
-
from
|
4 |
-
from app.modules.uploadDocument.controllers.file_upload_controller import \
|
5 |
-
FileUploadController
|
6 |
-
from fastapi import APIRouter, File, HTTPException, UploadFile
|
7 |
from fastapi.responses import JSONResponse
|
8 |
|
9 |
router = APIRouter()
|
10 |
-
|
11 |
|
12 |
@router.delete("/clear_variables/")
|
13 |
async def clear_variables():
|
14 |
-
vector_files_db.clear()
|
15 |
-
documents_db.clear()
|
16 |
query_response_storage.clear()
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
return JSONResponse(status_code=200, content={"message": "All variables cleared"})
|
|
|
|
|
1 |
from app.db_local_storage.in_memory_db import query_response_storage
|
2 |
+
from fastapi import APIRouter
|
|
|
|
|
|
|
3 |
from fastapi.responses import JSONResponse
|
4 |
|
5 |
router = APIRouter()
|
6 |
+
|
7 |
|
8 |
@router.delete("/clear_variables/")
async def clear_variables():
    """Empty the in-memory query/response storage and confirm with a 200 response."""
    query_response_storage.clear()

    return JSONResponse(status_code=200, content={"message": "All variables cleared"})
|
|
|
|
Api/app/modules/denseEmbeddings/__pycache__/denseEmbeddings.cpython-310.pyc
ADDED
Binary file (2.33 kB). View file
|
|
Api/app/modules/denseEmbeddings/denseEmbeddings.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from qdrant_client import models
|
3 |
+
from qdrant_client.models import NamedVector
|
4 |
+
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
|
5 |
+
|
6 |
+
|
7 |
+
class DenseEmbeddings:
    """Produce the dense and the sparse vector for a piece of text.

    Despite its name the class wraps both encoders; models and tokenizers
    are injected ready-made and are only called, never loaded, here.
    """

    def __init__(
        self,
        dense_model: AutoModel,
        dense_tokenizer: AutoTokenizer,
        sparse_model: AutoModelForMaskedLM,
        sparse_tokenizer: AutoTokenizer,
    ):
        self.dense_model = dense_model
        self.dense_tokenizer = dense_tokenizer
        self.sparse_model = sparse_model
        self.sparse_tokenizer = sparse_tokenizer

    def get_dense_vector(self, text: str) -> NamedVector:
        """
        Get dense vector from the dense model

        The vector is the mean of the last hidden state over the token axis.

        :param text: str
        :return: NamedVector named "text-dense"
        """
        inputs = self.dense_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.dense_model(**inputs)

        pooled = torch.mean(outputs.last_hidden_state, dim=1).squeeze()

        # Hand over plain Python floats rather than a numpy array so the value
        # is a list of floats before it reaches the NamedVector model.
        return NamedVector(name="text-dense", vector=pooled.tolist())

    def get_sparse_vector(self, text: str) -> models.SparseVector:
        """
        Get sparse vector from the sparse model

        Each distinct input token id becomes one index; its value is the
        highest logit the model assigns to that id at any position. Ids whose
        score is not above -5.0 are dropped.

        :param text: str
        :return: SparseVector
        """
        inputs = self.sparse_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.sparse_model(**inputs)

        # ``text`` is one string, so the batch holds exactly one sequence.
        # Assumes masked-LM logits are laid out (position, vocabulary id), as
        # documented for the transformers masked-LM output; the max over
        # positions then leaves one score per vocabulary id.
        vocab_scores = outputs.logits[0].max(dim=0).values
        token_ids = inputs["input_ids"][0].tolist()

        # Look every token up by its own vocabulary id. Pairing the two
        # tensors positionally would attach the score of vocabulary id ``i``
        # to whatever token happens to sit at position ``i``.
        weights = {}
        for token_id in token_ids:
            score = float(vocab_scores[token_id])
            if score > -5.0:
                weights[token_id] = score

        return models.SparseVector(
            indices=list(weights.keys()),
            values=list(weights.values()),
        )
|
Api/app/modules/{uploadDocument → documentHandeler}/controllers/__pycache__/FileUploadController.cpython-310.pyc
RENAMED
File without changes
|
Api/app/modules/documentHandeler/controllers/__pycache__/document_handeler_controller.cpython-310.pyc
ADDED
Binary file (2.59 kB). View file
|
|
Api/app/modules/documentHandeler/controllers/__pycache__/file_upload_controller.cpython-310.pyc
ADDED
Binary file (1.97 kB). View file
|
|
Api/app/modules/documentHandeler/controllers/document_handeler_controller.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import HTTPException, UploadFile
|
2 |
+
from fastapi.responses import JSONResponse
|
3 |
+
|
4 |
+
from app.infrastructure.repository.document_handeler_repository import (
|
5 |
+
DocumentHandelerRepository,
|
6 |
+
)
|
7 |
+
from app.modules.documentHandeler.features.createEmbeddings_feature import (
|
8 |
+
CreateEmbeddingsFeature,
|
9 |
+
)
|
10 |
+
from app.modules.documentHandeler.features.deleteDocument_feature import (
|
11 |
+
DeleteDocumentFeature,
|
12 |
+
)
|
13 |
+
from app.modules.documentHandeler.features.extractText_feature import ExtractTextFeature
|
14 |
+
from app.modules.documentHandeler.features.getAllChunkedText_feature import (
|
15 |
+
GetAllChunkedTextFeature,
|
16 |
+
)
|
17 |
+
|
18 |
+
|
19 |
+
class DocumentHandelerController:
    """Translate feature results and failures into HTTP responses.

    Every handler converts an unexpected exception into an
    ``HTTPException`` with status 500, chained to the original error.
    """

    def __init__(
        self,
        delete_document_feature: DeleteDocumentFeature,
        create_embeddings_feature: CreateEmbeddingsFeature,
        get_all_chunked_text_feature: GetAllChunkedTextFeature,
    ):
        self.create_embeddings_feature = create_embeddings_feature
        self.delete_document_feature = delete_document_feature
        self.get_all_chunked_text_feature = get_all_chunked_text_feature

    async def handle_file_upload(self, file: UploadFile) -> JSONResponse:
        """Extract the text of an uploaded PDF and create embeddings for it.

        :param file: the uploaded file
        :return: 200 response carrying the dumped creation result
        :raises HTTPException: status 500 if any step fails
        """
        try:
            text_file = await ExtractTextFeature.extract_text_from_pdf(file)
            result = await self.create_embeddings_feature.create_embeddings(
                text_file, file.filename
            )

            return JSONResponse(status_code=200, content=result.model_dump())

        except Exception as e:
            # Keep the original error as the cause so it shows up in tracebacks.
            raise HTTPException(status_code=500, detail="Probelm on controller") from e

    async def delete_document(self, text: str) -> JSONResponse:
        """Delete the document identified by ``text``.

        :param text: document name handed to the delete feature
        :return: 200 response when deleted, 404 response when not found
        :raises HTTPException: status 500 if the feature raises
        """
        try:
            result = await self.delete_document_feature.delete_document_by_filename(
                text
            )
            if result:
                return JSONResponse(
                    status_code=200, content={"message": "Document deleted"}
                )
            return JSONResponse(
                status_code=404, content={"message": "Document not found"}
            )

        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    async def get_all_chunks(self) -> JSONResponse:
        """Return every stored chunk as produced by the chunk-listing feature.

        :return: 200 response carrying the dumped chunks result
        :raises HTTPException: status 500 if the feature raises
        """
        try:
            result = await self.get_all_chunked_text_feature.get_all_chunked_text()
            return JSONResponse(status_code=200, content=result.model_dump())

        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e
|
Api/app/modules/documentHandeler/dependencies/__pycache__/dependencies.cpython-310.pyc
ADDED
Binary file (3.09 kB). View file
|
|
Api/app/modules/documentHandeler/dependencies/dependencies.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import Depends, Request
|
2 |
+
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
|
3 |
+
|
4 |
+
from app.infrastructure.repository.document_handeler_repository import (
|
5 |
+
DocumentHandelerRepository,
|
6 |
+
)
|
7 |
+
from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
|
8 |
+
from app.modules.documentHandeler.controllers.document_handeler_controller import (
|
9 |
+
DocumentHandelerController,
|
10 |
+
)
|
11 |
+
from app.modules.documentHandeler.features.createEmbeddings_feature import (
|
12 |
+
CreateEmbeddingsFeature,
|
13 |
+
)
|
14 |
+
from app.modules.documentHandeler.features.deleteDocument_feature import (
|
15 |
+
DeleteDocumentFeature,
|
16 |
+
)
|
17 |
+
from app.modules.documentHandeler.features.getAllChunkedText_feature import (
|
18 |
+
GetAllChunkedTextFeature,
|
19 |
+
)
|
20 |
+
from app.qdrant import QdrantConnectionDb
|
21 |
+
|
22 |
+
|
23 |
+
def get_qdrant_connection_db() -> QdrantConnectionDb:
    """Provide a new ``QdrantConnectionDb`` instance."""
    return QdrantConnectionDb()
|
25 |
+
|
26 |
+
|
27 |
+
def get_document_handeler_repository(
    qdrant_connection_db: QdrantConnectionDb = Depends(get_qdrant_connection_db),
):
    """Provide a ``DocumentHandelerRepository`` bound to the injected connection."""
    return DocumentHandelerRepository(qdrant_connection_db)
|
31 |
+
|
32 |
+
|
33 |
+
def get_dense_model(request: Request) -> AutoModel:
    """Return the object stored under ``"dense_model"`` in the request's state."""
    return request.scope["state"]["dense_model"]
|
35 |
+
|
36 |
+
|
37 |
+
def get_sparse_model(request: Request) -> AutoModelForMaskedLM:
    """Return the object stored under ``"sparse_model"`` in the request's state."""
    return request.scope["state"]["sparse_model"]
|
39 |
+
|
40 |
+
|
41 |
+
def get_dense_tokenizer(request: Request) -> AutoTokenizer:
    """Return the object stored under ``"dense_tokenizer"`` in the request's state."""
    return request.scope["state"]["dense_tokenizer"]
|
43 |
+
|
44 |
+
|
45 |
+
def get_sparse_tokenizer(request: Request) -> AutoTokenizer:
    """Return the object stored under ``"sparse_tokenizer"`` in the request's state."""
    return request.scope["state"]["sparse_tokenizer"]
|
47 |
+
|
48 |
+
|
49 |
+
def get_dense_embeddings(
    dense_model: AutoModel = Depends(get_dense_model),
    dense_tokenizer: AutoTokenizer = Depends(get_dense_tokenizer),
    sparse_model: AutoModelForMaskedLM = Depends(get_sparse_model),
    sparse_tokenizer: AutoTokenizer = Depends(get_sparse_tokenizer),
):
    """Provide a ``DenseEmbeddings`` built from the injected models and tokenizers."""
    return DenseEmbeddings(
        dense_model=dense_model,
        dense_tokenizer=dense_tokenizer,
        sparse_model=sparse_model,
        sparse_tokenizer=sparse_tokenizer,
    )
|
61 |
+
|
62 |
+
|
63 |
+
def get_all_chunked_text_feature(
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Provide a ``GetAllChunkedTextFeature`` backed by the injected repository."""
    return GetAllChunkedTextFeature(document_handeler_repository)
|
69 |
+
|
70 |
+
|
71 |
+
def get_create_embeddings_feature(
    dense_embeddings: DenseEmbeddings = Depends(get_dense_embeddings),
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Provide a ``CreateEmbeddingsFeature`` from the injected embedder and repository."""
    return CreateEmbeddingsFeature(dense_embeddings, document_handeler_repository)
|
78 |
+
|
79 |
+
|
80 |
+
def get_delete_document_feature(
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Provide a ``DeleteDocumentFeature`` backed by the injected repository."""
    return DeleteDocumentFeature(document_handeler_repository)
|
86 |
+
|
87 |
+
|
88 |
+
def get_document_handeler_controller(
    delete_document_feature: DeleteDocumentFeature = Depends(
        get_delete_document_feature
    ),
    create_embeddings_feature: CreateEmbeddingsFeature = Depends(
        get_create_embeddings_feature
    ),
    # The parameter shares its name with the provider function; the default is
    # evaluated when this function is defined, so it still refers to the
    # module-level provider.
    get_all_chunked_text_feature: GetAllChunkedTextFeature = Depends(
        get_all_chunked_text_feature
    ),
):
    """Provide a ``DocumentHandelerController`` wired with the three injected features."""
    return DocumentHandelerController(
        delete_document_feature=delete_document_feature,
        create_embeddings_feature=create_embeddings_feature,
        get_all_chunked_text_feature=get_all_chunked_text_feature,
    )
|
104 |
+
|
105 |
+
|
106 |
+
# NOTE(review): this redefines get_create_embeddings_feature with a body
# identical to the definition earlier in this module; one of the two can go.
def get_create_embeddings_feature(
    dense_embeddings: DenseEmbeddings = Depends(get_dense_embeddings),
    document_handeler_repository: DocumentHandelerRepository = Depends(
        get_document_handeler_repository
    ),
):
    """Provide a ``CreateEmbeddingsFeature`` from the injected embedder and repository."""
    return CreateEmbeddingsFeature(dense_embeddings, document_handeler_repository)
|
Api/app/modules/documentHandeler/features/__pycache__/createEmbeddings_feature.cpython-310.pyc
ADDED
Binary file (2.59 kB). View file
|
|
Api/app/modules/documentHandeler/features/__pycache__/deleteDocument_feature.cpython-310.pyc
ADDED
Binary file (985 Bytes). View file
|
|
Api/app/modules/documentHandeler/features/__pycache__/extractText_feature.cpython-310.pyc
ADDED
Binary file (839 Bytes). View file
|
|
Api/app/modules/documentHandeler/features/__pycache__/getAllChunkedText_feature.cpython-310.pyc
ADDED
Binary file (1.4 kB). View file
|
|
Api/app/modules/{uploadDocument → documentHandeler}/features/__pycache__/uploadDocument_feature.cpython-310.pyc
RENAMED
File without changes
|
Api/app/modules/documentHandeler/features/createEmbeddings_feature.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
import uuid
|
3 |
+
|
4 |
+
from qdrant_client.models import PointStruct
|
5 |
+
|
6 |
+
from app.infrastructure.models.my_models import EmbeddingCreation
|
7 |
+
from app.infrastructure.repository.document_handeler_repository import (
|
8 |
+
DocumentHandelerRepository,
|
9 |
+
)
|
10 |
+
from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
|
11 |
+
|
12 |
+
|
13 |
+
class CreateEmbeddingsFeature:
    """Split a document's text into chunks, embed each chunk and store the points."""

    def __init__(
        self,
        dense_embeddings: "DenseEmbeddings",
        document_handeler_repository: "DocumentHandelerRepository",
    ):
        self.dense_embeddings = dense_embeddings
        self.document_handeler_repository = document_handeler_repository

    def chunk_text(self, text: str, chunk_size: int = 512) -> List[str]:
        """
        Chunk text into smaller pieces

        Pieces are consecutive, non-overlapping character slices; the last
        one may be shorter than ``chunk_size``.

        :param text: str
        :param chunk_size: int, must be at least 1
        :return: List[str]
        :raises ValueError: if ``chunk_size`` is smaller than 1
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

    async def create_embeddings(self, text: str, filename: str) -> "EmbeddingCreation":
        """
        Create embeddings for the text

        Every chunk becomes one point with a random UUID id, a dense and a
        sparse vector, and a payload recording the document id, the chunk
        index, the filename and the chunk text.

        :param text: str
        :param filename: str
        :return: EmbeddingCreation
        """
        # Local import: needed only for the explicit status comparison below.
        from qdrant_client.models import UpdateStatus

        chunks = self.chunk_text(text)
        if not chunks:
            # Nothing to embed; do not send an empty batch to the repository.
            return EmbeddingCreation(success=False, message="No text found to embed")

        # NOTE(review): everything after the first "." is dropped, so
        # "a.b.pdf" and "a.c.pdf" share the id "a" - confirm this is intended.
        document_id = filename.split(".")[0]

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "text-dense": self.dense_embeddings.get_dense_vector(chunk).vector,
                    "text-sparse": self.dense_embeddings.get_sparse_vector(chunk),
                },
                payload={
                    "document_id": document_id,
                    "chunk_index": i,
                    "filename": filename,
                    "chunk-text": chunk,
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        result = self.document_handeler_repository.insert_points(points)
        # ``status`` is an enum member, which is truthy whatever its value, so
        # compare it against the completed state explicitly.
        if result.status == UpdateStatus.COMPLETED:
            return EmbeddingCreation(
                success=True, message="Embeddings created successfully"
            )
        return EmbeddingCreation(success=False, message="Embeddings creation failed")
|
Api/app/modules/documentHandeler/features/deleteDocument_feature.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from app.infrastructure.repository.document_handeler_repository import (
|
2 |
+
DocumentHandelerRepository,
|
3 |
+
)
|
4 |
+
|
5 |
+
|
6 |
+
class DeleteDocumentFeature:
    """Delete every stored point that belongs to a named document."""

    def __init__(self, update_document_repository: "DocumentHandelerRepository"):
        self.update_document_repository = update_document_repository

    async def delete_document_by_filename(self, document_name: str) -> bool:
        """Look up the document's point ids and delete them.

        :param document_name: name passed to the repository lookup
        :return: ``True`` if points were found and deleted, ``False`` if the
            lookup returned nothing
        """
        point_ids = self.update_document_repository.find_points_by_document_name(
            document_name
        )
        # Treat an empty result like ``None`` so that an empty id list is never
        # sent to the delete call.
        if not point_ids:
            return False
        self.update_document_repository.delete_document_by_id(point_ids)
        return True
|
Api/app/modules/documentHandeler/features/extractText_feature.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import os
|
3 |
+
|
4 |
+
from fastapi import UploadFile
|
5 |
+
import pdfplumber
|
6 |
+
|
7 |
+
|
8 |
+
class ExtractTextFeature:
    """Extract the text of an uploaded PDF."""

    @staticmethod
    async def extract_text_from_pdf(file: UploadFile) -> str:
        """Read the upload fully and return the concatenated text of all pages.

        :param file: uploaded PDF
        :return: page texts joined in page order, with no separator
        """
        content = await file.read()
        with pdfplumber.open(io.BytesIO(content)) as pdf:
            # ``or ""`` guards against a page yielding ``None`` instead of a
            # string, which would otherwise break the concatenation.
            return "".join(page.extract_text() or "" for page in pdf.pages)
|
Api/app/modules/documentHandeler/features/getAllChunkedText_feature.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from app.infrastructure.models.my_models import ChunksResponse
|
2 |
+
from app.infrastructure.repository.document_handeler_repository import (
|
3 |
+
DocumentHandelerRepository,
|
4 |
+
)
|
5 |
+
|
6 |
+
|
7 |
+
class GetAllChunkedTextFeature:
    """List the stored chunks grouped by document and ordered by chunk index."""

    def __init__(self, document_handeler_repository: DocumentHandelerRepository):
        self.document_handeler_repository = document_handeler_repository

    async def get_all_chunked_text(self) -> ChunksResponse:
        """Build a ``ChunksResponse`` from the repository's records.

        Each record's payload supplies ``document_id`` (the grouping key),
        ``chunk_index`` and ``chunk-text``.

        :return: ChunksResponse mapping document id to its chunks, sorted by index
        """
        # The repository returns a tuple whose first element is the record list.
        records = self.document_handeler_repository.get_all_documents()[0]

        chunks_by_document = {}
        for record in records:
            payload = record.payload
            document_id = payload["document_id"]
            chunk_index = payload["chunk_index"]
            text = payload["chunk-text"]
            chunks_by_document.setdefault(document_id, []).append(
                {"index": chunk_index, "text": text}
            )

        for chunks in chunks_by_document.values():
            chunks.sort(key=lambda chunk: chunk["index"])

        return ChunksResponse(data=chunks_by_document)
|
Api/app/modules/documentHandeler/features/uploadDocument_feature.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import os
|
2 |
+
# from typing import Dict
|
3 |
+
|
4 |
+
# from fastapi import UploadFile
|
5 |
+
# from app.db_local_storage.files_db import FILES_DIRECTORY, FILES_NAMES_DATABASE
|
6 |
+
# from app.db_local_storage.documents_db import documents_db
|
7 |
+
|
8 |
+
|
9 |
+
# class UploadDocumentFeature:
|
10 |
+
|
11 |
+
# @staticmethod
|
12 |
+
# async def uploadFile(document: UploadFile) -> Dict[str, str]:
|
13 |
+
# """
|
14 |
+
# Upload a file to the server
|
15 |
+
# :param document: the file to upload
|
16 |
+
# :return: a message to confirm the upload
|
17 |
+
# """
|
18 |
+
|
19 |
+
# data = {
|
20 |
+
# "id": len(documents_db) + 1,
|
21 |
+
# "filename": document.filename,
|
22 |
+
# }
|
23 |
+
|
24 |
+
# documents_db.append(data)
|
25 |
+
|
26 |
+
# return {"message": "Document Updated"}
|
Api/app/modules/documentHandeler/routes/__pycache__/document_handeler_route.cpython-310.pyc
ADDED
Binary file (1.16 kB). View file
|
|
Api/app/modules/documentHandeler/routes/__pycache__/uploadDocument_route.cpython-310.pyc
ADDED
Binary file (1.94 kB). View file
|
|
Api/app/modules/documentHandeler/routes/document_handeler_route.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, Depends, File, UploadFile
|
2 |
+
|
3 |
+
from app.modules.documentHandeler.controllers.document_handeler_controller import \
|
4 |
+
DocumentHandelerController
|
5 |
+
from app.modules.documentHandeler.dependencies.dependencies import \
|
6 |
+
get_document_handeler_controller
|
7 |
+
|
8 |
+
router = APIRouter()
|
9 |
+
|
10 |
+
|
11 |
+
@router.get("/get_chunks")
async def get_all_documents(
    controller: DocumentHandelerController = Depends(get_document_handeler_controller),
):
    """Return whatever the injected controller reports for all chunks."""
    chunks = await controller.get_all_chunks()
    return chunks
|
16 |
+
|
17 |
+
|
18 |
+
@router.delete("/delete_document/{filename}")
async def delete_document(
    filename: str,
    controller: DocumentHandelerController = Depends(get_document_handeler_controller),
):
    """Pass the file name from the URL path to the controller's delete call."""
    outcome = await controller.delete_document(filename)
    return outcome
|
24 |
+
|
25 |
+
|
26 |
+
@router.post("/upload_file")
async def upload_file(
    file: UploadFile = File(...),
    controller: DocumentHandelerController = Depends(get_document_handeler_controller),
):
    """Hand the required uploaded file to the controller's upload handler."""
    outcome = await controller.handle_file_upload(file)
    return outcome
|
Api/app/modules/{uploadDocument → documentHandeler}/schemas/uploadDocument_schema.py
RENAMED
File without changes
|
Api/app/modules/hybridSearcher/__pycache__/hybridSearcher.cpython-310.pyc
ADDED
Binary file (2.41 kB). View file
|
|
Api/app/modules/hybridSearcher/hybridSearcher.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from qdrant_client import models
|
2 |
+
from qdrant_client.conversions import common_types as types
|
3 |
+
from qdrant_client.models import NamedVector, SparseVector
|
4 |
+
|
5 |
+
from app.infrastructure.models.my_models import HybridSearchResponse
|
6 |
+
from app.infrastructure.repository.query_search_repository import QuerySearchRepository
|
7 |
+
from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
|
8 |
+
from app.qdrant import QdrantConnectionDb
|
9 |
+
|
10 |
+
|
11 |
+
class HybridSearcher:
    """Run a combined dense + sparse search for a user query.

    Query vectors come from ``dense_embeddings``; the lookup itself is
    delegated to ``query_search_repository``.
    """

    def __init__(
        self,
        dense_embeddings: DenseEmbeddings,
        query_search_repository: QuerySearchRepository,
    ):
        self.dense_embeddings = dense_embeddings
        self.query_search_repository = query_search_repository

    def sparse_dense_rrf_prefetch(
        self,
        sparse_vector: SparseVector,
        dense_vector: NamedVector,
        limit: int = 10,
    ) -> models.Prefetch:
        """
        Build a prefetch that fuses dense and sparse candidates with RRF

        :param sparse_vector: sparse query vector, searched using "text-sparse"
        :param dense_vector: named dense query vector; only its ``vector``
            attribute is used, searched using "text-dense"
        :param limit: limit applied to each of the two inner prefetches
        :return: models.Prefetch
        """
        return models.Prefetch(
            prefetch=[
                models.Prefetch(
                    query=dense_vector.vector,
                    using="text-dense",
                    limit=limit,
                ),
                models.Prefetch(
                    query=sparse_vector,
                    using="text-sparse",
                    limit=limit,
                ),
            ],
            query=models.FusionQuery(
                fusion=models.Fusion.RRF,
            ),
        )

    def hybrid_search(self, user_query: str) -> HybridSearchResponse:
        """
        Hybrid search

        :param user_query: str
        :return: HybridSearchResponse with success=True and one
            {"chunk-text": ...} entry per returned point, or success=False
            and an error message when any step raises
        """
        try:
            sparse_vector = self.dense_embeddings.get_sparse_vector(user_query)
            dense_vector = self.dense_embeddings.get_dense_vector(user_query)

            prefetch_context = self.sparse_dense_rrf_prefetch(
                sparse_vector, dense_vector
            )

            result = self.query_search_repository.find_text_by_hybrid_search(
                prefetch_context, dense_vector
            )

            response_data = [
                {"chunk-text": point.payload["chunk-text"]} for point in result.points
            ]
            return HybridSearchResponse(success=True, data=response_data)

        except Exception as e:
            # Deliberate catch-all: failures are reported in the response
            # object rather than raised to the caller.
            return HybridSearchResponse(
                success=False, message=f"Database operation failed: {str(e)}"
            )
|
Api/app/modules/model.py
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
from sentence_transformers import SentenceTransformer
|
2 |
-
from transformers import pipeline
|
3 |
-
|
4 |
-
|
5 |
-
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
6 |
-
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Api/app/modules/querySearch/__pycache__/dependecies.cpython-310.pyc
ADDED
Binary file (2.95 kB). View file
|
|
Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc
CHANGED
Binary files a/Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc and b/Api/app/modules/querySearch/controllers/__pycache__/querySearch_controller.cpython-310.pyc differ
|
|
Api/app/modules/querySearch/controllers/querySearch_controller.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
from typing import Any
|
2 |
from fastapi import HTTPException
|
3 |
from fastapi.responses import JSONResponse
|
4 |
|
@@ -10,13 +9,12 @@ class QuerySearchController:
|
|
10 |
def __init__(self, query_search_feature: QuerySearchFeature):
|
11 |
self.query_search_feature = query_search_feature
|
12 |
|
13 |
-
async def handle_query_search(self,
|
14 |
try:
|
15 |
|
16 |
-
result = await self.query_search_feature.query_search(
|
17 |
-
message = result.get("message", "No message provided")
|
18 |
|
19 |
-
return JSONResponse(status_code=200, content=
|
20 |
|
21 |
except Exception as e:
|
22 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
1 |
from fastapi import HTTPException
|
2 |
from fastapi.responses import JSONResponse
|
3 |
|
|
|
9 |
def __init__(self, query_search_feature: QuerySearchFeature):
|
10 |
self.query_search_feature = query_search_feature
|
11 |
|
12 |
+
async def handle_query_search(self, query: str) -> JSONResponse:
    """
    Run the query search feature and wrap its result in a JSON response

    :param query: the text to search and answer
    :return: JSONResponse with status 200 and ``result.model_dump()`` as content
    :raises HTTPException: status 500 with the error text when anything fails
    """
    try:
        result = await self.query_search_feature.query_search(query)

        return JSONResponse(status_code=200, content=result.model_dump())

    except Exception as e:
        # Chain the cause so the original traceback is kept on the HTTP error.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
Api/app/modules/querySearch/dependecies.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import Depends, Request
|
2 |
+
from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer,
|
3 |
+
pipeline)
|
4 |
+
|
5 |
+
from app.infrastructure.repository.query_search_repository import \
|
6 |
+
QuerySearchRepository
|
7 |
+
from app.modules.denseEmbeddings.denseEmbeddings import DenseEmbeddings
|
8 |
+
from app.modules.hybridSearcher.hybridSearcher import HybridSearcher
|
9 |
+
from app.modules.querySearch.controllers.querySearch_controller import \
|
10 |
+
QuerySearchController
|
11 |
+
from app.modules.querySearch.features.querySearch_feature import \
|
12 |
+
QuerySearchFeature
|
13 |
+
from app.modules.questionAnswer.questionAnswer import QuestionAnswering
|
14 |
+
from app.qdrant import QdrantConnectionDb
|
15 |
+
|
16 |
+
|
17 |
+
def get_qdrant_connection_db() -> QdrantConnectionDb:
    """Create and return a new QdrantConnectionDb."""
    connection = QdrantConnectionDb()
    return connection
|
19 |
+
|
20 |
+
|
21 |
+
def get_query_search_repository(
    qdrant_connection_db: QdrantConnectionDb = Depends(get_qdrant_connection_db),
):
    """Build a QuerySearchRepository from the injected Qdrant connection."""
    repository = QuerySearchRepository(qdrant_connection_db)
    return repository
|
25 |
+
|
26 |
+
|
27 |
+
def get_dense_model(request: Request) -> AutoModel:
    """Return the "dense_model" entry of the state stored in the request scope."""
    state = request.scope["state"]
    return state["dense_model"]
|
29 |
+
|
30 |
+
|
31 |
+
def get_sparse_model(request: Request) -> AutoModelForMaskedLM:
    """Return the "sparse_model" entry of the state stored in the request scope."""
    state = request.scope["state"]
    return state["sparse_model"]
|
33 |
+
|
34 |
+
|
35 |
+
def get_dense_tokenizer(request: Request) -> AutoTokenizer:
    """Return the "dense_tokenizer" entry of the state stored in the request scope."""
    state = request.scope["state"]
    return state["dense_tokenizer"]
|
37 |
+
|
38 |
+
|
39 |
+
def get_sparse_tokenizer(request: Request) -> AutoTokenizer:
    """Return the "sparse_tokenizer" entry of the state stored in the request scope."""
    state = request.scope["state"]
    return state["sparse_tokenizer"]
|
41 |
+
|
42 |
+
|
43 |
+
def get_dense_embeddings(
    dense_model: AutoModel = Depends(get_dense_model),
    dense_tokenizer: AutoTokenizer = Depends(get_dense_tokenizer),
    sparse_model: AutoModelForMaskedLM = Depends(get_sparse_model),
    sparse_tokenizer: AutoTokenizer = Depends(get_sparse_tokenizer),
):
    """Build a DenseEmbeddings from the injected dense and sparse models and tokenizers."""
    components = {
        "dense_model": dense_model,
        "dense_tokenizer": dense_tokenizer,
        "sparse_model": sparse_model,
        "sparse_tokenizer": sparse_tokenizer,
    }
    return DenseEmbeddings(**components)
|
55 |
+
|
56 |
+
|
57 |
+
def get_qa_pipeline(request: Request):
    """Return the "qa_pipeline" entry of the state stored in the request scope."""
    state = request.scope["state"]
    return state["qa_pipeline"]
|
59 |
+
|
60 |
+
|
61 |
+
def get_question_ansering(qa_pipline: pipeline = Depends(get_qa_pipeline)):
    """Wrap the injected question-answering pipeline in a QuestionAnswering."""
    answerer = QuestionAnswering(qa_pipline)
    return answerer
|
63 |
+
|
64 |
+
|
65 |
+
def get_hybrid_searcher(
    dense_embeddings: DenseEmbeddings = Depends(get_dense_embeddings),
    query_search_repository: QuerySearchRepository = Depends(
        get_query_search_repository
    ),
):
    """Build a HybridSearcher from the injected embeddings and repository."""
    searcher = HybridSearcher(dense_embeddings, query_search_repository)
    return searcher
|
72 |
+
|
73 |
+
|
74 |
+
def get_query_search_feature(
    qa_pipeline: pipeline = Depends(get_qa_pipeline),
    hybrid_searcher: HybridSearcher = Depends(get_hybrid_searcher),
    question_answering: QuestionAnswering = Depends(get_question_ansering),
):
    """Build a QuerySearchFeature from the injected pipeline, searcher and answerer."""
    feature = QuerySearchFeature(qa_pipeline, hybrid_searcher, question_answering)
    return feature
|
80 |
+
|
81 |
+
|
82 |
+
def get_query_search_controller(
    query_search_feature: QuerySearchFeature = Depends(get_query_search_feature),
):
    """Build a QuerySearchController around the injected query search feature."""
    controller = QuerySearchController(query_search_feature)
    return controller
|
Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc
CHANGED
Binary files a/Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc and b/Api/app/modules/querySearch/features/__pycache__/querySearch_feature.cpython-310.pyc differ
|
|
Api/app/modules/querySearch/features/querySearch_feature.py
CHANGED
@@ -1,89 +1,51 @@
|
|
1 |
-
import
|
2 |
-
from typing import List, Tuple
|
3 |
|
4 |
-
import numpy as np
|
5 |
-
# from fastapi.responses import JSONResponse
|
6 |
-
# from sentence_transformers import SentenceTransformer
|
7 |
-
# from transformers import pipeline
|
8 |
-
|
9 |
-
from app.db_local_storage.vector_files_db import vector_files_db
|
10 |
-
from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY
|
11 |
from app.db_local_storage.in_memory_db import query_response_storage
|
12 |
-
|
|
|
|
|
|
|
13 |
|
14 |
|
15 |
class QuerySearchFeature:
|
16 |
|
17 |
-
def __init__(
|
18 |
-
self
|
|
|
|
|
|
|
|
|
19 |
self.qa_pipeline = qa_pipeline
|
|
|
|
|
20 |
|
21 |
-
async def query_search(self, query: str) ->
|
22 |
-
|
23 |
-
user_query = {
|
24 |
-
"text": query,
|
25 |
-
"isSender": True,
|
26 |
-
}
|
27 |
-
|
28 |
-
query_response_storage.append(user_query)
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
semantic_results = await QuerySearchFeature.semantic_search(
|
36 |
-
query, text_data, embeddings, self.model
|
37 |
)
|
38 |
|
39 |
-
|
40 |
-
context =
|
41 |
-
|
42 |
-
response = self.qa_pipeline(question=query, context=context)
|
43 |
-
|
44 |
-
response_query = {
|
45 |
-
"text": response["answer"],
|
46 |
-
"isSender": False,
|
47 |
-
}
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
@staticmethod
|
58 |
-
async def semantic_search(
|
59 |
-
query: str, chunks: List[str], embeddings: np.ndarray, model
|
60 |
-
) -> List[str]:
|
61 |
-
query_embedding = model.encode([query])
|
62 |
-
similarities = np.dot(embeddings, query_embedding.T).flatten()
|
63 |
-
top_indices = np.argsort(-similarities)[:3]
|
64 |
-
return [chunks[i] for i in top_indices]
|
65 |
-
|
66 |
-
@staticmethod
|
67 |
-
async def lexical_search(query: str, chunks: List[str]) -> List[str]:
|
68 |
-
return [chunk for chunk in chunks if query.lower() in chunk.lower()]
|
69 |
-
|
70 |
-
@staticmethod
|
71 |
-
async def load_data():
|
72 |
-
with open(VECTOR_FILES_DIRECTORY, "r") as file:
|
73 |
-
dataBase = json.load(file)
|
74 |
-
return dataBase
|
75 |
-
|
76 |
-
@staticmethod
|
77 |
-
async def split_dataBase(db) -> Tuple[List[str], np.ndarray]:
|
78 |
-
text_data = []
|
79 |
-
embeddings = []
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
return " ".join(
|
|
|
1 |
+
from qdrant_client.conversions import common_types as types
|
|
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from app.db_local_storage.in_memory_db import query_response_storage
|
4 |
+
from app.infrastructure.models.my_models import HybridSearchResponse, ModelResponse
|
5 |
+
from app.modules.hybridSearcher.hybridSearcher import HybridSearcher
|
6 |
+
from app.modules.questionAnswer.questionAnswer import QuestionAnswering
|
7 |
+
from transformers import pipeline
|
8 |
|
9 |
|
10 |
class QuerySearchFeature:
    """Answer a user query from searched context and record the exchange."""

    def __init__(
        self,
        qa_pipeline: pipeline,
        hybrid_searcher: HybridSearcher,
        question_answering: QuestionAnswering,
    ):
        self.qa_pipeline = qa_pipeline
        self.hybrid_searcher = hybrid_searcher
        self.question_answering = question_answering

    async def query_search(self, query: str) -> ModelResponse:
        """
        Search for context, answer the query and store both messages

        :param query: the user's question
        :return: ModelResponse carrying the answer, isSender=False and
            message "success"
        """
        # The user's message is stored before any search is attempted.
        user_message = {"text": query, "isSender": True}
        query_response_storage.append(user_message)

        search_result = self.hybrid_searcher.hybrid_search(query)
        joined_context = self.get_and_join_context(search_result)
        answer = self.question_answering.answer_question(query, joined_context)

        # TODO: Manage memory for display messages
        reply_message = {"text": answer, "isSender": False}
        query_response_storage.append(reply_message)

        return ModelResponse(text=answer, isSender=False, message="success")

    def get_and_join_context(self, search_result: HybridSearchResponse) -> str:
        """
        Join the "chunk-text" of every entry in the search result data

        :param search_result: object whose ``data`` is iterated for entries
        :return: the chunk texts separated by ", "
        """
        return ", ".join(entry["chunk-text"] for entry in search_result.data)
|