Spaces:
Runtime error
Runtime error
comments
Browse files
- client.py +3 -1
- preprocessing.py +4 -2
client.py
CHANGED
@@ -20,11 +20,12 @@ class HybridClient:
|
|
20 |
|
21 |
def create(self, collection: str):
|
22 |
if not self.qdrant_client.collection_exists(collection):
|
23 |
-
self.create_collection(
|
24 |
collection_name=collection,
|
25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
27 |
)
|
|
|
28 |
return collection
|
29 |
return None
|
30 |
|
@@ -39,6 +40,7 @@ class HybridClient:
|
|
39 |
metadata=chunks,
|
40 |
parallel=0,
|
41 |
)
|
|
|
42 |
|
43 |
def search(self, collection, text: str, limit: int = 10):
|
44 |
search_result = self.qdrant_client.query(
|
|
|
20 |
|
21 |
def create(self, collection: str):
|
22 |
if not self.qdrant_client.collection_exists(collection):
|
23 |
+
self.qdrant_client.create_collection(
|
24 |
collection_name=collection,
|
25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
27 |
)
|
28 |
+
print(f"--- {collection} collection created")
|
29 |
return collection
|
30 |
return None
|
31 |
|
|
|
40 |
metadata=chunks,
|
41 |
parallel=0,
|
42 |
)
|
43 |
+
print("--- pdf inserted")
|
44 |
|
45 |
def search(self, collection, text: str, limit: int = 10):
|
46 |
search_result = self.qdrant_client.query(
|
preprocessing.py
CHANGED
@@ -36,7 +36,7 @@ def majority_element(spans, param):
|
|
36 |
|
37 |
|
38 |
def clean_text(text):
|
39 |
-
|
40 |
words = text.split()
|
41 |
unique_words = OrderedDict.fromkeys(words)
|
42 |
cleaned_text = " ".join(unique_words)
|
@@ -88,6 +88,7 @@ def get_chunks(doc):
|
|
88 |
|
89 |
|
90 |
def process_activities(chunks):
|
|
|
91 |
# activities = []
|
92 |
i = 0
|
93 |
while i < len(chunks):
|
@@ -112,11 +113,12 @@ def process_activities(chunks):
|
|
112 |
return chunks
|
113 |
|
114 |
|
115 |
-
def
|
116 |
if buffer:
|
117 |
doc = pymupdf.open(stream=path, filetype="pdf")
|
118 |
else:
|
119 |
doc = pymupdf.open(path)
|
120 |
chunks = get_chunks(doc)
|
121 |
chunks = process_activities(chunks)
|
|
|
122 |
return chunks
|
|
|
36 |
|
37 |
|
38 |
def clean_text(text):
|
39 |
+
"""Cleans repeated text (OCR error)"""
|
40 |
words = text.split()
|
41 |
unique_words = OrderedDict.fromkeys(words)
|
42 |
cleaned_text = " ".join(unique_words)
|
|
|
88 |
|
89 |
|
90 |
def process_activities(chunks):
|
91 |
+
"""Groups lines of 'Activity' together"""
|
92 |
# activities = []
|
93 |
i = 0
|
94 |
while i < len(chunks):
|
|
|
113 |
return chunks
|
114 |
|
115 |
|
116 |
+
def index_pdf(path, buffer=False):
|
117 |
if buffer:
|
118 |
doc = pymupdf.open(stream=path, filetype="pdf")
|
119 |
else:
|
120 |
doc = pymupdf.open(path)
|
121 |
chunks = get_chunks(doc)
|
122 |
chunks = process_activities(chunks)
|
123 |
+
print("--- pdf indexed")
|
124 |
return chunks
|