omkar334 committed on
Commit
2ef8487
1 Parent(s): 76bc633
Files changed (2) hide show
  1. client.py +3 -1
  2. preprocessing.py +4 -2
client.py CHANGED
@@ -20,11 +20,12 @@ class HybridClient:
20
 
21
  def create(self, collection: str):
22
  if not self.qdrant_client.collection_exists(collection):
23
- self.create_collection(
24
  collection_name=collection,
25
  vectors_config=self.qdrant_client.get_fastembed_vector_params(),
26
  sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
27
  )
 
28
  return collection
29
  return None
30
 
@@ -39,6 +40,7 @@ class HybridClient:
39
  metadata=chunks,
40
  parallel=0,
41
  )
 
42
 
43
  def search(self, collection, text: str, limit: int = 10):
44
  search_result = self.qdrant_client.query(
 
20
 
21
  def create(self, collection: str):
22
  if not self.qdrant_client.collection_exists(collection):
23
+ self.qdrant_client.create_collection(
24
  collection_name=collection,
25
  vectors_config=self.qdrant_client.get_fastembed_vector_params(),
26
  sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
27
  )
28
+ print(f"--- {collection} collection created")
29
  return collection
30
  return None
31
 
 
40
  metadata=chunks,
41
  parallel=0,
42
  )
43
+ print("--- pdf inserted")
44
 
45
  def search(self, collection, text: str, limit: int = 10):
46
  search_result = self.qdrant_client.query(
preprocessing.py CHANGED
@@ -36,7 +36,7 @@ def majority_element(spans, param):
36
 
37
 
38
  def clean_text(text):
39
- print("Cleaning = ", text)
40
  words = text.split()
41
  unique_words = OrderedDict.fromkeys(words)
42
  cleaned_text = " ".join(unique_words)
@@ -88,6 +88,7 @@ def get_chunks(doc):
88
 
89
 
90
  def process_activities(chunks):
 
91
  # activities = []
92
  i = 0
93
  while i < len(chunks):
@@ -112,11 +113,12 @@ def process_activities(chunks):
112
  return chunks
113
 
114
 
115
- def embed_pdf(path, buffer=False):
116
  if buffer:
117
  doc = pymupdf.open(stream=path, filetype="pdf")
118
  else:
119
  doc = pymupdf.open(path)
120
  chunks = get_chunks(doc)
121
  chunks = process_activities(chunks)
 
122
  return chunks
 
36
 
37
 
38
  def clean_text(text):
39
+ """Cleans repeated text (OCR error)"""
40
  words = text.split()
41
  unique_words = OrderedDict.fromkeys(words)
42
  cleaned_text = " ".join(unique_words)
 
88
 
89
 
90
  def process_activities(chunks):
91
+ """Groups lines of 'Activity' together"""
92
  # activities = []
93
  i = 0
94
  while i < len(chunks):
 
113
  return chunks
114
 
115
 
116
+ def index_pdf(path, buffer=False):
117
  if buffer:
118
  doc = pymupdf.open(stream=path, filetype="pdf")
119
  else:
120
  doc = pymupdf.open(path)
121
  chunks = get_chunks(doc)
122
  chunks = process_activities(chunks)
123
+ print("--- pdf indexed")
124
  return chunks