Spaces:
Runtime error
Runtime error
quantization, reduce chunksize
Browse files- client.py +14 -1
- preprocessing.py +4 -5
- scraper.py +1 -2
client.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import os
|
2 |
|
3 |
from dotenv import load_dotenv
|
4 |
-
from qdrant_client import QdrantClient
|
5 |
|
6 |
load_dotenv()
|
7 |
|
@@ -24,6 +24,13 @@ class HybridClient:
|
|
24 |
collection_name=collection,
|
25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
)
|
28 |
print(f"--- {collection} collection created")
|
29 |
return collection
|
@@ -33,6 +40,8 @@ class HybridClient:
|
|
33 |
documents = []
|
34 |
for chunk in chunks:
|
35 |
documents.append(chunk.pop("text"))
|
|
|
|
|
36 |
|
37 |
self.qdrant_client.add(
|
38 |
collection_name=collection,
|
@@ -52,3 +61,7 @@ class HybridClient:
|
|
52 |
# Select and return metadata
|
53 |
# metadata = [hit.metadata for hit in search_result]
|
54 |
return search_result
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
|
3 |
from dotenv import load_dotenv
|
4 |
+
from qdrant_client import QdrantClient, models
|
5 |
|
6 |
load_dotenv()
|
7 |
|
|
|
24 |
collection_name=collection,
|
25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
27 |
+
quantization_config=models.ScalarQuantization(
|
28 |
+
scalar=models.ScalarQuantizationConfig(
|
29 |
+
type=models.ScalarType.INT8,
|
30 |
+
quantile=0.99,
|
31 |
+
always_ram=False,
|
32 |
+
),
|
33 |
+
),
|
34 |
)
|
35 |
print(f"--- {collection} collection created")
|
36 |
return collection
|
|
|
40 |
documents = []
|
41 |
for chunk in chunks:
|
42 |
documents.append(chunk.pop("text"))
|
43 |
+
chunk.pop("color")
|
44 |
+
chunk.pop("size")
|
45 |
|
46 |
self.qdrant_client.add(
|
47 |
collection_name=collection,
|
|
|
61 |
# Select and return metadata
|
62 |
# metadata = [hit.metadata for hit in search_result]
|
63 |
return search_result
|
64 |
+
|
65 |
+
def get_chapter_name(self, collection: str):
|
66 |
+
points = self.qdrant_client.retrieve(collection_name=collection, ids=[0])
|
67 |
+
return points[0]
|
preprocessing.py
CHANGED
@@ -10,14 +10,14 @@ def sort_text(chunks):
|
|
10 |
right_column = []
|
11 |
|
12 |
for chunk in chunks:
|
13 |
-
if chunk["x"] < x_threshold:
|
14 |
left_column.append(chunk)
|
15 |
else:
|
16 |
right_column.append(chunk)
|
17 |
|
18 |
# Sort the chunks within each column based on the y-coordinate
|
19 |
-
left_column = sorted(left_column, key=lambda item: item["y"])
|
20 |
-
right_column = sorted(right_column, key=lambda item: item["y"])
|
21 |
|
22 |
sorted_text = left_column + right_column
|
23 |
return sorted_text
|
@@ -75,8 +75,7 @@ def get_chunks(doc):
|
|
75 |
{
|
76 |
"text": clean_text(text.strip()),
|
77 |
"page": page_num,
|
78 |
-
"x": block["bbox"][0],
|
79 |
-
"y": block["bbox"][1],
|
80 |
"color": majority_element(spans, "color"),
|
81 |
"size": majority_element(spans, "size"),
|
82 |
}
|
|
|
10 |
right_column = []
|
11 |
|
12 |
for chunk in chunks:
|
13 |
+
if chunk["coordinates"][0] < x_threshold:
|
14 |
left_column.append(chunk)
|
15 |
else:
|
16 |
right_column.append(chunk)
|
17 |
|
18 |
# Sort the chunks within each column based on the y-coordinate
|
19 |
+
left_column = sorted(left_column, key=lambda item: item["coordinates"][1])
|
20 |
+
right_column = sorted(right_column, key=lambda item: item["coordinates"][1])
|
21 |
|
22 |
sorted_text = left_column + right_column
|
23 |
return sorted_text
|
|
|
75 |
{
|
76 |
"text": clean_text(text.strip()),
|
77 |
"page": page_num,
|
78 |
+
"coordinates": [round(block["bbox"][0], 1), round(block["bbox"][1], 1)],
|
|
|
79 |
"color": majority_element(spans, "color"),
|
80 |
"size": majority_element(spans, "size"),
|
81 |
}
|
scraper.py
CHANGED
@@ -70,8 +70,7 @@ async def download(session: aiohttp.ClientSession, url: str, max_retries: int =
|
|
70 |
async def upload_book(grade, subject, chapters=None):
|
71 |
hclient = HybridClient()
|
72 |
|
73 |
-
book = await get_book(grade, subject)
|
74 |
-
print(type(book))
|
75 |
for collection, pdf in book.items():
|
76 |
print(collection)
|
77 |
chunks = index_pdf(pdf, buffer=True)
|
|
|
70 |
async def upload_book(grade, subject, chapters=None):
|
71 |
hclient = HybridClient()
|
72 |
|
73 |
+
book = await get_book(grade, subject, chapters)
|
|
|
74 |
for collection, pdf in book.items():
|
75 |
print(collection)
|
76 |
chunks = index_pdf(pdf, buffer=True)
|