omkar334 committed on
Commit
67f3761
1 Parent(s): e601b80

pdf processing

Browse files
Files changed (1) hide show
  1. preprocessing.py +83 -0
preprocessing.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+
3
+ import pymupdf
4
+
5
+
6
def sort_text(chunks, x_threshold=300):
    """Order one page's chunks in two-column reading order.

    Chunks whose ``"x"`` value is below ``x_threshold`` are placed in the left
    column, all others in the right column. Each column is sorted by ``"y"``
    (ascending) and the left column is returned ahead of the right one.

    Args:
        chunks: Iterable of mappings, each with comparable ``"x"`` and ``"y"``
            entries.
        x_threshold: Boundary between the two columns, in the same units as
            ``chunk["x"]``. Defaults to 300, the value that used to be
            hard-coded here.

    Returns:
        A new list containing the same chunk objects, left column first.
        The input is not modified.

    Raises:
        KeyError: If a chunk lacks an ``"x"`` or ``"y"`` entry.
    """
    left_column = []
    right_column = []

    for chunk in chunks:
        target = left_column if chunk["x"] < x_threshold else right_column
        target.append(chunk)

    # list.sort is stable, so chunks sharing a y value keep their input order.
    left_column.sort(key=lambda item: item["y"])
    right_column.sort(key=lambda item: item["y"])

    return left_column + right_column
23
+
24
+
25
def majority_element(spans, param):
    """Return the ``param`` value that covers the most characters in ``spans``.

    Every span adds the length of its ``"text"`` to the running total kept for
    its ``span[param]`` value; the value with the largest total wins. When
    several values share the largest total, the one seen first is returned.

    Args:
        spans: Iterable of mappings with a ``"text"`` entry and a ``param``
            entry (the caller in this file passes ``"color"`` or ``"size"``).
        param: Key of the attribute to vote on.

    Returns:
        The winning attribute value, or ``None`` when ``spans`` is empty.
    """
    weight_by_value = {}

    for entry in spans:
        content = entry["text"]
        value = entry[param]
        # Weight each vote by character count, not by number of spans.
        weight_by_value[value] = weight_by_value.get(value, 0) + len(content)

    if not weight_by_value:
        return None
    return max(weight_by_value, key=lambda value: weight_by_value[value])
35
+
36
+
37
def get_chunks(doc, min_size=9):
    """Extract text chunks from every page of ``doc``, one chunk per block.

    For each page, the blocks returned by ``page.get_text("dict")`` are
    scanned; blocks containing an ``"image"`` entry are skipped. Within a
    block, the text of every span whose ``"size"`` exceeds ``min_size`` is
    joined with single spaces. Blocks that yield no non-blank text are
    dropped. The chunks of each page are ordered with ``sort_text`` before
    being appended to the result.

    Args:
        doc: Document object exposing ``page_count`` and integer indexing that
            returns pages with a ``get_text("dict")`` method.
        min_size: Spans with a ``"size"`` less than or equal to this value are
            ignored. Defaults to 9, the cutoff that used to be hard-coded.

    Returns:
        A list of dicts with the keys ``"text"``, ``"page"`` (zero-based page
        index), ``"x"`` and ``"y"`` (first two values of the block's
        ``"bbox"``), and ``"color"`` / ``"size"`` (the values covering the
        most characters among the kept spans, via ``majority_element``).
    """
    allchunks = []

    # Page iteration
    for page_num in range(doc.page_count):
        page = doc[page_num]
        chunks = []

        # Block iteration
        for block in page.get_text("dict")["blocks"]:
            # Images are not needed; only text blocks are processed.
            if "image" in block:
                continue

            # Keep only the spans above the size cutoff; they feed both the
            # chunk text and the majority vote on color/size.
            spans = []
            for line in block["lines"]:
                for span in line["spans"]:
                    if span["size"] > min_size:
                        spans.append(span)

            text = " ".join(span["text"] for span in spans).strip()

            # Filter empty strings
            if not text:
                continue

            chunks.append(
                {
                    "text": text,
                    "page": page_num,
                    "x": block["bbox"][0],
                    "y": block["bbox"][1],
                    "color": majority_element(spans, "color"),
                    "size": majority_element(spans, "size"),
                }
            )

        # Sort text according to column order
        allchunks.extend(sort_text(chunks))

    return allchunks
78
+
79
+
80
def embed_pdf(path):
    """Open the PDF at ``path`` and return its extracted text chunks.

    Args:
        path: Location of the PDF, passed unchanged to ``pymupdf.open``.

    Returns:
        The list of chunk dicts produced by ``get_chunks`` for the document.
    """
    # Close the document once extraction is done instead of leaking the
    # handle; get_chunks returns newly built dicts, so nothing returned here
    # is expected to need the open document afterwards.
    with pymupdf.open(path) as doc:
        return get_chunks(doc)