Spaces:
Runtime error
Runtime error
File size: 2,429 Bytes
67f3761 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
from collections import defaultdict
import pymupdf
def sort_text(chunks, x_threshold=300):
    """Order text chunks for reading a two-column page.

    Chunks whose ``"x"`` value is below ``x_threshold`` are assigned to the
    left column and all others to the right column. Each column is sorted
    by ascending ``"y"``, and the left column is placed before the right.

    Args:
        chunks: Iterable of dicts that each provide ``"x"`` and ``"y"`` keys.
        x_threshold: Cutoff on ``"x"`` that separates the two columns,
            expressed in the same units as the chunks' ``"x"`` values.
            Defaults to 300.

    Returns:
        A new list holding the left-column chunks followed by the
        right-column chunks. The input is not modified.
    """
    left_column = []
    right_column = []
    for chunk in chunks:
        column = left_column if chunk["x"] < x_threshold else right_column
        column.append(chunk)

    def vertical_position(item):
        return item["y"]

    # list.sort is stable, so chunks sharing a "y" keep their input order.
    left_column.sort(key=vertical_position)
    right_column.sort(key=vertical_position)
    return left_column + right_column
def majority_element(spans, param):
    """Return the ``param`` value that covers the most characters.

    Every span contributes the length of its ``"text"`` to the tally of the
    value it holds under ``param``.

    Args:
        spans: Iterable of dicts that each provide ``"text"`` and ``param``.
        param: Key whose dominant value is wanted (callers in this file
            pass ``"color"`` or ``"size"``).

    Returns:
        The value with the largest character total, or ``None`` when
        ``spans`` is empty. On a tie the value seen first wins.
    """
    weights = {}
    for entry in spans:
        length = len(entry["text"])
        value = entry[param]
        weights[value] = weights.get(value, 0) + length

    if not weights:
        return None
    # max() walks the dict in insertion order and keeps the first maximum.
    return max(weights, key=weights.get)
def get_chunks(doc, min_size=9):
    """Extract one text chunk per text block from every page of ``doc``.

    For each page, the blocks returned by ``page.get_text("dict")`` are
    walked. Blocks that carry an ``"image"`` key are skipped. Within a
    block, only spans whose ``"size"`` is greater than ``min_size`` are
    kept; their texts are joined with single spaces. Blocks that yield no
    non-blank text are dropped. The chunks of each page are ordered with
    ``sort_text`` before being appended to the result.

    Args:
        doc: Document object exposing ``page_count`` and integer indexing
            that returns pages with a ``get_text("dict")`` method.
        min_size: Spans must have a ``"size"`` strictly greater than this
            to be included. Defaults to 9.

    Returns:
        A list of dicts with keys ``"text"``, ``"page"`` (0-based page
        index), ``"x"`` and ``"y"`` (first two entries of the block's
        ``"bbox"``), and ``"color"`` / ``"size"`` (the value covering the
        most characters among the kept spans, via ``majority_element``).
    """
    allchunks = []
    for page_num in range(doc.page_count):
        page = doc[page_num]
        chunks = []
        for block in page.get_text("dict")["blocks"]:
            # Image blocks are not needed and have no text lines to read.
            if "image" in block:
                continue

            spans = [
                span
                for line in block["lines"]
                for span in line["spans"]
                if span["size"] > min_size
            ]
            # One join instead of repeated string concatenation; stripping
            # afterwards yields the same text as appending "text + ' '".
            text = " ".join(span["text"] for span in spans).strip()
            if not text:
                continue

            chunks.append(
                {
                    "text": text,
                    "page": page_num,
                    "x": block["bbox"][0],
                    "y": block["bbox"][1],
                    "color": majority_element(spans, "color"),
                    "size": majority_element(spans, "size"),
                }
            )
        # Sort text according to column order before moving to the next page.
        allchunks.extend(sort_text(chunks))
    return allchunks
def embed_pdf(path):
    """Open the PDF at ``path`` and return its extracted text chunks.

    Args:
        path: Location of the document, passed straight to ``pymupdf.open``.

    Returns:
        The list produced by ``get_chunks`` for the opened document.
    """
    # The context manager closes the document even if extraction raises,
    # so the file handle is never leaked.
    with pymupdf.open(path) as doc:
        return get_chunks(doc)
|