Spaces:
Runtime error
Runtime error
from collections import defaultdict | |
import pymupdf | |
def sort_text(chunks): | |
x_threshold = 300 | |
left_column = [] | |
right_column = [] | |
for chunk in chunks: | |
if chunk["x"] < x_threshold: | |
left_column.append(chunk) | |
else: | |
right_column.append(chunk) | |
# Sort the chunks within each column based on the y-coordinate | |
left_column = sorted(left_column, key=lambda item: item["y"]) | |
right_column = sorted(right_column, key=lambda item: item["y"]) | |
sorted_text = left_column + right_column | |
return sorted_text | |
def majority_element(spans, param): | |
char_count = defaultdict(int) | |
for span in spans: | |
span_text = span["text"] | |
span_param = span[param] # Get the color or size for this span | |
char_count[span_param] += len(span_text) # Count characters | |
# Return the parameter value with the highest character count | |
return max(char_count, key=char_count.get, default=None) | |
def get_chunks(doc): | |
allchunks = [] | |
# Page Iteration | |
for page_num in range(doc.page_count): | |
chunks = [] | |
page = doc[page_num] | |
# Filter images (not needed) | |
blocks = [i for i in page.get_text("dict")["blocks"] if "image" not in i] | |
# Block Iteration | |
for block in blocks: | |
text = "" | |
spans = [] | |
# Line iteration | |
for line in block["lines"]: | |
for span in line["spans"]: | |
# Only include text with a size greater than 9 | |
if span["size"] > 9: | |
span_text = span["text"] | |
text += span_text + " " | |
spans.append(span) # Store the span for majority calculation | |
# Filter empty strings | |
if text.strip(): | |
chunks.append( | |
{ | |
"text": text.strip(), | |
"page": page_num, | |
"x": block["bbox"][0], | |
"y": block["bbox"][1], | |
"color": majority_element(spans, "color"), | |
"size": majority_element(spans, "size"), | |
} | |
) | |
# Sort text according to column order | |
allchunks.extend(sort_text(chunks)) | |
return allchunks | |
def embed_pdf(path): | |
doc = pymupdf.open(path) | |
chunks = get_chunks(doc) | |
return chunks | |