File size: 3,541 Bytes
df32b2c
 
67f3761
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df32b2c
 
 
 
 
 
 
 
 
67f3761
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df32b2c
67f3761
 
 
 
 
 
 
 
 
 
 
 
 
df32b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67f3761
df32b2c
67f3761
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import logging
import re
from collections import OrderedDict, defaultdict

import pymupdf


def sort_text(chunks, x_threshold=300):
    """Order chunks of a two-column page: left column first, then right.

    A chunk is assigned to the left column when its ``"x"`` value is strictly
    less than ``x_threshold``; every other chunk goes to the right column.
    Each column is then sorted by ascending ``"y"``. The sort is stable, so
    chunks sharing the same ``"y"`` keep their incoming relative order.

    Args:
        chunks: Iterable of dicts that each provide ``"x"`` and ``"y"`` keys.
        x_threshold: Horizontal position separating the two columns, in the
            same units as the chunks' ``"x"`` values. Defaults to 300, the
            value that used to be hard-coded here.

    Returns:
        A new list holding the left-column chunks followed by the
        right-column chunks. The input is not modified.
    """
    left_column = []
    right_column = []

    for chunk in chunks:
        column = left_column if chunk["x"] < x_threshold else right_column
        column.append(chunk)

    # Top-to-bottom reading order inside each column.
    left_column.sort(key=lambda item: item["y"])
    right_column.sort(key=lambda item: item["y"])

    return left_column + right_column


def majority_element(spans, param):
    """Return the ``param`` value that covers the most characters.

    Every span contributes the length of its ``"text"`` to the tally of its
    ``span[param]`` value (for example its color or size). The value with the
    largest tally wins; on a tie, the value encountered first wins. Returns
    ``None`` when ``spans`` is empty.
    """
    tally = {}

    for entry in spans:
        content = entry["text"]
        value = entry[param]
        # Weight by character count so long spans outvote short ones.
        tally[value] = tally.get(value, 0) + len(content)

    if not tally:
        return None
    return max(tally, key=tally.get)


def clean_text(text):
    """Drop repeated words from ``text`` and normalise its whitespace.

    The text is split on whitespace and only the first occurrence of each
    word is kept, in original order. Matching is exact (case-sensitive) and
    global, not limited to adjacent repeats: ``"to be or not to be"`` becomes
    ``"to be or not"``. The surviving words are joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The de-duplicated, single-spaced string (``""`` for blank input).
    """
    # Emit the trace through logging at DEBUG level rather than print(), so
    # the raw text of every block is not written to stdout unconditionally.
    logging.getLogger(__name__).debug("Cleaning = %s", text)

    # OrderedDict.fromkeys keeps only the first occurrence of each word.
    unique_words = OrderedDict.fromkeys(text.split())
    cleaned_text = " ".join(unique_words)
    return re.sub(r"\s+", " ", cleaned_text).strip()


def get_chunks(doc):
    """Extract one text chunk per block from every page of ``doc``.

    ``doc`` must expose ``page_count`` and integer indexing that yields pages
    with a ``get_text("dict")`` method (presumably a pymupdf document; verify
    against the caller). For each page, blocks containing an ``"image"`` key
    are skipped, spans whose ``"size"`` is not greater than 9 are ignored,
    and blocks left with no text are dropped.

    Returns:
        A list of dicts with keys ``"text"`` (cleaned block text), ``"page"``
        (0-based page index), ``"x"`` and ``"y"`` (first two entries of the
        block's ``"bbox"``), and ``"color"`` / ``"size"`` (the value covering
        the most characters among the kept spans). Chunks are ordered page by
        page, each page arranged by ``sort_text``.
    """
    collected = []

    for page_index in range(doc.page_count):
        page_blocks = doc[page_index].get_text("dict")["blocks"]
        page_chunks = []

        for block in page_blocks:
            # Blocks carrying an "image" key are not needed here.
            if "image" in block:
                continue

            # Keep only the spans larger than size 9.
            kept_spans = []
            for line in block["lines"]:
                kept_spans.extend(s for s in line["spans"] if s["size"] > 9)

            block_text = " ".join(s["text"] for s in kept_spans).strip()
            if not block_text:
                continue

            bbox = block["bbox"]
            page_chunks.append(
                {
                    "text": clean_text(block_text),
                    "page": page_index,
                    "x": bbox[0],
                    "y": bbox[1],
                    "color": majority_element(kept_spans, "color"),
                    "size": majority_element(kept_spans, "size"),
                }
            )

        # Arrange this page's chunks in column reading order.
        collected.extend(sort_text(page_chunks))

    return collected


def process_activities(chunks):
    """Merge each "Activity" chunk with the run of chunks that follows it.

    A chunk whose ``"text"`` contains the substring ``"Activity"`` starts a
    group. The ``"size"`` of the chunk right after it is taken as the
    reference, and every consecutive following chunk with that same
    ``"size"`` has its text appended to the group, separated by newlines.
    The group replaces the original chunks as a single entry: a shallow copy
    of the starting chunk with the combined text.

    ``chunks`` is modified in place and the same list object is returned.
    """
    pos = 0
    while pos < len(chunks):
        current = chunks[pos]
        pos += 1  # From here on, pos points at the chunk right after current.
        if "Activity" not in current["text"]:
            continue

        merged = current.copy()
        end = pos
        if end < len(chunks):
            # The first following chunk decides which size is absorbed.
            body_size = chunks[end]["size"]
            while end < len(chunks) and chunks[end]["size"] == body_size:
                merged["text"] += "\n" + chunks[end]["text"]
                end += 1

        # Collapse the heading and its absorbed run into one entry; scanning
        # resumes at the chunk that followed the run.
        chunks[pos - 1:end] = [merged]

    return chunks


def embed_pdf(path, buffer=False):
    """Open a PDF and return its text chunks with activities merged.

    Args:
        path: Location of the PDF to open or, when ``buffer`` is true, the
            in-memory PDF data handed to ``pymupdf.open`` as ``stream``.
        buffer: Set to true when ``path`` holds the PDF content itself
            instead of a file location.

    Returns:
        The list of chunk dicts produced by ``get_chunks`` after being
        passed through ``process_activities``.
    """
    if buffer:
        doc = pymupdf.open(stream=path, filetype="pdf")
    else:
        doc = pymupdf.open(path)

    # Close the document as soon as extraction is done so the underlying
    # file handle / buffer is released even if extraction raises. The chunks
    # hold only plain Python values, so they remain valid after closing.
    with doc:
        chunks = get_chunks(doc)

    return process_activities(chunks)