Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -77,9 +77,36 @@ def process_docx(file_path, chunk_size):
|
|
77 |
if current_chunk: # Add any remaining text as the last chunk
|
78 |
chunks.append(current_chunk)
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
return chunks
|
81 |
|
82 |
-
def
|
83 |
chunks = []
|
84 |
with open(file_path, 'rb') as file:
|
85 |
reader = PyPDF2.PdfReader(file)
|
|
|
77 |
if current_chunk: # Add any remaining text as the last chunk
|
78 |
chunks.append(current_chunk)
|
79 |
|
80 |
+
return chunks
|
81 |
+
|
82 |
+
def process_pdf3(file_path, chunk_size):
    """Extract the text of a PDF and split it into word-aligned chunks.

    The file is opened with ``pdfplumber``; each page's text is obtained via
    ``page.extract_text()`` and chunked on its own, so a chunk never spans
    two pages. Pages for which no text is returned are skipped.

    Args:
        file_path: Path of the PDF, passed straight to ``pdfplumber.open``.
        chunk_size: Maximum number of characters per chunk (before
            stripping). Must be a positive integer.

    Returns:
        A list of non-empty, whitespace-stripped text chunks in page order.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A non-positive size
            can never advance through the text and would loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Process each page individually
                chunks.extend(_split_on_word_boundaries(text, chunk_size))
    return chunks


def _split_on_word_boundaries(text, chunk_size):
    """Split *text* into stripped pieces of at most *chunk_size* characters.

    Each cut is moved back to the nearest whitespace inside the window so
    words stay intact; a window without any whitespace is cut at exactly
    ``chunk_size`` characters. Pieces that are empty after stripping are
    dropped. ``chunk_size`` must be positive.
    """
    pieces = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = start + chunk_size
        if end >= text_len:
            # The remainder fits in one piece. ``>=`` (not ``>``) matters:
            # when the window ends exactly at the end of the text,
            # ``text[end]`` in the boundary search would be out of range.
            end = text_len
        else:
            # Find the nearest word boundary
            while end > start and not text[end].isspace():
                end -= 1
            if end == start:
                # No whitespace in the window: fall back to a hard cut.
                end = start + chunk_size

        piece = text[start:end].strip()
        if piece:
            # A window made only of whitespace yields nothing useful.
            pieces.append(piece)
        start = end
    return pieces
|
108 |
|
109 |
+
def process_pdf2(file_path, chunk_size):
|
110 |
chunks = []
|
111 |
with open(file_path, 'rb') as file:
|
112 |
reader = PyPDF2.PdfReader(file)
|