prashu333 committed on
Commit
cd13442
1 Parent(s): b059823

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -1
app.py CHANGED
@@ -77,9 +77,36 @@ def process_docx(file_path, chunk_size):
77
  if current_chunk: # Add any remaining text as the last chunk
78
  chunks.append(current_chunk)
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  return chunks
81
 
82
- def process_pdf3(file_path, chunk_size):
83
  chunks = []
84
  with open(file_path, 'rb') as file:
85
  reader = PyPDF2.PdfReader(file)
 
77
  if current_chunk: # Add any remaining text as the last chunk
78
  chunks.append(current_chunk)
79
 
80
+ return chunks
81
+
82
def process_pdf3(file_path, chunk_size):
    """Extract text from a PDF page by page and split it into chunks.

    Each page is chunked independently, so a chunk never spans two pages.
    Pages for which ``extract_text()`` returns nothing are skipped.

    Args:
        file_path: Path of the PDF, passed straight to ``pdfplumber.open``.
        chunk_size: Maximum number of characters per chunk (before
            stripping); must be at least 1.

    Returns:
        A list of non-empty, whitespace-stripped text chunks in page order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (the splitting loop
            could never advance and would not terminate).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                chunks.extend(_split_on_whitespace(text, chunk_size))
    return chunks


def _split_on_whitespace(text, chunk_size):
    """Cut *text* into pieces of at most *chunk_size* chars at whitespace.

    The cut point is moved back to the nearest whitespace character so that
    words are not split; if the window contains no whitespace at all, the
    text is cut hard at ``chunk_size``. Expects ``chunk_size >= 1``.
    """
    pieces = []
    total = len(text)
    start = 0
    while start < total:
        end = start + chunk_size
        if end >= total:
            # ">=" (not ">"): when the window ends exactly at the end of the
            # text there is no text[end] to inspect, so take the remainder.
            end = total
        else:
            # Walk back to the nearest word boundary inside the window.
            while end > start and not text[end].isspace():
                end -= 1
            if end == start:
                # No whitespace found: fall back to a hard cut so that the
                # loop still makes progress.
                end = start + chunk_size

        piece = text[start:end].strip()
        if piece:
            # Whitespace-only windows would otherwise yield empty chunks.
            pieces.append(piece)
        start = end
    return pieces
108
 
109
+ def process_pdf2(file_path, chunk_size):
110
  chunks = []
111
  with open(file_path, 'rb') as file:
112
  reader = PyPDF2.PdfReader(file)