Spaces:

prashu333
/

genAI-qa-bot

Sleeping

prashu333 committed on Sep 18

Commit

cd13442

•

1 Parent(s): b059823

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -77,9 +77,36 @@ def process_docx(file_path, chunk_size):
     if current_chunk:  # Add any remaining text as the last chunk
         chunks.append(current_chunk)
     return chunks
-def process_pdf3(file_path, chunk_size):
     chunks = []
     with open(file_path, 'rb') as file:
         reader = PyPDF2.PdfReader(file)

     if current_chunk:  # Add any remaining text as the last chunk
         chunks.append(current_chunk)
+    return chunks
+def process_pdf3(file_path, chunk_size):
+    """Split the text of each PDF page into chunks of at most ``chunk_size`` characters.
+
+    The PDF is opened with ``pdfplumber`` and every page is chunked on its
+    own, so a chunk never spans two pages. Chunk ends are moved back to the
+    nearest whitespace so words are not cut; if a window contains no
+    whitespace at all, the text is hard-split at ``chunk_size``.
+
+    Args:
+        file_path: Path of the PDF file, passed to ``pdfplumber.open``.
+        chunk_size: Maximum number of characters per chunk; must be positive.
+
+    Returns:
+        A list of non-empty, whitespace-stripped text chunks in page order.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive (the scan could never
+            advance through the text).
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    chunks = []
+    with pdfplumber.open(file_path) as pdf:
+        for page in pdf.pages:
+            text = page.extract_text()
+            if not text:
+                # Pages without extractable text contribute nothing.
+                continue
+            text_len = len(text)
+            start = 0
+            while start < text_len:
+                end = start + chunk_size
+                if end >= text_len:
+                    # The rest of the page fits in one chunk. Using >= (not >)
+                    # also keeps text[end] below from indexing past the end.
+                    end = text_len
+                else:
+                    # Find the nearest word boundary at or before `end`.
+                    while end > start and not text[end].isspace():
+                        end -= 1
+                    if end == start:
+                        # No whitespace in the window: hard-split instead.
+                        end = start + chunk_size
+                chunk = text[start:end].strip()
+                if chunk:
+                    # Whitespace-only slices would become empty chunks; skip them.
+                    chunks.append(chunk)
+                start = end
     return chunks
+def process_pdf2(file_path, chunk_size):
     chunks = []
     with open(file_path, 'rb') as file:
         reader = PyPDF2.PdfReader(file)