import os
import tempfile
from io import BytesIO

import boto3
import fitz  # PyMuPDF
from fastapi import HTTPException


class Loader:
    def __init__(self):
        # Create an S3 client with credentials taken from the environment
        self.bucket_name = "multimedika"
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name="us-west-2",
        )

    # Previous implementation: uploaded the file to S3 as a single object.
    # def upload_to_s3(self, file, object_name, folder_name="summarizer"):
    #     try:
    #         # If folder_name is provided, prepend it to the object_name
    #         if folder_name:
    #             object_name = f"{folder_name}/{object_name}"
    #
    #         # Create an in-memory file-like object
    #         with BytesIO() as file_stream:
    #             # Write the contents of the uploaded file to the stream
    #             file_stream.write(file.file.read())
    #             file_stream.seek(0)  # Move to the beginning of the stream
    #
    #             # Upload the file to S3
    #             self.s3_client.upload_fileobj(file_stream, self.bucket_name, object_name)
    #             print(f"File '{object_name}' successfully uploaded to bucket '{self.bucket_name}'.")
    #     except Exception as e:
    #         raise HTTPException(status_code=400, detail=f"Error uploading to AWS: {e}")

    def upload_to_s3(self, file, object_name, folder_name="summarizer"):
        """Splits the uploaded PDF into single pages and uploads each page to S3."""
        try:
            # If folder_name is provided, prepend it to the object_name
            if folder_name:
                object_name = f"{folder_name}/{object_name}"

            # Open the uploaded PDF with PyMuPDF (fitz)
            pdf_document = fitz.open(stream=file.file.read(), filetype="pdf")

            # Loop through each page of the PDF
            for page_num in range(pdf_document.page_count):
                # Copy the page into a new single-page PDF and serialize it to memory
                page_stream = BytesIO()
                single_page_pdf = fitz.open()  # Create a new, empty PDF
                single_page_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                single_page_pdf.save(page_stream)
                single_page_pdf.close()

                # Reset the stream position to the start
                page_stream.seek(0)

                # Define the object name for each page (e.g., 'summarizer/object_name/1.pdf')
                page_object_name = f"{object_name}/{page_num + 1}.pdf"

                # Upload the single page to S3
                self.s3_client.upload_fileobj(page_stream, self.bucket_name, page_object_name)
                print(
                    f"Page {page_num + 1} of '{object_name}' successfully uploaded as "
                    f"'{page_object_name}' to bucket '{self.bucket_name}'."
                )

            pdf_document.close()
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error uploading to AWS: {e}")

    def get_file_aws(self, object_name, local_file_name=None):
        """Downloads a PDF file from S3 and reads it using PyMuPDF."""
        if local_file_name is None:
            local_file_name = "downloaded_pdf_file.pdf"  # Default file name

        try:
            # Create a temporary directory to store the file
            temp_dir = tempfile.mkdtemp()
            file_path = os.path.join(temp_dir, local_file_name)

            # Download the file from S3
            with open(file_path, "wb") as temp_file:
                self.s3_client.download_fileobj(self.bucket_name, object_name, temp_file)

            # Open and read the PDF using PyMuPDF
            doc = fitz.open(file_path)

            # Example: print the number of pages
            print(f"Number of pages: {doc.page_count}")

            # Do something with the PDF, such as reading the text of each page
            for page in doc:
                print(page.get_text())

            # Close the document
            doc.close()

            # Clean up the downloaded file and the temporary directory
            os.remove(file_path)
            os.rmdir(temp_dir)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error getting file from AWS: {e}")
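

# --- Example usage (illustrative sketch, not part of the original module) ---
# Shows how Loader might be wired into a FastAPI app. The route paths, the use of
# uvicorn as the server, and reusing the upload's filename as the S3 object name
# are assumptions for demonstration only; adapt them to the real application.
if __name__ == "__main__":
    import uvicorn
    from fastapi import FastAPI, File, UploadFile

    app = FastAPI()
    loader = Loader()

    @app.post("/upload")
    async def upload_pdf(file: UploadFile = File(...)):
        # Each page of the PDF ends up in S3 as summarizer/<filename>/<page_number>.pdf
        loader.upload_to_s3(file, object_name=file.filename)
        return {"status": "uploaded", "object_name": file.filename}

    @app.get("/read")
    def read_pdf(object_name: str):
        # Downloads the object from S3 and prints its page count and text
        loader.get_file_aws(object_name)
        return {"status": "read", "object_name": object_name}

    uvicorn.run(app, host="0.0.0.0", port=8000)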