import json
import os
import pathlib
from typing import List, Union

import pandas as pd
from loguru import logger

## Set of helper functions that support data preprocessing
class FileIO:
    '''
    Convenience class for saving and loading data in parquet and
    json formats to/from disk.
    '''

    def save_as_parquet(self,
                        file_path: str,
                        data: Union[List[dict], pd.DataFrame],
                        overwrite: bool = False) -> None:
        '''
        Saves data to disk as a parquet file, dropping the DataFrame index.

        Args:
        -----
        file_path : str
            Output path for the saved file; the '.parquet' extension is
            appended if it is missing.
        data : Union[List[dict], pd.DataFrame]
            Data to save as a parquet file. A list of dicts is converted
            to a DataFrame before saving.
        overwrite : bool
            Overwrite an existing file if True, otherwise raise FileExistsError.
        '''
        # Normalize the input to a DataFrame and the path to a .parquet file.
        if isinstance(data, list):
            data = self._convert_toDataFrame(data)
        if not file_path.endswith('parquet'):
            file_path = self._rename_file_extension(file_path, 'parquet')
        self._check_file_path(file_path, overwrite=overwrite)
        data.to_parquet(file_path, index=False)
        logger.info(f'DataFrame saved as parquet file here: {file_path}')

    def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
        '''
        Converts a list of dicts into a DataFrame.
        '''
        return pd.DataFrame.from_dict(data)

    def _rename_file_extension(self, file_path: str, extension: str) -> str:
        '''
        Replaces the extension of file_path with the given extension.
        '''
        prefix = os.path.splitext(file_path)[0]
        return prefix + '.' + extension

    def _check_file_path(self, file_path: str, overwrite: bool) -> None:
        '''
        Checks for an existing file and enforces the overwrite flag,
        creating the parent directory if the file does not exist yet.
        '''
        if os.path.exists(file_path) and not overwrite:
            raise FileExistsError(f'File by name {file_path} already exists, '
                                  'try using another file name or set overwrite to True.')
        elif os.path.exists(file_path):
            os.remove(file_path)
        else:
            # Ensure the parent directory exists before writing the file.
            pathlib.Path(os.path.dirname(file_path)).mkdir(parents=True, exist_ok=True)

    def load_parquet(self, file_path: str, verbose: bool = True) -> List[dict]:
        '''
        Loads a parquet file from disk, converts it to a pandas DataFrame as
        an intermediate step, and returns a list of dicts (docs).
        '''
        df = pd.read_parquet(file_path)
        # Convert any vector columns from numpy arrays back to plain lists.
        vector_labels = ['content_vector', 'image_vector', 'content_embedding']
        for label in vector_labels:
            if label in df.columns:
                df[label] = df[label].apply(lambda x: x.tolist())
        if verbose:
            memory_usage = round(df.memory_usage().sum() / (1024 * 1024), 2)
            print(f'Shape of data: {df.values.shape}')
            print(f'Memory Usage: {memory_usage}+ MB')
        return df.to_dict('records')

    def load_json(self, file_path: str):
        '''
        Loads a json file from disk.
        '''
        with open(file_path) as f:
            data = json.load(f)
        return data

    def save_as_json(self,
                     file_path: str,
                     data: Union[List[dict], dict],
                     indent: int = 4,
                     overwrite: bool = False
                     ) -> None:
        '''
        Saves data to disk as a json file. Data can be a list of dicts or
        a single dict.
        '''
        if not file_path.endswith('json'):
            file_path = self._rename_file_extension(file_path, 'json')
        self._check_file_path(file_path, overwrite=overwrite)
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=indent)
        logger.info(f'Data saved as json file here: {file_path}')
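
# Example usage of FileIO (an illustrative sketch; the file path and sample
# records below are placeholders, not values used elsewhere in this module):
#
#   io = FileIO()
#   docs = [{'id': 1, 'content': 'hello'}, {'id': 2, 'content': 'world'}]
#   io.save_as_parquet('data/docs.parquet', docs, overwrite=True)
#   docs = io.load_parquet('data/docs.parquet')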

class Utilities:

    def create_video_url(self, video_id: str, playlist_id: str) -> str:
        '''
        Creates a hyperlink to a video episode given a video_id and playlist_id.

        Args:
        -----
        video_id : str
            Video id of the episode from YouTube.
        playlist_id : str
            Playlist id of the episode from YouTube.
        '''
        return f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'
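

if __name__ == '__main__':
    # Minimal smoke test / usage sketch (illustrative, not part of the
    # library API): round-trips sample records through parquet and json in
    # a temporary directory, then builds a video URL from placeholder ids.
    # Writing parquet assumes a parquet engine (pyarrow or fastparquet) is
    # installed alongside pandas.
    import tempfile

    io = FileIO()
    docs = [{'id': 1, 'content': 'hello'}, {'id': 2, 'content': 'world'}]

    with tempfile.TemporaryDirectory() as tmp_dir:
        io.save_as_parquet(os.path.join(tmp_dir, 'docs.parquet'), docs, overwrite=True)
        loaded = io.load_parquet(os.path.join(tmp_dir, 'docs.parquet'))
        assert len(loaded) == len(docs)

        io.save_as_json(os.path.join(tmp_dir, 'docs.json'), docs, overwrite=True)
        assert len(io.load_json(os.path.join(tmp_dir, 'docs.json'))) == len(docs)

    # 'VIDEO_ID' and 'PLAYLIST_ID' are placeholders for real YouTube ids.
    print(Utilities().create_video_url('VIDEO_ID', 'PLAYLIST_ID'))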