import json
import os
import pathlib
from typing import List, Union

import pandas as pd
from loguru import logger

## Set of helper functions that support data preprocessing
class FileIO:
    '''
    Convenience class for saving and loading data in parquet and
    json formats to/from disk.
    '''

    def save_as_parquet(self,
                        file_path: str,
                        data: Union[List[dict], pd.DataFrame],
                        overwrite: bool = False) -> None:
        '''
        Saves data to disk as a parquet file, dropping the DataFrame index.

        Args:
        -----
        file_path : str
            Output path for the saved file; the '.parquet' extension is
            appended if it is missing.
        data : Union[List[dict], pd.DataFrame]
            Data to save as a parquet file. A list of dicts is converted
            to a DataFrame before saving.
        overwrite : bool
            Overwrite an existing file if True, otherwise raise FileExistsError.
        '''
        # Normalize the input to a DataFrame and the path to a .parquet file.
        if isinstance(data, list):
            data = self._convert_toDataFrame(data)
        if not file_path.endswith('parquet'):
            file_path = self._rename_file_extension(file_path, 'parquet')
        self._check_file_path(file_path, overwrite=overwrite)
        data.to_parquet(file_path, index=False)
        logger.info(f'DataFrame saved as parquet file here: {file_path}')

    def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
        '''
        Converts a list of dicts into a DataFrame.
        '''
        return pd.DataFrame.from_dict(data)

    def _rename_file_extension(self, file_path: str, extension: str) -> str:
        '''
        Replaces the extension of file_path with the given extension.
        '''
        prefix = os.path.splitext(file_path)[0]
        return prefix + '.' + extension

    def _check_file_path(self, file_path: str, overwrite: bool) -> None:
        '''
        Checks for an existing file and enforces the overwrite flag,
        creating the parent directory if the file does not exist yet.
        '''
        if os.path.exists(file_path) and not overwrite:
            raise FileExistsError(f'File by name {file_path} already exists, '
                                  'try using another file name or set overwrite to True.')
        elif os.path.exists(file_path):
            os.remove(file_path)
        else:
            # Ensure the parent directory exists before writing the file.
            pathlib.Path(os.path.dirname(file_path)).mkdir(parents=True, exist_ok=True)

    def load_parquet(self, file_path: str, verbose: bool = True) -> List[dict]:
        '''
        Loads a parquet file from disk, converts it to a pandas DataFrame as
        an intermediate step, and returns a list of dicts (docs).
        '''
        df = pd.read_parquet(file_path)
        # Convert any vector columns from numpy arrays back to plain lists.
        vector_labels = ['content_vector', 'image_vector', 'content_embedding']
        for label in vector_labels:
            if label in df.columns:
                df[label] = df[label].apply(lambda x: x.tolist())
        if verbose:
            memory_usage = round(df.memory_usage().sum() / (1024 * 1024), 2)
            print(f'Shape of data: {df.values.shape}')
            print(f'Memory Usage: {memory_usage}+ MB')
        return df.to_dict('records')

    def load_json(self, file_path: str):
        '''
        Loads a json file from disk.
        '''
        with open(file_path) as f:
            data = json.load(f)
        return data

    def save_as_json(self,
                     file_path: str,
                     data: Union[List[dict], dict],
                     indent: int = 4,
                     overwrite: bool = False
                     ) -> None:
        '''
        Saves data to disk as a json file. Data can be a list of dicts or
        a single dict.
        '''
        if not file_path.endswith('json'):
            file_path = self._rename_file_extension(file_path, 'json')
        self._check_file_path(file_path, overwrite=overwrite)
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=indent)
        logger.info(f'Data saved as json file here: {file_path}')
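
# Example usage of FileIO (an illustrative sketch; the file path and sample
# records below are placeholders, not values used elsewhere in this module):
#
#   io = FileIO()
#   docs = [{'id': 1, 'content': 'hello'}, {'id': 2, 'content': 'world'}]
#   io.save_as_parquet('data/docs.parquet', docs, overwrite=True)
#   docs = io.load_parquet('data/docs.parquet')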

class Utilities:

    def create_video_url(self, video_id: str, playlist_id: str) -> str:
        '''
        Creates a hyperlink to a video episode given a video_id and playlist_id.

        Args:
        -----
        video_id : str
            Video id of the episode from YouTube.
        playlist_id : str
            Playlist id of the episode from YouTube.
        '''
        return f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'
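

if __name__ == '__main__':
    # Minimal smoke test / usage sketch (illustrative, not part of the
    # library API): round-trips sample records through parquet and json in
    # a temporary directory, then builds a video URL from placeholder ids.
    # Writing parquet assumes a parquet engine (pyarrow or fastparquet) is
    # installed alongside pandas.
    import tempfile

    io = FileIO()
    docs = [{'id': 1, 'content': 'hello'}, {'id': 2, 'content': 'world'}]

    with tempfile.TemporaryDirectory() as tmp_dir:
        io.save_as_parquet(os.path.join(tmp_dir, 'docs.parquet'), docs, overwrite=True)
        loaded = io.load_parquet(os.path.join(tmp_dir, 'docs.parquet'))
        assert len(loaded) == len(docs)

        io.save_as_json(os.path.join(tmp_dir, 'docs.json'), docs, overwrite=True)
        assert len(io.load_json(os.path.join(tmp_dir, 'docs.json'))) == len(docs)

    # 'VIDEO_ID' and 'PLAYLIST_ID' are placeholders for real YouTube ids.
    print(Utilities().create_video_url('VIDEO_ID', 'PLAYLIST_ID'))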