import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data() -> pd.DataFrame:
    """
    Download the latest submissions via PRAW and preprocess them.

    Returns:
    - pd.DataFrame: The preprocessed submission data.
    """
    submissions = praw_downloader()
    df = preprocess_praw_data(submissions=submissions)
    return df


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
      The input is NOT modified.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is
      associated with the longest content available.
    """
    # Compute content lengths as a standalone Series instead of attaching a
    # temporary 'content_length' column: the previous approach mutated the
    # caller's DataFrame in place and left the helper column behind.
    content_length = df['content'].str.len()

    # For each 'id', get the index label of the row with the longest content.
    idx_to_keep = content_length.groupby(df['id']).idxmax().values

    # Keep only those rows; columns are untouched, so nothing to drop.
    return df.loc[idx_to_keep]


def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided DataFrame with the latest data, sorts the result by
    'date_utc', filters out redundant ids, and returns the merged and
    filtered DataFrame.

    Args:
    - old_df (pd.DataFrame): The existing data to be merged with the latest
      data. Must contain 'id', 'content', and 'date_utc' columns.

    Returns:
    - pd.DataFrame: The merged and filtered DataFrame.
    """
    latest_df = get_latest_data()
    df = (
        pd.concat([old_df, latest_df], ignore_index=True)
        .sort_values(by='date_utc')
        .reset_index(drop=True)
    )
    return filter_redundant_ids(df)