|
import pandas as pd |
|
|
|
from utilities.praw_downloader import praw_downloader |
|
from utilities.praw_processor import preprocess_praw_data |
|
|
|
|
|
def get_latest_data():
    """Download the newest submissions via praw and return them preprocessed.

    Returns:
        pd.DataFrame: The freshly downloaded submissions, already run through
        ``preprocess_praw_data``.
    """
    # Fetch then preprocess in one expression; no intermediate state is needed.
    return preprocess_praw_data(submissions=praw_downloader())
|
|
|
|
|
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
      The input is NOT modified.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
        with the longest content available.
    """
    # Compute content lengths as a standalone Series instead of writing a
    # temporary 'content_length' column onto df. The previous implementation
    # mutated the caller's DataFrame (the helper column stayed on the input
    # after the call); keeping lengths out-of-frame fixes that side effect
    # and makes the drop(columns=...) step unnecessary.
    content_lengths = df['content'].str.len()

    # For each id, index of the row whose content is longest (first occurrence
    # wins ties, matching groupby(...).idxmax() semantics).
    idx_to_keep = content_lengths.groupby(df['id']).idxmax().values

    return df.loc[idx_to_keep]
|
|
|
|
|
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided DataFrame with the latest data, sorts them by 'date_utc',
    filters out redundant IDs, and returns the merged and filtered DataFrame.

    Args:
    - old_df (pd.DataFrame): The existing data to be merged with the latest
      download. Must contain 'id', 'content', and 'date_utc' columns.

    Returns:
    - pd.DataFrame: The merged, chronologically sorted, and deduplicated data
      (one row per id, keeping the longest content).
    """
    latest_df = get_latest_data()

    # Concatenate old and new rows, then sort chronologically so the merged
    # frame has a stable, meaningful order before deduplication.
    df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
    df = filter_redundant_ids(df)
    return df
|
|