derek-thomas's picture
derek-thomas HF staff
Major updates, moving away from pushshift.io into PRAW
285612d
raw
history blame
1.73 kB
import pandas as pd
from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data
def get_latest_data() -> pd.DataFrame:
    """Download the latest Reddit submissions via PRAW and return them preprocessed.

    Returns:
    - pd.DataFrame: The freshly downloaded submissions, normalized by
      `preprocess_praw_data` into the project's standard frame layout.
    """
    # Fetch raw submissions, then immediately normalize them into a DataFrame.
    return preprocess_praw_data(submissions=praw_downloader())
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
      The input is not modified.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
      with the longest content available. Column layout matches the input.
    """
    # Compute content lengths as a standalone Series instead of assigning a
    # temporary 'content_length' column onto `df` — the original approach
    # mutated the caller's DataFrame as a side effect.
    content_length = df['content'].str.len()
    # For each 'id', find the index of the row with the longest content.
    # Grouping the length Series by the id column keeps row indices aligned.
    idx_to_keep = content_length.groupby(df['id']).idxmax().values
    # Select only the winning row per id; no cleanup column to drop.
    return df.loc[idx_to_keep]
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided dataset with the latest data, sorts by 'date_utc',
    filters out redundant ids, and returns the result.

    Args:
    - old_df (pd.DataFrame): The previously stored data to merge with the
      latest download.

    Returns:
    - pd.DataFrame: The merged, chronologically sorted, de-duplicated data.
    """
    # Append the newest submissions to the historical frame.
    combined = pd.concat([old_df, get_latest_data()], ignore_index=True)
    # Chronological order, then a clean 0..n-1 index.
    combined = combined.sort_values(by='date_utc').reset_index(drop=True)
    # Keep only the longest-content row per id.
    return filter_redundant_ids(combined)