|
import pandas as pd |
|
|
|
from utilities.praw_downloader import praw_downloader |
|
from utilities.praw_processor import preprocess_praw_data |
|
|
|
|
|
def get_latest_data():
    """Download the newest submissions via praw and return them preprocessed.

    Returns:
        pd.DataFrame: The freshly downloaded submissions, already run through
        ``preprocess_praw_data``.
    """
    # Fetch then preprocess in one expression; no intermediate state is needed.
    return preprocess_praw_data(submissions=praw_downloader())
|
|
|
|
|
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
      The input is NOT modified.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
        with the longest content available.
    """
    # Compute content lengths as a standalone Series instead of writing a
    # temporary 'content_length' column onto df. The previous implementation
    # mutated the caller's DataFrame (the helper column stayed on the input
    # after the call); keeping lengths out-of-frame fixes that side effect
    # and makes the drop(columns=...) step unnecessary.
    content_lengths = df['content'].str.len()

    # For each id, index of the row whose content is longest (first occurrence
    # wins ties, matching groupby(...).idxmax() semantics).
    idx_to_keep = content_lengths.groupby(df['id']).idxmax().values

    return df.loc[idx_to_keep]
|
|
|
|
|
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided DataFrame with the latest data, sorts them by 'date_utc',
    filters out redundant IDs, and returns the merged and filtered DataFrame.

    Args:
    - old_df (pd.DataFrame): The existing data to be merged with the latest
      download. Must contain 'id', 'content', and 'date_utc' columns.

    Returns:
    - pd.DataFrame: The merged, chronologically sorted, and deduplicated data
      (one row per id, keeping the longest content).
    """
    latest_df = get_latest_data()

    # Concatenate old and new rows, then sort chronologically so the merged
    # frame has a stable, meaningful order before deduplication.
    df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
    df = filter_redundant_ids(df)
    return df
|
|