Spaces:

reddit-tools-HF
/

dataset-creator-reddit-bestofredditorupdates

Running

App Files Files Community

derek-thomas HF staff committed on Oct 26, 2023

Commit

d0c9304

•

1 Parent(s): e6a15ab

Keeping highest score

Files changed (1) hide show

utilities/data_collator.py +31 -11

utilities/data_collator.py CHANGED Viewed

@@ -12,29 +12,33 @@ def get_latest_data():
 def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     """
     Removes rows with redundant ids, retaining the one with the longest content.

     The input DataFrame is left untouched: content lengths are computed in a
     separate Series instead of being written into ``df`` as a helper column.

     Parameters:
     - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
                          Its index labels are assumed to be unique, since rows
                          are selected by label.

     Returns:
     - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
                     with the longest content available. Rows keep their original
                     index labels and are ordered by id.
     """
     # Nothing to deduplicate; hand back a copy so the caller's frame is never aliased
     if df.empty:
         return df.copy()

     # Missing content counts as length 0, so a group with only missing content
     # still yields a valid row label from idxmax
     content_length = df['content'].str.len().fillna(0)

     # Use groupby to get the index of the row with the longest content for each 'id'
     idx_to_keep = content_length.groupby(df['id']).idxmax().values

     # Filter the DataFrame to only keep those rows
     df_filtered = df.loc[idx_to_keep]

     # The result has never carried a 'content_length' column; keep that true even
     # when the caller's frame already contains one
     return df_filtered.drop(columns=['content_length'], errors='ignore')
 def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
@@ -53,3 +57,19 @@ def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
     df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
     df = filter_redundant_ids(df)
     return df

 def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     """
     For each id, creates a new row with the longest content and the highest score
     from the available rows with the same id.

     Any columns other than 'id', 'content' and 'score' are carried over from the
     row that holds the longest content, so they are not lost by the merge.
     The input DataFrame is not modified.

     Parameters:
     - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
                          Its index labels are assumed to be unique, since rows
                          are selected by label.

     Returns:
     - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                     with the longest content available and the highest score from
                     potentially different rows. Columns are ordered as
                     'id', 'content', 'score', followed by any remaining input columns.
     """
     key_cols = ['id', 'content', 'score']
     # Every remaining column travels with the longest-content row
     extra_cols = [col for col in df.columns if col not in key_cols]

     # Nothing to deduplicate; return an empty frame with the usual column layout
     if df.empty:
         return df[key_cols + extra_cols].copy()

     # Lengths live in their own Series so the caller's frame gets no helper column.
     # Missing content counts as length 0, so a group with only missing content
     # still yields a valid row label from idxmax
     content_length = df['content'].str.len().fillna(0)

     # Find row with the longest content for each 'id'
     idx_longest_content = content_length.groupby(df['id']).idxmax().values
     df_longest_content = df.loc[idx_longest_content, ['id', 'content'] + extra_cols]

     # Find row with the highest score for each 'id'
     idx_highest_score = df.groupby('id')['score'].idxmax().values
     df_highest_score = df.loc[idx_highest_score, ['id', 'score']]

     # Merge the two DataFrames on 'id'
     df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
     return df_merged[key_cols + extra_cols]
 def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
     df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
     df = filter_redundant_ids(df)
     return df
if __name__ == '__main__':
    # Hand-made sample in which every id appears twice, as (id, content, score)
    sample_rows = [
        (1, 'short', 10),
        (1, 'longer content', 5),
        (2, 'medium', 20),
        (2, 'really long content here', 15),
        (3, 'tiny', 30),
        (3, 'big', 25),
    ]
    sample_df = pd.DataFrame(sample_rows, columns=['id', 'content', 'score'])

    # Show the input first, then the deduplicated result
    print("Original DataFrame:")
    print(sample_df)

    print("\nFiltered DataFrame:")
    deduplicated = filter_redundant_ids(sample_df)
    print(deduplicated)