derek-thomas HF staff committed on
Commit
d0c9304
1 Parent(s): e6a15ab

Keeping highest score

Browse files
Files changed (1) hide show
  1. utilities/data_collator.py +31 -11
utilities/data_collator.py CHANGED
@@ -12,29 +12,33 @@ def get_latest_data():
12
 
13
  def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
14
  """
15
- Removes rows with redundant ids, retaining the one with the longest content.
 
16
 
17
  Parameters:
18
- - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
19
 
20
  Returns:
21
- - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
22
- with the longest content available.
 
23
  """
24
 
25
  # Create a column for content length
26
  df['content_length'] = df['content'].str.len()
27
 
28
- # Use groupby to get the index of the row with the longest content for each 'id'
29
- idx_to_keep = df.groupby('id')['content_length'].idxmax().values
 
30
 
31
- # Filter the DataFrame to only keep those rows
32
- df_filtered = df.loc[idx_to_keep]
 
33
 
34
- # Drop the 'content_length' column
35
- df_filtered = df_filtered.drop(columns=['content_length'])
36
 
37
- return df_filtered
38
 
39
 
40
  def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
@@ -53,3 +57,19 @@ def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
53
  df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
54
  df = filter_redundant_ids(df)
55
  return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
14
  """
15
+ For each id, creates a new row with the longest content and the highest score
16
+ from the available rows with the same id.
17
 
18
  Parameters:
19
+ - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
20
 
21
  Returns:
22
+ - pd.DataFrame: A DataFrame with unique ids, where each id is associated
23
+ with the longest content available and the highest score from
24
+ potentially different rows.
25
  """
26
 
27
  # Create a column for content length
28
  df['content_length'] = df['content'].str.len()
29
 
30
+ # Find row with the longest content for each 'id'
31
+ idx_longest_content = df.groupby('id')['content_length'].idxmax().values
32
+ df_longest_content = df.loc[idx_longest_content][['id', 'content']]
33
 
34
+ # Find row with the highest score for each 'id'
35
+ idx_highest_score = df.groupby('id')['score'].idxmax().values
36
+ df_highest_score = df.loc[idx_highest_score][['id', 'score']]
37
 
38
+ # Merge the two DataFrames on 'id'
39
+ df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
40
 
41
+ return df_merged
42
 
43
 
44
  def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
 
57
  df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
58
  df = filter_redundant_ids(df)
59
  return df
60
+
61
+
62
if __name__ == '__main__':
    # Small hand-made frame for a manual check: every id appears twice, and the
    # longest content and the highest score sit on different rows for ids 1 and 2.
    df = pd.DataFrame(
        {
            'id': [1, 1, 2, 2, 3, 3],
            'content': ['short', 'longer content', 'medium', 'really long content here', 'tiny', 'big'],
            'score': [10, 5, 20, 15, 30, 25],
        }
    )

    # Show the input first, then the de-duplicated result
    print("Original DataFrame:", df, sep="\n")
    print("\nFiltered DataFrame:", filter_redundant_ids(df), sep="\n")