Commit
•
d0c9304
1
Parent(s):
e6a15ab
Keeping highest score
Browse files
- utilities/data_collator.py +31 -11
utilities/data_collator.py
CHANGED
@@ -12,29 +12,33 @@ def get_latest_data():
|
|
12 |
|
13 |
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
14 |
"""
|
15 |
-
|
|
|
16 |
|
17 |
Parameters:
|
18 |
-
- df (pd.DataFrame): The input DataFrame with columns 'id' and '
|
19 |
|
20 |
Returns:
|
21 |
-
- pd.DataFrame: A
|
22 |
-
with the longest content available
|
|
|
23 |
"""
|
24 |
|
25 |
# Create a column for content length
|
26 |
df['content_length'] = df['content'].str.len()
|
27 |
|
28 |
-
#
|
29 |
-
|
|
|
30 |
|
31 |
-
#
|
32 |
-
|
|
|
33 |
|
34 |
-
#
|
35 |
-
|
36 |
|
37 |
-
return
|
38 |
|
39 |
|
40 |
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
|
@@ -53,3 +57,19 @@ def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
|
|
53 |
df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
|
54 |
df = filter_redundant_ids(df)
|
55 |
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse rows that share an id into a single row per id.

    For each id, the output row carries the longest 'content' and the highest
    'score' found among the rows with that id. The two values may come from
    different input rows. When several rows tie, the earliest one wins.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
      It is left untouched, and its index may contain duplicate labels.

    Returns:
    - pd.DataFrame: A new DataFrame with the columns 'id', 'content' and 'score',
      one row per id, ordered by id, with a fresh 0..n-1 index.
      Rows whose id is missing are not represented in the result.

    Raises:
    - KeyError: If 'id', 'content' or 'score' is not a column of df.
    """
    columns = ['id', 'content', 'score']

    # Work on a private, freshly indexed frame: the caller's DataFrame is never
    # given a helper column, and the labels returned by idxmax are guaranteed
    # to identify exactly one row even if the caller's index had duplicates.
    work = df.loc[:, columns].reset_index(drop=True)

    # Nothing to deduplicate; also avoids the .str accessor failing on an
    # empty column that is not string-typed.
    if work.empty:
        return work

    # Missing content counts as shorter than any real string (even ''), so an
    # id whose content is entirely missing still yields a row.
    lengths = work['content'].str.len().fillna(-1)

    # Row holding the longest content for each id
    longest_rows = lengths.groupby(work['id']).idxmax().to_numpy()
    best_content = work.loc[longest_rows, ['id', 'content']]

    # Row holding the highest score for each id
    top_rows = work.groupby('id')['score'].idxmax().to_numpy()
    best_score = work.loc[top_rows, ['id', 'score']]

    # Both sides hold exactly one row per id, so this is a 1:1 join.
    return pd.merge(best_content, best_score, on='id')
|
42 |
|
43 |
|
44 |
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
|
|
|
57 |
df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
|
58 |
df = filter_redundant_ids(df)
|
59 |
return df
|
60 |
+
|
61 |
+
|
62 |
+
if __name__ == '__main__':
    # Manual smoke test: three ids with two rows each, chosen so that the
    # longest content and the highest score sit on different rows for ids 1 and 2.
    mock_rows = [
        (1, 'short', 10),
        (1, 'longer content', 5),
        (2, 'medium', 20),
        (2, 'really long content here', 15),
        (3, 'tiny', 30),
        (3, 'big', 25),
    ]
    demo_df = pd.DataFrame(mock_rows, columns=['id', 'content', 'score'])

    # Show the input first, then the per-id result.
    print("Original DataFrame:")
    print(demo_df)

    print("\nFiltered DataFrame:")
    print(filter_redundant_ids(demo_df))
|