derek-thomas HF staff committed on
Commit
04654e7
1 Parent(s): 8bb39bf

Fixing datatypes so it can be visualized

Browse files
Files changed (2) hide show
  1. main.py +4 -1
  2. utilities/pushshift_data.py +2 -2
main.py CHANGED
@@ -65,6 +65,7 @@ def main(date_to_fetch):
65
 
66
  # Figure out dates when we restart
67
  old_data_most_recent_date = old_data['date'].max()
 
68
  most_recent_date = max(old_data_most_recent_date, most_recent_date)
69
 
70
  if len(old_data) == len(new_data):
@@ -73,6 +74,9 @@ def main(date_to_fetch):
73
 
74
  # Convert back to dataset
75
  dataset["all_days"] = Dataset.from_pandas(new_data)
 
 
 
76
  else:
77
  logger.debug("Creating new split 'all_days'")
78
  dataset["all_days"] = Dataset.from_pandas(df)
@@ -81,7 +85,6 @@ def main(date_to_fetch):
81
 
82
  # Push the augmented dataset to the Hugging Face hub
83
  logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
84
- update_readme(dataset_name, subreddit, date_to_fetch)
85
  dataset.push_to_hub(dataset_name, token=auth_token)
86
  logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
87
  return most_recent_date
 
65
 
66
  # Figure out dates when we restart
67
  old_data_most_recent_date = old_data['date'].max()
68
+ old_data_most_recent_date = datetime.strptime(old_data_most_recent_date, '%Y-%m-%d').date()
69
  most_recent_date = max(old_data_most_recent_date, most_recent_date)
70
 
71
  if len(old_data) == len(new_data):
 
74
 
75
  # Convert back to dataset
76
  dataset["all_days"] = Dataset.from_pandas(new_data)
77
+
78
+ # Update README
79
+ update_readme(dataset_name, subreddit, date_to_fetch)
80
  else:
81
  logger.debug("Creating new split 'all_days'")
82
  dataset["all_days"] = Dataset.from_pandas(df)
 
85
 
86
  # Push the augmented dataset to the Hugging Face hub
87
  logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
 
88
  dataset.push_to_hub(dataset_name, token=auth_token)
89
  logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
90
  return most_recent_date
utilities/pushshift_data.py CHANGED
@@ -149,8 +149,8 @@ def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
149
  df = df[cols]
150
  # Convert the "created_utc" column to a datetime column with timezone information
151
  df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
152
- df['date'] = df['created_utc'].dt.date
153
- df['time'] = df['created_utc'].dt.time
154
  return df
155
 
156
 
 
149
  df = df[cols]
150
  # Convert the "created_utc" column to a datetime column with timezone information
151
  df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
152
+ df['date'] = df['created_utc'].dt.date.astype(str)
153
+ df['time'] = df['created_utc'].dt.time.astype(str)
154
  return df
155
 
156