derek-thomas (HF staff) committed
Commit 130902a
1 Parent(s): ffea6b6

Updating log levels

Files changed (2)
  1. main.py +8 -8
  2. utilities/pushshift_data.py +1 -1
main.py CHANGED
@@ -68,23 +68,23 @@ def main(date_to_fetch):
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
-        logger.info("Loading existing dataset")
+        logger.debug("Loading existing dataset")
         if "__index_level_0__" in dataset["all_days"].column_names:
             dataset = dataset.remove_columns(["__index_level_0__"])
     except FileNotFoundError:
-        logger.info("Creating new dataset")
+        logger.warning("Creating new dataset")
         dataset = DatasetDict()
 
     # Call get_subreddit_day with the calculated date
     logger.info(f"Fetching data for {str(date_to_fetch)}")
     submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
     df = submissions_to_dataframe(submissions)
-    logger.info(f"Data fetched for {str(date_to_fetch)}")
+    logger.debug(f"Data fetched for {str(date_to_fetch)}")
     most_recent_date = date_to_fetch
 
     # Append DataFrame to split 'all_days' or create new split
     if "all_days" in dataset:
-        logger.info("Appending data to split 'all_days'")
+        logger.debug("Appending data to split 'all_days'")
         # Merge the new submissions
         old_data = dataset['all_days'].to_pandas()
         new_data = pd.concat([old_data, df], ignore_index=True)
@@ -105,13 +105,13 @@ def main(date_to_fetch):
         # Convert back to dataset
         dataset["all_days"] = Dataset.from_pandas(new_data)
     else:
-        logger.info("Creating new split 'all_days'")
+        logger.debug("Creating new split 'all_days'")
         dataset["all_days"] = Dataset.from_pandas(df)
     # Log appending or creating split 'all'
-    logger.info("Appended or created split 'all_days'")
+    logger.debug("Appended or created split 'all_days'")
 
     # Push the augmented dataset to the Hugging Face hub
-    logger.info(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
+    logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
     readme_text = update_readme(dataset_name, subreddit, date_to_fetch)
     dataset.description = readme_text
     dataset.push_to_hub(dataset_name, token=auth_token)
@@ -136,7 +136,7 @@ def run_main_continuously():
     two_days_ago = today - timedelta(days=2)
 
     if start_date <= two_days_ago:
-        logger.info(f"Running main function for date: {start_date}")
+        logger.warning(f"Running main function for date: {start_date}")
         most_recent_date = main(start_date)
         start_date = most_recent_date + timedelta(days=1)
     else:
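Note: the net effect of this commit is to demote routine per-run messages from INFO to DEBUG and promote rare events (creating a brand-new dataset, a catch-up run of `main`) to WARNING. A minimal sketch of how a level threshold filters these calls, assuming the module-level `logger` is Python's stdlib `logging` (the repository may use loguru instead; the logger name below is illustrative):

```python
import logging

# At the default INFO threshold, DEBUG records are dropped while
# INFO and WARNING records still reach the handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

logger.debug("Loading existing dataset")     # demoted in this commit: now silent
logger.info("Fetching data for 2023-01-01")  # unchanged: still shown
logger.warning("Creating new dataset")       # promoted: shown and flagged
```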
utilities/pushshift_data.py CHANGED
@@ -115,7 +115,7 @@ def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> L
     actual_requests = 0
     while after < before:
         after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
-        logger.info(f"Fetching data between timestamps {after_str} and {before_str}")
+        logger.debug(f"Fetching data between timestamps {after_str} and {before_str}")
         data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
         if data is None or len(data["data"]) == 0:
             break
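The demoted fetch message fires once per request inside the pagination loop, so at INFO the scraper now runs quietly. A sketch of how to restore the per-window trace without enabling DEBUG everywhere, again assuming stdlib `logging` (the logger name is a guess based on the module path):

```python
import logging

logging.basicConfig(level=logging.INFO)
# Raise verbosity only for the scraper module; its records propagate to
# the root handler, which emits them regardless of the root logger's level.
logging.getLogger("utilities.pushshift_data").setLevel(logging.DEBUG)
```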