derek-thomas (HF staff) committed
Commit 130902a
1 Parent(s): ffea6b6

Updating log levels

Files changed (2)
  1. main.py +8 -8
  2. utilities/pushshift_data.py +1 -1
main.py CHANGED
@@ -68,23 +68,23 @@ def main(date_to_fetch):
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
-        logger.info("Loading existing dataset")
+        logger.debug("Loading existing dataset")
         if "__index_level_0__" in dataset["all_days"].column_names:
             dataset = dataset.remove_columns(["__index_level_0__"])
     except FileNotFoundError:
-        logger.info("Creating new dataset")
+        logger.warning("Creating new dataset")
         dataset = DatasetDict()
 
     # Call get_subreddit_day with the calculated date
     logger.info(f"Fetching data for {str(date_to_fetch)}")
     submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
     df = submissions_to_dataframe(submissions)
-    logger.info(f"Data fetched for {str(date_to_fetch)}")
+    logger.debug(f"Data fetched for {str(date_to_fetch)}")
     most_recent_date = date_to_fetch
 
     # Append DataFrame to split 'all_days' or create new split
     if "all_days" in dataset:
-        logger.info("Appending data to split 'all_days'")
+        logger.debug("Appending data to split 'all_days'")
         # Merge the new submissions
         old_data = dataset['all_days'].to_pandas()
         new_data = pd.concat([old_data, df], ignore_index=True)
@@ -105,13 +105,13 @@ def main(date_to_fetch):
         # Convert back to dataset
         dataset["all_days"] = Dataset.from_pandas(new_data)
     else:
-        logger.info("Creating new split 'all_days'")
+        logger.debug("Creating new split 'all_days'")
         dataset["all_days"] = Dataset.from_pandas(df)
     # Log appending or creating split 'all'
-    logger.info("Appended or created split 'all_days'")
+    logger.debug("Appended or created split 'all_days'")
 
     # Push the augmented dataset to the Hugging Face hub
-    logger.info(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
+    logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
     readme_text = update_readme(dataset_name, subreddit, date_to_fetch)
     dataset.description = readme_text
     dataset.push_to_hub(dataset_name, token=auth_token)
@@ -136,7 +136,7 @@ def run_main_continuously():
     two_days_ago = today - timedelta(days=2)
 
     if start_date <= two_days_ago:
-        logger.info(f"Running main function for date: {start_date}")
+        logger.warning(f"Running main function for date: {start_date}")
         most_recent_date = main(start_date)
         start_date = most_recent_date + timedelta(days=1)
     else:
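Note: the net effect of this commit is to demote routine per-run messages from INFO to DEBUG and promote rare events (creating a brand-new dataset, a catch-up run of `main`) to WARNING. A minimal sketch of how a level threshold filters these calls, assuming the module-level `logger` is Python's stdlib `logging` (the repository may use loguru instead; the logger name below is illustrative):

```python
import logging

# At the default INFO threshold, DEBUG records are dropped while
# INFO and WARNING records still reach the handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

logger.debug("Loading existing dataset")     # demoted in this commit: now silent
logger.info("Fetching data for 2023-01-01")  # unchanged: still shown
logger.warning("Creating new dataset")       # promoted: shown and flagged
```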
utilities/pushshift_data.py CHANGED
@@ -115,7 +115,7 @@ def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> L
     actual_requests = 0
     while after < before:
         after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
-        logger.info(f"Fetching data between timestamps {after_str} and {before_str}")
+        logger.debug(f"Fetching data between timestamps {after_str} and {before_str}")
         data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
         if data is None or len(data["data"]) == 0:
             break
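The demoted fetch message fires once per request inside the pagination loop, so at INFO the scraper now runs quietly. A sketch of how to restore the per-window trace without enabling DEBUG everywhere, again assuming stdlib `logging` (the logger name is a guess based on the module path):

```python
import logging

logging.basicConfig(level=logging.INFO)
# Raise verbosity only for the scraper module; its records propagate to
# the root handler, which emits them regardless of the root logger's level.
logging.getLogger("utilities.pushshift_data").setLevel(logging.DEBUG)
```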