Commit 5ec6657
Parent(s): 89f0e00

Updating README.md the right way :'(

Files changed:
- main.py +4 -2
- requirements.txt +1 -2
- utilities/readme_update.py +56 -30
main.py CHANGED
@@ -8,7 +8,7 @@ from datasets import Dataset

from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset
from utilities.my_logger import setup_logger
-from utilities.readme_update import update_readme
+from utilities.readme_update import update_dataset_readme

# Set dataset name, path to README.md, and existing dataset details
subreddit = os.environ["SUBREDDIT"]
@@ -47,13 +47,15 @@ def main():
    dataset['train'] = Dataset.from_pandas(df, preserve_index=False)

    # Update README
-    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
    logger.info(f"Adding {new_rows} rows for {date}.")

    # Push the augmented dataset to the Hugging Face hub
    logger.debug(f"Pushing data for {date} to the Hugging Face hub")
    dataset.push_to_hub(dataset_name, token=auth_token)
    logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
+    logger.info(f"Updating README...")
+    update_dataset_readme(dataset_name=dataset_name, subreddit=subreddit, new_rows=new_rows)
+    logger.info(f"Updated README.")

    # files_cleaned = dataset.cleanup_cache_files()
    # logger.info(f"Removed {files_cleaned} cache files")

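Net effect of the two hunks: main.py now imports update_dataset_readme instead of the old update_readme helper, and the README update runs after the dataset has been pushed rather than before. A condensed sketch of the tail of main() as it reads after this commit, showing only the touched lines for orientation:

# Post-commit tail of main(), condensed from the diff above for readability.
dataset.push_to_hub(dataset_name, token=auth_token)
logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")

logger.info(f"Updating README...")
update_dataset_readme(dataset_name=dataset_name, subreddit=subreddit, new_rows=new_rows)
logger.info(f"Updated README.")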
requirements.txt CHANGED
@@ -1,8 +1,7 @@
praw==7.7.1
gradio==3.50.2
nbdev==2.3.12
-
-git+https://github.com/huggingface/datasets.git@81b3ccfc016f6a39837334a0173dac3f59112856
+datasets==2.14.6
requests==2.28.2
loguru==0.7.0
rich==13.3.4
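The dependency change swaps the commit-pinned git checkout of datasets for the released 2.14.6 wheel, which main.py relies on for Dataset.from_pandas and push_to_hub. A hypothetical post-install sanity check (not part of the repository) to confirm the pin resolved:

# Hypothetical check after running pip install -r requirements.txt; not part of the repo.
import datasets

print(datasets.__version__)  # expected: 2.14.6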
utilities/readme_update.py CHANGED
@@ -2,26 +2,63 @@ import os
from datetime import datetime

import pytz
-from
-from datasets.utils.file_utils import cached_path
-from datasets.utils.hub import hf_hub_url
+from huggingface_hub import HfApi, Repository

frequency = os.environ.get("FREQUENCY", '').lower()
-
-
-
-
-
-
-
-
-
+GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
+hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
+local_repo_path = "./readme_repo"
+
+
+def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
+    """
+    Update the README file of a specified dataset repository with new information.
+
+    Args:
+        dataset_name (str): Name of the dataset repository.
+        subreddit (str): Name of the subreddit being used for dataset creation.
+        new_rows (int): Number of new rows added in the latest update.
+        hf_token (str): Hugging Face authentication token.
+        local_repo_path (str): Local path to clone the repository.
+    """
+    # Initialize HfApi
+    api = HfApi()
+
+    # Clone the repository locally
+    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)
+
+    # Read the README file
+    with open(f"{local_repo_path}/README.md", "r") as file:
+        old_readme = file.read()
+
+    # Modify the README
+    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)
+
+    # Write the updated README back to the repository
+    with open(f"{local_repo_path}/README.md", "w") as file:
+        file.write(new_readme)
+
+    # Push the changes
+    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows}')
+
+def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
+    """
+    Append new information to the existing README content.
+
+    Args:
+        subreddit (str): Name of the subreddit.
+        new_rows (int): Number of new rows added.
+        old_readme (str): Existing README content.
+
+    Returns:
+        str: Updated README content.
+    """
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
-The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions.
+The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to get downloads.

There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.

@@ -33,27 +70,16 @@ The dataset is updated {frequency} with the most recent update being `{latest_ho

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
-
-
-My take is that you can't use this data for *training* without getting permission.
+[License information]

## Opt-out
To opt-out of this dataset please make a request in the community tab
"""

-
-
-
-def append_readme(path, readme_text):
-    generated_below_marker = "--- Generated Part of README Below ---"
-    with open(path, "r") as file:
-        content = file.read()
-
-    if generated_below_marker in content:
-        index = content.index(generated_below_marker) + len(generated_below_marker)
-        content = content[:index] + "\n\n" + readme_text
+    if GENERATED_BELOW_MARKER in old_readme:
+        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
+        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
-
+        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

-
-        file.write(content)
+    return new_readme
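For reference, a minimal sketch of how the new helper could be exercised on its own, assuming the utilities package is importable. The token, dataset id, and subreddit values below are placeholders, not values from this commit; note that readme_update.py reads HUGGINGFACE_AUTH_TOKEN and FREQUENCY from the environment at import time, so they must be set first.

# Hypothetical driver for update_dataset_readme; all identifiers below are placeholders.
import os

os.environ["HUGGINGFACE_AUTH_TOKEN"] = "hf_xxx"  # read at import time by readme_update.py
os.environ["FREQUENCY"] = "hourly"               # interpolated into the generated README text

from utilities.readme_update import update_dataset_readme

update_dataset_readme(
    dataset_name="some-user/some-dataset",  # placeholder dataset repo id on the Hub
    subreddit="some_subreddit",             # placeholder subreddit name
    new_rows=100,                           # placeholder count, used in the push commit message
)

The helper clones the dataset repo into ./readme_repo via huggingface_hub's Repository class, which newer huggingface_hub releases deprecate in favor of HfApi-based uploads, so its behavior depends on the huggingface_hub version the Space installs.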