Commit ef9cbc8 by derek-thomas
Parent(s): 3772eaf

Fix async bug and add readme

Files changed (3):
  1. app.py +6 -1
  2. src/readme_update.py +93 -0
  3. src/utilities.py +5 -5
app.py CHANGED
@@ -8,6 +8,7 @@ from src.my_logger import setup_logger
 from src.utilities import load_datasets, merge_and_update_datasets
 from src.visualize_logs import log_file_to_html_string
 from src.build_nomic import build_nomic
+from src.readme_update import update_dataset_readme

 proj_dir = Path(__name__).parent

@@ -96,13 +97,17 @@ async def community(payload: WebhookPayload):
     logger.info(f"Loaded new dataset")

     logger.info(f"Merging and Updating row...")
-    dataset = merge_and_update_datasets(dataset, original_dataset)
+    dataset, updated_row_count = merge_and_update_datasets(dataset, original_dataset)

     # Push the augmented dataset to the Hugging Face hub
     logger.info(f"Pushing processed data to the Hugging Face Hub...")
     dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
     logger.info(f"Pushed processed data to the Hugging Face Hub")

+    update_dataset_readme(dataset_name=PROCESSED_DATASET, subreddit=SUBREDDIT, new_rows=updated_row_count)
+    logger.info(f"Updated README.")
+
+    # Build Nomic
     logger.info(f"Building Nomic...")
     build_nomic(dataset=dataset)
     logger.info(f"Built Nomic")
src/readme_update.py ADDED
@@ -0,0 +1,93 @@
+import os
+from datetime import datetime
+from pathlib import Path
+from shutil import rmtree
+
+import pytz
+from huggingface_hub import HfApi, Repository
+
+GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
+hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
+local_repo_path = "./readme_repo"
+
+
+def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
+    """
+    Update the README file of a specified dataset repository with new information.
+
+    Args:
+        dataset_name (str): Name of the dataset repository.
+        subreddit (str): Name of the subreddit being used for dataset creation.
+        new_rows (int): Number of new rows added in the latest update.
+        hf_token (str): Hugging Face authentication token.
+        local_repo_path (str): Local path to clone the repository.
+    """
+    # Initialize HfApi
+    api = HfApi()
+
+    if Path(local_repo_path).exists():
+        rmtree(local_repo_path)
+
+    # Clone the repository locally
+    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)
+
+    # Read the README file
+    with open(f"{local_repo_path}/README.md", "r") as file:
+        old_readme = file.read()
+
+    # Modify the README
+    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)
+
+    # Write the updated README back to the repository
+    with open(f"{local_repo_path}/README.md", "w") as file:
+        file.write(new_readme)
+
+    # Push the changes
+    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')
+
+
+def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
+    """
+    Append new information to the existing README content.
+
+    Args:
+        subreddit (str): Name of the subreddit.
+        new_rows (int): Number of new rows added.
+        old_readme (str): Existing README content.
+
+    Returns:
+        str: Updated README content.
+    """
+    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
+    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
+
+    readme_text = f"""
+## Dataset Overview
+This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
+and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
+`content` field.
+
+The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.
+
+The last run was on {latest_hour_str} and updated {new_rows} rows.
+
+## Creation Details
+This is done by triggering [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
+based on a repository update webhook to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai)
+visualization.
+
+## Update Frequency
+The dataset is updated based on a webhook trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
+is updated, this dataset will be updated.
+
+## Opt-out
+To opt-out of this dataset please make a request in the community tab
+"""
+
+    if GENERATED_BELOW_MARKER in old_readme:
+        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
+        new_readme = old_readme[:index] + "\n\n" + readme_text
+    else:
+        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"
+
+    return new_readme
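The marker logic above is easy to sanity-check locally. A small sketch (not part of the commit; it assumes the Space's requirements are installed and HUGGINGFACE_AUTH_TOKEN is set, since readme_update.py reads that variable at import time, and it uses an example subreddit value): the first call appends the marker plus the generated section, and every later call rewrites only what follows the marker, leaving hand-written content untouched.

from src.readme_update import GENERATED_BELOW_MARKER, append_to_readme

hand_written = "# My dataset\n\nCurated intro that should never be overwritten."

first = append_to_readme(subreddit="bestofredditorupdates", new_rows=3, old_readme=hand_written)
second = append_to_readme(subreddit="bestofredditorupdates", new_rows=7, old_readme=first)

assert second.startswith(hand_written)              # hand-written part survives
assert second.count(GENERATED_BELOW_MARKER) == 1    # the marker is not duplicated
assert "updated 7" in second and "updated 3" not in second  # generated part is replaced, not appended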
src/utilities.py CHANGED
@@ -15,7 +15,7 @@ PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
 logger = setup_logger(__name__)


-async def load_datasets():
+def load_datasets():
     # Get latest datasets locally
     logger.debug(f"Trying to download {PROCESSED_DATASET}")
     dataset = load_dataset(PROCESSED_DATASET, download_mode=DownloadMode.FORCE_REDOWNLOAD)
@@ -38,7 +38,7 @@ def merge_and_update_datasets(dataset, original_dataset):
     # Step 1: Merge df onto odf
     # We'll bring in 'content' and 'embedding' from df to compare and possibly update 'embedding'
     merged_df = pd.merge(odf, df[['id', 'content', 'embedding']], on='id', how='left', suffixes=('_odf', ''))
-    updated_rows = len(merged_df[merged_df.content != merged_df.content_odf])
+    updated_row_count = len(merged_df[merged_df.content != merged_df.content_odf])

     # Step 2: Compare 'content' from odf and df, update 'embedding' if they differ
     merged_df['embedding'] = np.where(merged_df['content_odf'] != merged_df['content'], None, merged_df['embedding'])
@@ -48,15 +48,15 @@ def merge_and_update_datasets(dataset, original_dataset):
     merged_df = merged_df.drop(columns=['content', 'new', 'updated'])  # Update columns to match df
     merged_df.rename(columns={'content_odf': 'content'}, inplace=True)  # Rename 'content_odf' back to 'content'

-    logger.info(f"Updating {updated_rows} rows...")
+    logger.info(f"Updating {updated_row_count} rows...")
     # Iterate over the DataFrame rows where 'embedding' is None
     for index, row in merged_df[merged_df['embedding'].isnull()].iterrows():
         # Update 'embedding' for the current row using our function
         merged_df.at[index, 'embedding'] = update_embeddings(content=row['content'], client=client)

     dataset['train'] = Dataset.from_pandas(merged_df)
-    logger.info(f"Updated {updated_rows} rows")
-    return dataset
+    logger.info(f"Updated {updated_row_count} rows")
+    return dataset, updated_row_count


 def update_embeddings(content, client):
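For reference, a toy illustration (not from the commit, with made-up data) of what the renamed updated_row_count measures after the left merge in merge_and_update_datasets: the rows whose content in the freshly loaded dataset no longer matches the processed copy, which are the same rows whose embedding is reset to None and recomputed.

import numpy as np
import pandas as pd

# odf: freshly loaded original rows; df: previously processed rows with embeddings.
odf = pd.DataFrame({'id': [1, 2, 3], 'content': ['a', 'b', 'c-edited']})
df = pd.DataFrame({'id': [1, 2, 3], 'content': ['a', 'b', 'c'], 'embedding': [[0.1], [0.2], [0.3]]})

merged_df = pd.merge(odf, df[['id', 'content', 'embedding']], on='id', how='left', suffixes=('_odf', ''))
updated_row_count = len(merged_df[merged_df.content != merged_df.content_odf])
print(updated_row_count)  # 1 -> only id=3 changed

# Rows with changed content lose their embedding so it is recomputed downstream.
merged_df['embedding'] = np.where(merged_df['content_odf'] != merged_df['content'], None, merged_df['embedding'])
print(merged_df['embedding'].isnull().sum())  # 1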