File size: 3,412 Bytes
ef9cbc8
 
 
 
5f3e4e7
ef9cbc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f3e4e7
 
 
ef9cbc8
5f3e4e7
 
ef9cbc8
 
5f3e4e7
ef9cbc8
 
5f3e4e7
 
 
 
 
 
 
 
 
ef9cbc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9930cd7
ef9cbc8
 
 
9930cd7
 
ef9cbc8
 
9930cd7
ef9cbc8
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
from datetime import datetime

import pytz
from huggingface_hub import HfApi

# Sentinel line separating the hand-written part of a README from the
# auto-generated part. append_to_readme keeps everything up to and including
# this marker and replaces whatever follows it; if the marker is absent it is
# appended before the generated text.
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Downloads the current README.md of the dataset repository, rebuilds its
    generated section with `append_to_readme`, and uploads the result back as
    a new commit.

    Args:
    dataset_name (str): Name of the dataset repository.
    subreddit (str): Name of the subreddit being used for dataset creation.
    new_rows (int): Number of new rows added in the latest update.
    """
    # The token used to come from an undefined `hf_token` global, which raised
    # NameError on every call. Read it from the environment instead.
    # NOTE(review): the variable name HF_TOKEN is assumed - confirm it matches
    # the deployment. When unset, None is passed and huggingface_hub falls back
    # to its own credential lookup.
    token = os.environ.get("HF_TOKEN")

    # Initialize HfApi
    api = HfApi()

    # Download README file
    readme_path = api.hf_hub_download(
        repo_id=dataset_name,
        repo_type="dataset",
        filename="README.md",
        token=token,
    )

    # Read it. Decode explicitly as UTF-8 so the round trip matches the UTF-8
    # encoding used for the upload below, regardless of the platform locale.
    with open(readme_path, "r", encoding="utf-8") as file:
        old_readme = file.read()

    # Modify it
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Commit modifications
    api.upload_file(
        path_or_fileobj=new_readme.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        token=token,
        commit_message=f'Pushing {new_rows} new rows',
    )


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Build the new README content by regenerating its auto-generated section.

    Everything up to and including the first GENERATED_BELOW_MARKER in the
    existing README is kept and the text after it is replaced with a freshly
    rendered section. If the marker is not present, the marker and the
    rendered section are appended to the existing content.

    Args:
    subreddit (str): Name of the subreddit.
    new_rows (int): Number of new rows added.
    old_readme (str): Existing README content.

    Returns:
    str: Updated README content.
    """
    # Current UTC time truncated to the start of the hour.
    run_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    run_stamp = run_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    # Links that appear more than once in the generated section.
    source_link = (
        f"[derek-thomas/dataset-creator-reddit-{subreddit}]"
        f"(https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})"
    )
    webhook_link = "[webhook](https://huggingface.co/docs/hub/en/webhooks)"
    space_url = "https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates"

    # One entry per output line. The leading and trailing empty entries give
    # the text its opening and closing newline; trailing spaces inside some
    # entries are intentional and part of the emitted text.
    section_lines = [
        "",
        "## Dataset Overview",
        f"This dataset is based on {source_link} ",
        "and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the",
        "`content` field.",
        "",
        "The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.",
        "",
        f"The last run was on {run_stamp} and updated {new_rows} new rows.",
        "",
        "## Creation Details",
        f"This is done by triggering [derek-thomas/processing-bestofredditorupdates]({space_url}) ",
        f"based on a repository update {webhook_link} to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai) ",
        f"visualization. This is done by this [processing space]({space_url}).",
        "",
        "## Update Frequency",
        f"The dataset is updated based on a {webhook_link} trigger, so each time {source_link}",
        "is updated, this dataset will be updated. ",
        "",
        "## Opt-out",
        "To opt-out of this dataset please make a request in the community tab",
        "",
    ]
    generated = "\n".join(section_lines)

    kept, marker, _discarded = old_readme.partition(GENERATED_BELOW_MARKER)
    if not marker:
        # No marker yet: keep the whole README and add the marker plus section.
        return f"{old_readme}\n\n{GENERATED_BELOW_MARKER}\n\n{generated}\n"

    # Marker found: drop the previously generated section after it.
    return f"{kept}{marker}\n\n{generated}"