import os
from pathlib import Path
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer
from utilities.my_logger import setup_logger
from utilities.visualize_logs import log_file_to_html_string

# Project root, resolved relative to this file so bundled media can be located
proj_dir = Path(__file__).parent
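
# Required configuration, supplied as Space variables / secrets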
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ["daily", "hourly"]:
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")
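
# Secret used by the WebhooksServer to validate incoming webhook calls (falls back to 'secret' if unset)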
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
logger = setup_logger(__name__)
intro_md = f"""
# Reddit Dataset Creator
This is a Reddit dataset creator that builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}),
pulling from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.
As shown in the diagram below, this Space pulls data from Reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and writes it to the corresponding dataset.
"""
how_to_md = f"""
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps) and choose 'Script for personal use'
    - The Redirect URI can be anything; I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application.
    - `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
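
The variables/secrets this `app.py` reads are sketched below with placeholder values; the Reddit credentials from step 1 are used on the `main.py` side when pulling from Reddit, so name them whatever the duplicate prompt asks for:

```python
# Space variables / secrets read by app.py -- example values only
SUBREDDIT = "amitheasshole"             # subreddit to pull from
USERNAME = "your-hf-username"           # Hugging Face account that owns the dataset
FREQUENCY = "daily"                     # must be 'daily' or 'hourly'
HF_WEBHOOK_SECRET = "a-random-string"   # authenticates calls to the /dataset_repo webhook
```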
"""
how_does_it_work_md = f"""
# Core Components
There are two core components: [main](main.py) and [app](app.py).
Main does a few things:
- Pulls from a datasource
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)
App does one thing:
- Visualizes the log file from Main
# Running it
This uses a Docker Space so that I can run supervisor. Supervisor lets me kick off both processes and manage their
log files. I use Gradio for `app` and map it to the open port of Hugging Face Spaces.
The only communication between `app` and `main` is the log file.
"""
js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

with gr.Blocks(js=js_func) as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(str(proj_dir / 'media' / 'reddit_scraper.drawio.png'), type='filepath')
        gr.Markdown("# Logs")
        # Re-render the log written by main as HTML, refreshing every second
        output = gr.HTML(log_file_to_html_string, every=1)
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)
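
# Wrap the Gradio UI in a WebhooksServer so the Space serves both the app and the webhook endpoint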
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)
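
# Webhook endpoint registered at /dataset_repo; logs repo-scoped events sent by the Hub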
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    if payload.event.scope.startswith("repo"):
        logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")

if __name__ == '__main__':
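    # share=False: no public share link, which was causing Hub issues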
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860, share=False)
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860, share=False)