derek-thomas's picture
derek-thomas HF staff
Updating details
b6f8c08
raw
history blame
5.17 kB
import os
from pathlib import Path
from fastapi import BackgroundTasks, Response, status
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer
from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic
from src.readme_update import update_dataset_readme
# Directory that contains this file. ``__file__`` is used instead of
# ``__name__``: ``Path(__name__)`` merely treats the module name as a relative
# path, so its parent is always ``.`` no matter where the file actually lives.
proj_dir = Path(__file__).parent

logger = setup_logger(__name__)
logger.info("Starting Application...")

# Required settings: a missing variable raises ``KeyError`` at import time.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
# Built from the two values above; presumably the repo id of the original
# (unprocessed) dataset -- confirm against ``src.utilities``.
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
# Repo id the processed dataset is pushed to by ``_process_webhook``.
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]

# Secret handed to ``WebhooksServer``. The fallback value is kept for backward
# compatibility, but it is a publicly known default, so make its use visible.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
if "HF_WEBHOOK_SECRET" not in os.environ:
    logger.warning("HF_WEBHOOK_SECRET is not set; falling back to the default webhook secret.")
# Markdown shown at the top of the "Application" tab (passed to gr.Markdown).
# The text is user-facing and rendered verbatim, so it is left untouched.
# NOTE(review): the three "Filtering ..." bullets read like sub-items of
# "The filter can help you narrow by field," but sit at column 0 here -- confirm
# whether markdown nesting (leading spaces) is intended.
intro_md = """
# Processing BORU
This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/).
Have you ever been curious to search for stories that are similar to one of your favorites? This can help!
- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
- The lasso can help you search in a smaller range that you drag with your mouse
- The filter can help you narrow by field,
- Filtering posts that are `CONCLUDED`
- Filtering popular posts
- Filtering by date
- The search can help you look by keyword
Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""
# Markdown shown in the "Details" tab (passed to gr.Markdown). It describes the
# webhook -> embeddings -> processed dataset -> Nomic Atlas pipeline in prose.
# The text is user-facing and rendered verbatim, so it is left untouched.
details_md = """
# Details
## Creation Details
1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates).
2. It then takes the updates from that dataset and get embeddings by making leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings)
- [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) is using [zero-spaces](https://huggingface.co/zero-gpu-explorers) a free GPU service to compute the model [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
- Im calling this via [gradio_client](https://www.gradio.app/docs/client) which allows any space to be used as an API
3. The calculated embeddings are stored in this dataset [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed)
4. These get visualized by [nomic atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py)
"""
# Public Nomic Atlas map that is embedded in the "Application" tab.
url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map"

# Markup for the embedded map. The ``src`` value is quoted and the element is
# explicitly closed: ``<iframe>`` has no implicit end tag, so leaving it open
# makes the parser treat whatever follows as the frame's fallback content.
html_str = (
    f'<iframe src="{url}" style="border:none;height:1024px;width:100%" '
    'allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'
)
# Three-tab Gradio UI: the embedded map, a log viewer and the static details.
with gr.Blocks() as ui:
    with gr.Tab(label="Application"):
        gr.Markdown(value=intro_md)
        gr.HTML(value=html_str)

    with gr.Tab(label="Logs"):
        gr.Markdown(value="# Logs")
        # The function itself (not its result) is passed together with
        # ``every=1``; per Gradio's component API a callable value is
        # re-evaluated on that interval, in seconds.
        output = gr.HTML(value=log_file_to_html_string, every=1)

    with gr.Tab(label="Details"):
        gr.Markdown(value=details_md)

# Webhook server wrapping the queued UI; endpoints are registered on ``app``.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload, task_queue: BackgroundTasks):
    """Handle a webhook call on ``/dataset_repo``.

    Events whose scope starts with ``"repo"`` schedule ``_process_webhook`` as
    a background task and are answered with 202. Any other event is
    acknowledged with 200 and nothing is scheduled.
    """
    is_repo_event = payload.event.scope.startswith("repo")
    if is_repo_event:
        logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
        # Run the processing after the response is sent so the caller gets an
        # immediate acknowledgement.
        task_queue.add_task(_process_webhook, payload=payload)
        return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)

    return Response("No task scheduled", status_code=status.HTTP_200_OK)
def _process_webhook(payload: WebhookPayload):
    """Run the full refresh pipeline scheduled by the webhook handler.

    Steps, in order: load both datasets, merge/update the rows, push the
    result to ``PROCESSED_DATASET`` on the Hugging Face Hub, update that
    dataset's README with the number of updated rows, and rebuild the Nomic
    map from the merged dataset.

    Args:
        payload: The webhook payload that triggered the run. It is accepted
            for the background-task call but not read here.

    Raises:
        Exception: Any error from a pipeline step is logged through the
            application logger and then re-raised unchanged.
    """
    try:
        logger.info("Loading new dataset...")
        dataset, original_dataset = load_datasets()
        logger.info("Loaded new dataset")

        logger.info("Merging and Updating rows...")
        dataset, updated_row_count = merge_and_update_datasets(dataset, original_dataset)
        logger.info("Merged and Updated rows")

        # Push the augmented dataset to the Hugging Face hub
        logger.info("Pushing processed data to the Hugging Face Hub...")
        dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
        logger.info("Pushed processed data to the Hugging Face Hub")

        update_dataset_readme(dataset_name=PROCESSED_DATASET, subreddit=SUBREDDIT, new_rows=updated_row_count)
        logger.info("Updated README.")

        # Build Nomic
        logger.info("Building Nomic...")
        build_nomic(dataset=dataset)
        logger.info("Built Nomic")

        logger.info("Update from webhook completed!")
    except Exception:
        # This runs as a background task after the HTTP response was sent, so
        # record the traceback with the application logger before propagating;
        # otherwise the failure is missing from the application's own log.
        logger.exception("Update from webhook failed")
        raise
if __name__ == "__main__":
    # Listen on every interface, port 7860, and surface errors in the UI.
    launch_options = dict(server_name="0.0.0.0", server_port=7860, show_error=True)
    app.launch(**launch_options)
    # UI-only alternative (no webhook server), kept for reference:
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)