import os
from pathlib import Path

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic

proj_dir = Path(__file__).parent

logger = setup_logger(__name__)
logger.info("Starting Application...")

SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", "secret")

intro_md = """
# Processing BORU
This space is triggered by a webhook for changes on
[derek-thomas/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-bestofredditorupdates).
It then takes the updates from that dataset, gets embeddings, and puts the results in
[derek-thomas/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed).

Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""

html_str = """
conll2003
"""
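# A sketch of the environment this Space expects. The variable names come from the
# os.environ lookups above; the values are illustrative, inferred from the dataset
# names in intro_md, not pulled from the real Space configuration:
#
#   SUBREDDIT=bestofredditorupdates
#   USERNAME=derek-thomas
#   PROCESSED_DATASET=derek-thomas/reddit-bestofredditorupdates-processed
#   HUGGINGFACE_AUTH_TOKEN=hf_...   # token with write access, used by push_to_hub
#   HF_WEBHOOK_SECRET=...           # must match the secret configured on the Hub webhook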
""" with gr.Blocks() as ui: with gr.Tab("Application"): gr.Markdown(intro_md) gr.HTML(html_str) with gr.Tab("Logs"): gr.Markdown("# Logs") output = gr.HTML(log_file_to_html_string, every=1) app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET) @app.add_webhook("/dataset_repo") async def community(payload: WebhookPayload): if payload.event.scope.startswith("repo"): logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}") else: return logger.info(f"Loading new dataset...") dataset, original_dataset = load_datasets() logger.info(f"Loaded new dataset") logger.info(f"Merging and Updating row...") dataset = merge_and_update_datasets(dataset, original_dataset) # Push the augmented dataset to the Hugging Face hub logger.info(f"Pushing processed data to the Hugging Face Hub...") dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN) logger.info(f"Pushed processed data to the Hugging Face Hub") logger.info(f"Building Nomic...") build_nomic(dataset=dataset) logger.info(f"Built Nomic") if __name__ == '__main__': app.launch(server_name="0.0.0.0", show_error=True, server_port=7860) # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)