import urllib
from typing import Iterable, Optional

import gradio as gr
import markdown as md
import pandas as pd
from distilabel.cli.pipeline.utils import get_config_from_url
from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
from gradio_modal import Modal
from huggingface_hub import HfApi, HfFileSystem, RepoCard
from huggingface_hub.hf_api import DatasetInfo

# Initialize the Hugging Face API
api = HfApi()
fs = HfFileSystem()

# Maps a pandas dtype name to the datatype label passed to the Leaderboard.
_DTYPE_TO_DATATYPE = {
    "int64": "number",
    "float64": "number",
    "bool": "bool",
    "datetime64[ns]": "date",
    "datetime64[ns, UTC]": "date",
    "object": "str",
}


def _categorize_dtypes(df: pd.DataFrame) -> list[str]:
    """Return one Leaderboard datatype label per column of *df*.

    Columns whose dtype name is not in ``_DTYPE_TO_DATATYPE`` are labelled
    ``"markdown"``.
    """
    return [_DTYPE_TO_DATATYPE.get(str(dtype), "markdown") for dtype in df.dtypes]


def _get_tag_category(entry: list[str], tag_category: str) -> Optional[str]:
    """Return the value of the first tag in *entry* that mentions *tag_category*.

    For a tag such as ``"size_categories:1K<n<10K"`` and the category
    ``"size_categories"`` this returns ``"1K<n<10K"``. Returns ``None`` when
    no tag matches or when *entry* is not a list-like of tags.
    """
    if not isinstance(entry, (list, tuple, set)):
        # Missing tag lists (e.g. None / NaN cells) have nothing to match.
        return None
    prefix = f"{tag_category}:"
    for item in entry:
        if tag_category in item:
            return item.split(prefix)[-1]
    # Only give up after every tag has been inspected, not after the first.
    return None


def _check_pipeline(repo_id: str):
    """Return the pipeline config stored in the dataset repo, if there is one.

    Looks for ``pipeline.json``, ``pipeline.yaml`` and ``pipeline.yml`` (in
    that order) in the dataset repository and loads the first one that exists
    through ``get_config_from_url``. Returns ``None`` when none is found.
    """
    for file_type in (".json", ".yaml", ".yml"):
        file_path = f"datasets/{repo_id}/pipeline{file_type}"
        if fs.exists(file_path):
            url = f"https://huggingface.co/datasets/{repo_id}/raw/main/pipeline{file_type}"
            return get_config_from_url(url)
    return None


def _has_pipline(x) -> str:
    """Return ``"yes"`` if *x* is a string mentioning the pipeline CLI command.

    Any non-string value, or a string without ``"distilabel pipeline run"``,
    yields ``"no"``.
    """
    if isinstance(x, str) and "distilabel pipeline run" in x:
        return "yes"
    return "no"


def _search_distilabel_repos(query: Optional[str] = None) -> pd.DataFrame:
    """List Hub datasets tagged ``library:distilabel`` as a DataFrame.

    Args:
        query: Optional free-text search string forwarded to the Hub search.

    Returns:
        One row per dataset with the derived ``size_categories`` and
        ``has_pipeline`` columns, and the most relevant columns moved to the
        front. An empty frame with just those leading columns is returned
        when the search yields no datasets.
    """
    subset_columns = [
        "id",
        "likes",
        "downloads",
        "size_categories",
        "has_pipeline",
        "last_modified",
        "description",
    ]
    # The free-text query goes through the dedicated `search` parameter
    # instead of being spliced into the `filter` value.
    datasets: Iterable[DatasetInfo] = api.list_datasets(
        filter="library:distilabel",
        search=query or None,
    )
    data = [ex.__dict__ for ex in datasets]
    if not data:
        # Without records the frame has no `tags`/`description` columns.
        return pd.DataFrame(columns=subset_columns)
    df = pd.DataFrame.from_records(data)
    df["size_categories"] = df.tags.apply(_get_tag_category, args=("size_categories",))
    df["has_pipeline"] = df.description.apply(_has_pipline)
    new_column_order = subset_columns + [
        col for col in df.columns if col not in subset_columns
    ]
    return df[new_column_order]


def _create_modal_info(row: dict) -> tuple[str, object]:
    """Build the modal content for the dataset described by *row*.

    Args:
        row: A leaderboard row as a dict; only its ``"id"`` key is read.

    Returns:
        A ``(html, pipeline)`` pair: the HTML for the "dataset" tab (title,
        embedded dataset viewer and rendered dataset card) and whatever
        ``_check_pipeline`` returns for the repo (``None`` when no pipeline
        file exists).
    """
    # NOTE(review): the HTML markup in this function was reconstructed (title
    # heading, viewer iframe, "<br>" separator); confirm it matches the
    # intended layout.

    def _get_main_title(repo_id):
        return f"<h1>{repo_id}</h1>"

    def _embed_dataset_viewer(repo_id):
        return (
            f"""<iframe
  src="https://huggingface.co/datasets/{repo_id}/embed/viewer"
  frameborder="0"
  width="100%"
  height="560px"
></iframe>"""
        )

    def _get_dataset_card(repo_id):
        card = RepoCard.load(repo_id_or_path=repo_id, repo_type="dataset")
        return md.markdown(card.text)

    repo_id = row["id"]
    dataset_html = "<br>".join(
        [
            _get_main_title(repo_id=repo_id),
            _embed_dataset_viewer(repo_id=repo_id),
            _get_dataset_card(repo_id=repo_id),
        ]
    )
    return dataset_html, _check_pipeline(repo_id=repo_id)


# Define the Gradio interface
with gr.Blocks(delete_cache=[1, 1]) as demo:
    gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
    gr.HTML("Select a repo_id to show the pipeline, dataset viewer and model card.")

    # The dataset listing is fetched once, while the UI is being built.
    df: pd.DataFrame = _search_distilabel_repos()
    max_likes = df.likes.max()
    max_downloads = df.downloads.max()

    leader_board = Leaderboard(
        value=df,
        datatype=_categorize_dtypes(df),
        search_columns=SearchColumns(
            primary_column="id",
            secondary_columns=["description", "author"],
            placeholder="Search by id, description or author. To search by description or author, type 'description:', 'author:'",
            label="Search",
        ),
        filter_columns=[
            ColumnFilter("likes", type="slider", min=0, max=max_likes, default=[0, max_likes]),
            ColumnFilter(
                "downloads",
                type="slider",
                min=0,
                max=max_downloads,
                default=[0, max_downloads],
            ),
            ColumnFilter("size_categories", type="checkboxgroup"),
            ColumnFilter("has_pipeline", type="dropdown"),
        ],
        hide_columns=[
            "_id",
            "private",
            "gated",
            "disabled",
            "sha",
            "downloads_all_time",
            "paperswithcode_id",
            "tags",
            "siblings",
            "cardData",
            "lastModified",
            "card_data",
            "key",
        ],
        select_columns=SelectColumns(
            default_selection=[
                "id",
                "last_modified",
                "downloads",
                "likes",
                "has_pipeline",
                "size_categories",
            ],
            cant_deselect=["id"],
            label="Select The Columns",
            info="Helpful information",
        ),
    )

    with Modal() as modal:
        with gr.Tab(label="dataset"):
            markdown_1 = gr.HTML()
        with gr.Tab(label="pipeline"):
            markdown_2 = gr.JSON()

    def update(leader_board: pd.DataFrame, markdown_1, markdown_2, evt: gr.SelectData):
        """Open the modal for the selected repo id, or keep it hidden.

        Returns the leaderboard unchanged, the content for both modal tabs
        and a ``Modal`` whose visibility reflects whether the selected value
        matched a row's ``id``.
        """
        # NOTE(review): presumably a cell selection yields a non-int index
        # (e.g. a row/column pair) - confirm against gr.SelectData.
        if isinstance(evt.index, int):
            return leader_board, markdown_1, markdown_2, Modal(visible=False)

        matching_rows = leader_board[leader_board["id"] == evt.value]
        if matching_rows.empty:
            # The selected cell is not a repo id: keep the modal closed.
            modal = Modal(visible=False)
            gr.Info("Press a cell with the repo id.")
            return leader_board, markdown_1, markdown_2, modal

        row = matching_rows.iloc[0].to_dict()
        markdown_1, markdown_2 = _create_modal_info(row=row)
        return leader_board, markdown_1, markdown_2, Modal(visible=True)

    leader_board.select(
        update,
        [leader_board, markdown_1, markdown_2],
        [leader_board, markdown_1, markdown_2, modal],
    )

if __name__ == "__main__":
    demo.launch()