import pandas as pd import numpy as np import os import pickle from prefixspan import PrefixSpan import gradio as gr import altair as alt from plot import Plot alt.data_transformers.enable("vegafusion") # from dynabench.task_evaluator import * BASE_DIR = "db" MODELS = ['qwenvl-chat', 'qwenvl', 'llava15-7b', 'llava15-13b', 'instructblip-vicuna13b', 'instructblip-vicuna7b'] VIDEO_MODELS = ['video-chat2-7b','video-llama2-7b','video-llama2-13b','chat-univi-7b','chat-univi-13b','video-llava-7b','video-chatgpt-7b'] domains = ["imageqa-2d-sticker", "imageqa-3d-tabletop", "imageqa-scene-graph", "videoqa-3d-tabletop", "videoqa-scene-graph"] domain2folder = {"imageqa-2d-sticker": "2d", "imageqa-3d-tabletop": "3d", "imageqa-scene-graph": "sg", "videoqa-3d-tabletop": "video-3d", "videoqa-scene-graph": "video-sg", None: '2d'} def find_frequent_patterns(k, df, scores=None): if len(df) == 0: return [] df = df.reset_index(drop=True) cols = df.columns.to_list() df = df.fillna('').astype('str') db = [[(c, v) for c, v in zip(cols, d) if v] for d in df.values.tolist()] ps = PrefixSpan(db) patterns = ps.topk(k, closed=True) if scores is None: return patterns else: aggregated_scores = [] scores = np.asarray(scores) for count, pattern in patterns: q = ' and '.join([f"`{k}` == {repr(v)}" for k, v in pattern]) indices = df.query(q).index.to_numpy() aggregated_scores.append(np.mean(scores[indices])) return patterns, aggregated_scores def update_partition_and_models(domain): domain = domain2folder[domain] path = f"{BASE_DIR}/{domain}" if os.path.exists(path): partitions = list_directories(path) if domain.find("video") > -1: model = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="model") else: model = gr.Dropdown(MODELS, value=MODELS[0], label="model") partition = gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator") return [partition, model] else: partition = gr.Dropdown([], value=None, label="task space of the following task generator") 
model = gr.Dropdown([], value=None, label="model") return [partition, model] def update_partition_and_models_and_baselines(domain): domain = domain2folder[domain] path = f"{BASE_DIR}/{domain}" if os.path.exists(path): partitions = list_directories(path) if domain.find("video") > -1: model = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="model") baseline = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="baseline") else: model = gr.Dropdown(MODELS, value=MODELS[0], label="model") baseline = gr.Dropdown(MODELS, value=MODELS[0], label="baseline") partition = gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator") else: partition = gr.Dropdown([], value=None, label="task space of the following task generator") model = gr.Dropdown([], value=None, label="model") baseline = gr.Dropdown([], value=None, label="baseline") return [partition, model, baseline] def get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if not os.path.exists(data_path): return [] else: merged_df = pd.read_csv(data_path) merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True) df = merged_df select_top = rank == "top" # Model X is good / bad at for model in models: if baseline: df = df[df[model] >= df[baseline]] else: if select_top: df = df[df[model] >= threshold] else: df = df[df[model] <= threshold] if not baseline: df['mean score'] = df[models].mean(axis=1) df = df.sort_values(by='mean score', ascending=False) df = df.iloc[:k, :] if select_top else df.iloc[-k:, :] task_ids = list(df.index) return task_ids def plot_patterns(domain, partition, models, rank, k, threshold, baseline, pattern, order): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv" if not os.path.exists(data_path): return None task_ids = get_filtered_task_ids(domain, 
partition, models, rank, k, threshold, baseline) expand_df = pd.read_csv(data_path) chart_df = expand_df[expand_df['model'].isin((models + [baseline]) if baseline else models)] chart_df = chart_df[chart_df['task id'].isin(task_ids)] print(pattern) freq, cols = eval(pattern) pattern_str = "" df = chart_df for col in cols: col_name, col_val = col try: col_val = int(col_val) except: col_val = col_val df = df[df[col_name] == col_val] pattern_str += f"{col_name} = {col_val}, " print(len(df)) if baseline: model_str = (', '.join(models) if len(models) > 1 else models[0]) phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs' title = f"{phrase} better than {baseline} on {freq} tasks where {pattern_str[:-2]}" else: title = f"Models are {'best' if rank == 'top' else 'worst'} at {freq} tasks where {pattern_str[:-2]}" chart = alt.Chart(df).mark_bar().encode( alt.X('model:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean"), axis=alt.Axis(labels=False, tickSize=0)), # no title, no label angle), alt.Y('mean(score):Q', scale=alt.Scale(zero=True)), alt.Color('model:N').legend(), ).properties( width=400, height=300, title=title ) return chart def plot_embedding(domain, partition, category): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if os.path.exists(data_path): merged_df = pd.read_csv(data_path) # models = merged_df.columns has_image = 'image' in merged_df chart = alt.Chart(merged_df).mark_point(size=30, filled=True).encode( alt.OpacityValue(0.5), alt.X('x:Q'), alt.Y('y:Q'), alt.Color(f'{category}:N'), tooltip=['question', 'answer'] + (['image'] if has_image else []), ).properties( width=800, height=800, title="UMAP Projected Task Embeddings" ).configure_axis( labelFontSize=25, titleFontSize=25, ).configure_title( fontSize=40 ).configure_legend( labelFontSize=25, titleFontSize=25, ).interactive() return chart else: return None def plot_multi_models(domain, partition, category, 
cat_options, models, order, pattern, aggregate="mean"): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv" if not os.path.exists(data_path): return None expand_df = pd.read_csv(data_path) print(pattern) if pattern is not None: df = expand_df freq, cols = eval(pattern) pattern_str = "" for col in cols: col_name, col_val = col try: col_val = int(col_val) except: col_val = col_val df = df[df[col_name] == col_val] pattern_str += f"{col_name} = {col_val}, " chart = alt.Chart(df).mark_bar().encode( alt.X('model:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op="mean"), axis=alt.Axis(labels=False, tickSize=0)), # no title, no label angle), alt.Y('mean(score):Q', scale=alt.Scale(zero=True)), alt.Color('model:N').legend(), ).properties( width=200, height=100, title=f"How do models perform on tasks where {pattern_str[:-2]} (N={freq})?" ) return chart else: df = expand_df[(expand_df['model'].isin(models)) & (expand_df[category].isin(cat_options))] if len(models) > 1: chart = alt.Chart(df).mark_bar().encode( alt.X('model:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean"), axis=alt.Axis(labels=False, tickSize=0, title=None)), alt.Y('mean(score):Q', scale=alt.Scale(zero=True)), alt.Color('model:N').legend(), alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom')) ).properties( width=200, height=100, title=f"How do models perform across {category}?" ) else: chart = alt.Chart(df).mark_bar().encode( alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean")), # no title, no label angle), alt.Y('mean(score):Q', scale=alt.Scale(zero=True)), alt.Color(f'{category}:N').legend(None), ).properties( width=200, height=100, title=f"How does {models[0]} perform across {category}?" 
) chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle') return chart def plot(domain, partition, models, rank, k, threshold, baseline, order, category, cat_options): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" expand_data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv" # task_plan.reset_index(inplace=True) if not os.path.exists(data_path) or not os.path.exists(expand_data_path): return None else: merged_df = pd.read_csv(data_path) merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True) expand_df = pd.read_csv(expand_data_path) df = merged_df select_top = rank == "top" # Model X is good / bad at for model in models: if baseline: df = df[df[model] >= df[baseline]] else: if select_top: df = df[df[model] >= threshold] else: df = df[df[model] <= threshold] if not baseline: df['mean score'] = df[models].mean(axis=1) df = df.sort_values(by='mean score', ascending=False) df = df.iloc[:k, :] if select_top else df.iloc[-k:, :] task_ids = list(df.index) if baseline: models += [baseline] chart_df = expand_df[expand_df['model'].isin(models)] chart_df = chart_df[chart_df['task id'].isin(task_ids)] if cat_options: df = chart_df[chart_df[category].isin(cat_options)] else: df = chart_df if baseline: model_str = (', '.join(models) if len(models) > 1 else models[0]) phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs' title = f"Are there any tasks where {phrase} better than {baseline} (by {category})?" else: title = f"What tasks are models {'best' if select_top else 'worst'} at by {category}?" 
if len(models) > 1: chart = alt.Chart(df).mark_bar().encode( alt.X('model:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean"), axis=alt.Axis(labels=False, tickSize=0, title=None)), alt.Y('mean(score):Q', scale=alt.Scale(zero=True)), alt.Color('model:N').legend(), alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom')) ).properties( width=200, height=100, title=title ) else: chart = alt.Chart(df).mark_bar().encode( alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean")), # no title, no label angle), alt.Y('mean(score):Q', scale=alt.Scale(zero=True)), alt.Color(f'{category}:N').legend(None), ).properties( width=200, height=100, title=f"What tasks is model {models[0]} {'best' if select_top else 'worst'} at by {category}?" ) chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle') return chart def get_frequent_patterns(task_plan, scores): find_frequent_patterns(k=10, df=task_plan, scores=scores) def list_directories(path): """List all directories within a given path.""" return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] def update_category(domain, partition): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl" if os.path.exists(data_path): data = pickle.load(open(data_path, 'rb')) categories = list(data.columns) category = gr.Dropdown(categories+["task id"], value=None, label="task metadata", interactive=True) return category else: return gr.Dropdown([], value=None, label="task metadata") def update_category2(domain, partition, existing_category): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl" if os.path.exists(data_path): data = pickle.load(open(data_path, 'rb')) categories = list(data.columns) if existing_category and existing_category in categories: categories.remove(existing_category) category = gr.Dropdown(categories, value=None, 
label="Optional: second task metadata", interactive=True) return category else: return gr.Dropdown([], value=None, label="task metadata") def update_partition(domain): domain = domain2folder[domain] path = f"{BASE_DIR}/{domain}" if os.path.exists(path): partitions = list_directories(path) return gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator") else: return gr.Dropdown([], value=None, label="task space of the following task generator") def update_k(domain, partition, category=None): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if os.path.exists(data_path): data = pd.read_csv(data_path) max_k = len(data[category].unique()) if category and category != "task id" else len(data) mid = max_k // 2 return gr.Slider(1, max_k, mid, step=1.0, label="k") else: return gr.Slider(1, 1, 1, step=1.0, label="k") # def update_category_values(domain, partition, category): # data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" # if os.path.exists(data_path) and category is not None: # data = pd.read_csv(data_path) # uni_cats = list(data[category].unique()) # return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values") # else: # return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values") # def update_category_values(domain, partition, models, rank, k, threshold, baseline, category): # data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" # if not os.path.exists(data_path): # return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values") # else: # merged_df = pd.read_csv(data_path) # merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True) # df = merged_df # select_top = rank == "top" # # Model X is good / bad at # for model in models: # if baseline: # df = df[df[model] >= df[baseline]] # else: # if select_top: # df = 
df[df[model] >= threshold] # else: # df = df[df[model] <= threshold] # if not baseline: # df['mean score'] = df[models].mean(axis=1) # df = df.sort_values(by='mean score', ascending=False) # df = df.iloc[:k, :] if select_top else df.iloc[-k:, :] # uni_cats = list(df[category].unique()) # return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values") def update_tasks(domain, partition, find_pattern): domain = domain2folder[domain] if find_pattern == "yes": k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=True) pattern = gr.Dropdown([], value=None, interactive=True, label="pattern") category1 = gr.Dropdown([], value=None, interactive=False, label="task metadata") return [k1, pattern, category1] else: k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False) pattern = gr.Dropdown([], value=None, interactive=False, label="pattern") data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if os.path.exists(data_path): data = pd.read_csv(data_path) non_columns = MODELS + ['question', 'answer'] categories = [cat for cat in list(data.columns) if cat not in non_columns] category1 = gr.Dropdown(categories, value=categories[0], interactive=True, label="task metadata") else: category1 = gr.Dropdown([], value=None, label="task metadata") return [k1, pattern, category1] def update_pattern(domain, partition, k): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/patterns.pkl" if not os.path.exists(data_path): return gr.Dropdown([], value=None, interactive=False, label="pattern") else: results = pickle.load(open(data_path, 'rb')) patterns = results[0] patterns = [str(p) for p in patterns] print(patterns) return gr.Dropdown(patterns[:k], value=None, interactive=True, label="pattern") def update_threshold(domain, partition, baseline): domain = domain2folder[domain] print(baseline) if baseline: rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=False) k = gr.Slider(1, 
10000, 10, step=1.0, label="k", interactive=False) threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=False) return [rank, k, threshold] else: data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if os.path.exists(data_path): data = pd.read_csv(data_path) max_k = len(data) print(max_k) k = gr.Slider(1, max_k, 10, step=1.0, label="k", interactive=True) else: k = gr.Slider(1, 1, 1, step=1.0, label="k") rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=True) threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True) return [rank, k, threshold] def calc_surprisingness(model, scores, embeddings, k): scores = scores[model].to_numpy() sim = embeddings @ embeddings.T # print("sim values:", sim.shape, sim) indices = np.argsort(-sim)[:, :k] # print("indices:", indices.shape, indices) score_diff = scores[:, None] - scores[indices] # print("score differences:", score_diff.shape, score_diff) sim = sim[np.arange(len(scores))[:, None], indices] # print("top10 sim:", sim.shape, sim) all_surprisingness = score_diff * sim # print("all surprisingness:", all_surprisingness.shape, all_surprisingness) mean_surprisingness = np.mean(score_diff * sim, axis=1) res = {'similarity': sim, 'task index': indices, 'score difference': score_diff, 'all surprisingness': all_surprisingness, 'mean surprisingness': mean_surprisingness } return res def plot_surprisingness(domain, partition, model, rank, k, num_neighbors): domain = domain2folder[domain] # model = model[0] model_str = model.replace("-", "_") # sp_path = f"{BASE_DIR}/{domain}/{partition}/surprise_data.csv" sp_pkl = f"{BASE_DIR}/{domain}/{partition}/{model_str}_surprise.pkl" merged_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if os.path.exists(sp_pkl) and os.path.exists(merged_path): # and not os.path.exists(sp_path) # if os.path.exists(sp_path): # sp_df = pd.read_csv(sp_path) # # res = calc_surprisingness(model, scores, embeds, num_neighbors) # # k = 10 # model = 
'qwenvl' # num_neighbors = 10 # if os.path.exists(sp_pkl): res = pickle.load(open(sp_pkl, 'rb')) total_num_task = res['task index'].shape[0] all_records = [] for i in range(total_num_task): mean_surprisingness = np.mean(res['all surprisingness'][i, :num_neighbors]) for j in range(num_neighbors): neighbor_id = res['task index'][i, j] score_diff = res['score difference'][i, j] surprisingness = res['all surprisingness'][i, j] similarity = res['similarity'][i, j] record = {"task id": i, "neighbor rank": j, "neighbor id": neighbor_id, "score difference": score_diff, "surprisingness": surprisingness, "mean surprisingness": mean_surprisingness, "similarity": similarity } # print(record) all_records.append(record) sp_df = pd.DataFrame.from_records(all_records) sp_df = sp_df.sort_values(by="mean surprisingness", ascending=False) num_rows = k * num_neighbors df = sp_df.iloc[:num_rows, :] if rank == "top" else sp_df.iloc[-num_rows:, :] print(len(df)) df['is target'] = df.apply(lambda row: int(row['task id'] == row['neighbor id']), axis=1) merged_df = pd.read_csv(merged_path) for col in merged_df.columns: df[col] = df.apply(lambda row: merged_df.iloc[int(row['neighbor id']), :][col], axis=1) tooltips = ['neighbor id'] + ['image', 'question', 'answer', model] print(df.head()) pts = alt.selection_point(encodings=['x']) embeds = alt.Chart(df).mark_point(size=30, filled=True).encode( alt.OpacityValue(0.5), alt.X('x:Q', scale=alt.Scale(zero=False)), alt.Y('y:Q', scale=alt.Scale(zero=False)), alt.Color(f'{model}:Q'), #scale=alt.Scale(domain=[1, 0.5, 0], range=['blue', 'white', 'red'], interpolate='rgb') alt.Size("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=[300, 500])), alt.Shape("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=['circle', 'triangle'])), alt.Order("is target:N"), tooltip=tooltips, ).properties( width=400, height=400, title=f"What are the tasks {model} is surprisingly {'good' if rank == 'top' else 'bad'} at compared to 
{num_neighbors} similar tasks?" ).transform_filter( pts ) bar = alt.Chart(df).mark_bar().encode( alt.Y('mean(mean surprisingness):Q'), alt.X('task id:N', sort=alt.EncodingSortField(field='mean surprisingness', order='descending')), color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey")), # ).add_params(pts).properties( width=400, height=200, ) chart = alt.hconcat( bar, embeds ).resolve_legend( color="independent", size="independent" ).configure_title( fontSize=20 ).configure_legend( labelFontSize=10, titleFontSize=10, ) return chart else: print(sp_pkl, merged_path) return None def plot_task_distribution(domain, partition, category): domain = domain2folder[domain] task_plan = pickle.load(open(f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl", "rb")) task_plan.reset_index(inplace=True) col_name = category task_plan_cnt = task_plan.groupby(col_name)['index'].count().reset_index() task_plan_cnt.rename(columns={'index': 'count'}, inplace=True) task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(task_plan) * 100, 2) task_plan_cnt.head() base = alt.Chart(task_plan_cnt).encode( alt.Theta("count:Q").stack(True), alt.Color(f"{col_name}:N").legend(), tooltip=[col_name, 'count', 'frequency (%)'] ) pie = base.mark_arc(outerRadius=120) return pie def plot_all(domain, partition, models, category1, category2, agg): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv" if not os.path.exists(data_path): return None expand_df = pd.read_csv(data_path) chart_df = expand_df[expand_df['model'].isin(models)] if category2: color_val = f'{agg}(score):Q' chart = alt.Chart(chart_df).mark_rect().encode( alt.X(f'{category1}:N', sort=alt.EncodingSortField(field='score', order='ascending', op=agg)), alt.Y(f'{category2}:N', sort=alt.EncodingSortField(field='score', order='descending', op=agg)), # no title, no label angle), alt.Color(color_val), alt.Tooltip('score', aggregate=agg, title=f"{agg} score"), ).properties( 
width=800, height=200, ) else: category = "index" if category1 == "task id" else category1 # cat_options = list(chart_df[category].unique()) # cat_options = cat_options[:5] y_val = f'{agg}(score):Q' df = chart_df # df = chart_df[chart_df[category].isin(cat_options)] if len(models) > 1: chart = alt.Chart(df).mark_bar().encode( alt.X('model:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op=agg), axis=alt.Axis(labels=False, tickSize=0, title=None)), alt.Y(y_val, scale=alt.Scale(zero=True)), alt.Color('model:N').legend(), alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom')) ).properties( width=200, height=100, title=f"How do models perform across {category}?" ) else: chart = alt.Chart(df).mark_bar().encode( alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op=agg)), # no title, no label angle), alt.Y(y_val, scale=alt.Scale(zero=True)), alt.Color(f'{category}:N').legend(None), ).properties( width=200, height=100, title=f"How does {models[0]} perform across {category}?" 
) chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis( labelFontSize=20, titleFontSize=20, ).configure_legend( labelFontSize=15, titleFontSize=15, ) return chart def update_widgets(domain, partition, category, query_type): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv" if not os.path.exists(data_path): print("here?") return [None] * 11 df = pd.read_csv(data_path) max_k = len(df[category].unique()) if category and category != "task id" else len(df) widgets = [] if query_type == "top k": # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True) rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True) k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", interactive=True, visible=True) model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)'", multiselect=True, interactive=True, visible=True) # model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate", interactive=True, visible=True) model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True) baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False) direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False) threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False) baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False) md1 = gr.Markdown(r"

ranked by the

") md2 = gr.Markdown(r"

accuracy

") md3 = gr.Markdown(r"") elif query_type == "threshold": # aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task aggregate", interactive=True, visible=True) # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True) model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)'", multiselect=True, interactive=True, visible=True) direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True) threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True) # model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate", interactive=True, visible=True) model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True) rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False) k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False) baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False) baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False) md1 = gr.Markdown(r"

where the

") md2 = gr.Markdown(r"

accuracy is

") md3 = gr.Markdown(r"") elif query_type == "model comparison": model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)' accuracy", multiselect=True, interactive=True, visible=True) baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)' accuracy", multiselect=True, interactive=True, visible=True) direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True) threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True) model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True) # baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over baselines)", interactive=True, visible=True) baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True) # aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task aggregate", interactive=True, visible=False) rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False) k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False) md1 = gr.Markdown(r"

where the difference between the

") md2 = gr.Markdown(r"

is

") md3 = gr.Markdown(r"

and the

") elif query_type == "model debugging": model = gr.Dropdown(MODELS, value=MODELS[0], label="model's", multiselect=False, interactive=True, visible=True) # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", visible=False) baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False) direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False) threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False) rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False) k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False) model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over models)", visible=False) baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False) md1 = gr.Markdown(r"

where

") md2 = gr.Markdown(r"

mean accuracy is below its overall mean accuracy by one standard deviation

") md3 = gr.Markdown(r"") else: widgets = [None] * 11 widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3] return widgets def select_tasks(domain, partition, category, query_type, task_agg, models, model_agg, rank, k, direction, threshold, baselines, baseline_agg): domain = domain2folder[domain] data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv" merged_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" if not os.path.exists(data_path) or not os.path.exists(merged_path): return gr.DataFrame(None) df = pd.read_csv(data_path) merged_df = pd.read_csv(merged_path) task_plan = pickle.load(open(f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl", 'rb')) task_plan.reset_index(inplace=True) if not category or category == "task id": category = 'index' if query_type == "top k": df = df[df['model'].isin(models)] df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index() df = df.groupby([category])['score'].agg(model_agg).reset_index() df = df.sort_values(by='score', ascending=False) if rank == "bottom": df = df.iloc[-k:, :] else: df = df.iloc[:k, :] elif query_type == "threshold": df = df[df['model'].isin(models)] df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index() df = df.groupby([category])['score'].agg(model_agg).reset_index() if direction == "below": df = df[df['score'] <= threshold] else: df = df[df['score'] >= threshold] elif query_type == "model comparison": # df = merged_df # df.reset_index(inplace=True) # df = df.groupby([category])[[model, baseline]].agg(task_agg).reset_index() # df = df[(df[model] - df[baseline] > threshold)] df_baseline = deepcopy(df) df = df[df['model'].isin(models)] df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index() df = df.groupby([category])['score'].agg(model_agg).reset_index() model_str = ', '.join(models) exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str df = 
df.sort_values(by=category) df_baseline = df_baseline[df_baseline['model'].isin(baselines)] df_baseline = df_baseline.groupby([category, 'model'])['score'].agg(task_agg).reset_index() df_baseline = df_baseline.groupby([category])['score'].agg(baseline_agg).reset_index() model_str = ', '.join(baselines) baseline_score_id = f'{baseline_agg}({model_str})' if len(baselines) > 1 else model_str df_baseline = df_baseline.sort_values(by=category) df.rename(columns={'score': exp_score_id}, inplace=True) df_baseline.rename(columns={'score': baseline_score_id}, inplace=True) df = pd.merge(df, df_baseline, on=category) df = df[(df[exp_score_id] - df[baseline_score_id] > threshold)] elif query_type == "model debugging": model = models print(models) avg_acc = merged_df[model].mean() std = merged_df[model].std() t = avg_acc - std df = df[df['model'] == model] df = df.groupby(['model', category])['score'].agg(task_agg).reset_index() df = df[df['score'] < t] df['mean'] = round(avg_acc, 4) df['std'] = round(std, 4) print(df.head()) if category == 'index': task_attrs = list(df[category]) selected_tasks = task_plan[task_plan[category].isin(task_attrs)] if len(selected_tasks) == 0: return gr.DataFrame(None, label="There is no such task.") if query_type == "model comparison" and (models and baselines): # selected_tasks[model] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][model].values[0], axis=1) # selected_tasks[baseline] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][baseline].values[0], axis=1) selected_tasks[exp_score_id] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][exp_score_id].values[0], axis=1) selected_tasks[baseline_score_id] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][baseline_score_id].values[0], axis=1) else: selected_tasks['score'] = selected_tasks.apply(lambda row: df[df['index'] == row['index']]['score'].values[0], axis=1) print(selected_tasks.head()) return gr.DataFrame(selected_tasks, 
label=f"There are {len(selected_tasks)} (out of {len(task_plan)}) tasks in total.") else: if len(df) == 0: return gr.DataFrame(None, label=f"There is no such {category}.") else: return gr.DataFrame(df, label=f"The total number of such {category} is {len(df)}.") def find_patterns(selected_tasks, num_patterns, models, baselines, model_agg, baseline_agg): if len(selected_tasks) == 0: return gr.DataFrame(None) print(selected_tasks.head()) if 'score' in selected_tasks: scores = selected_tasks['score'] # elif model in selected_tasks: # scores = selected_tasks[model] else: scores = None print(scores) model_str = ', '.join(models) exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str if baselines: baseline_str = ', '.join(baselines) baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str tasks_only = selected_tasks all_score_cols = ['score', exp_score_id] if baselines: all_score_cols += [baseline_score_id] for name in all_score_cols: if name in selected_tasks: tasks_only = tasks_only.drop(name, axis=1) results = find_frequent_patterns(k=num_patterns, df=tasks_only, scores=scores) records = [] if scores is not None: patterns, scores = results[0], results[1] for pattern, score in zip(patterns, scores): pattern_str = "" for t in pattern[1]: col_name, col_val = t pattern_str += f"{col_name} = {col_val}, " record = {'pattern': pattern_str[:-2], 'count': pattern[0], 'score': score} #{model} records.append(record) else: patterns = results for pattern in patterns: pattern_str = "" for t in pattern[1]: col_name, col_val = t pattern_str += f"{col_name} = {col_val}, " record = {'pattern': pattern_str[:-2], 'count': pattern[0]} records.append(record) df = pd.DataFrame.from_records(records) return gr.DataFrame(df) def visualize_task_distribution(selected_tasks, col_name, model1, model2): if not col_name: return None task_plan_cnt = selected_tasks.groupby(col_name)['index'].count().reset_index() 
def visualize_task_distribution(selected_tasks, col_name, model1, model2):
    """Draw a pie chart of how the selected tasks are distributed over one column.

    Args:
        selected_tasks: DataFrame holding one row per selected task.
        col_name: column whose values define the pie slices.
        model1: unused; accepted so the signature matches the UI wiring.
        model2: unused; accepted so the signature matches the UI wiring.

    Returns:
        An Altair arc chart with count and frequency tooltips, or None when no
        column is chosen, the table is empty, or the column is not in the table.
    """
    if not col_name or selected_tasks is None:
        return None
    if len(selected_tasks) == 0 or col_name not in selected_tasks:
        return None

    # size() counts rows per group without needing an 'index' column and, unlike
    # groupby(col)['index'].count().reset_index(), also works when col_name is
    # 'index' itself (that form fails with "cannot insert index, already exists").
    task_plan_cnt = selected_tasks.groupby(col_name).size().reset_index(name='count')
    task_plan_cnt['frequency (%)'] = (task_plan_cnt['count'] / len(selected_tasks) * 100).round(2)

    tooltips = [col_name, 'count', 'frequency (%)']
    base = alt.Chart(task_plan_cnt).encode(
        alt.Theta("count:Q").stack(True),
        alt.Color(f"{col_name}:N").legend(),
        tooltip=tooltips
    )
    pie = base.mark_arc(outerRadius=120)
    return pie
def plot_performance_for_selected_tasks(domain, partition, df, query_type, models, baselines, select_category,
                                        vis_category, task_agg, model_agg, baseline_agg, rank, direction,
                                        threshold):
    """Plot model accuracy for the tasks / task metadata values found by a query.

    Args:
        domain: scenario name; translated to a folder through `domain2folder`.
        partition: sub-folder of the scenario that holds the CSV files.
        df: DataFrame produced by the query (selected tasks or metadata values).
        query_type: "top k", "threshold", "model debugging" or "model comparison".
        models: model name (model debugging) or list of model names.
        baselines: baseline name or list of names (model comparison only).
        select_category: what the query selected; "task id" means whole tasks.
        vis_category: metadata column the bars are grouped by.
        task_agg: ignored; per-task scores are always aggregated with the mean.
        model_agg: aggregate label used in the model score column and titles.
        baseline_agg: aggregate label used in the baseline score column.
        rank: 'top' or 'bottom'; only used in the "top k" title.
        direction: 'above' or 'below'; only used in the "threshold" title.
        threshold: threshold value; only used in the "threshold" title.

    Returns:
        A configured Altair bar chart, or None when the data files are missing,
        the query result is empty, or the inputs needed for the plot are unset.
    """
    domain = domain2folder[domain]
    # The caller-supplied task_agg is overridden here.
    task_agg = "mean"
    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
    merged_data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
    if not os.path.exists(data_path) or not os.path.exists(merged_data_path):
        return None
    if df is None or len(df) == 0:
        return None
    if not vis_category:
        # Every branch below indexes the data by vis_category.
        return None

    select_tasks = select_category == "task id"
    # Tasks are aggregated by Altair; metadata rows already carry an aggregated score.
    y_val = f'{task_agg}(score):Q' if select_tasks else 'score:Q'
    if select_tasks:
        select_category = "index"

    if query_type == "model comparison":
        models = [models] if isinstance(models, str) else list(models or [])
        baselines = [baselines] if isinstance(baselines, str) else list(baselines or [])
        if not models or not baselines:
            return None

        # Re-shape to long format: one row per (category value, model group).
        model_str = ', '.join(models)
        exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
        baseline_str = ', '.join(baselines)
        baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str
        df = df.melt(id_vars=[select_category], value_vars=[exp_score_id, baseline_score_id])
        df = df.rename(columns={'variable': 'model', 'value': 'score'})

        if select_tasks:
            # melt dropped the metadata columns; fetch vis_category back by task
            # id, matching ids against the row labels of merged_data.csv.
            merged_df = pd.read_csv(merged_data_path)
            df[vis_category] = df['index'].map(merged_df[vis_category])

        num_columns = df['model'].nunique(dropna=False) * df[vis_category].nunique(dropna=False)
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order='descending', op=task_agg),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y(y_val, scale=alt.Scale(zero=True), title="accuracy"),
            alt.Color('model:N').legend(),
            alt.Column(f'{vis_category}:N',
                       header=alt.Header(titleOrient='bottom', labelOrient='bottom',
                                         labelFontSize=20, titleFontSize=20))
        ).properties(
            width=num_columns * 30,
            height=200,
            title=f"How do models perform by {vis_category}?"
        )
    else:
        if query_type == "model debugging":
            # This query type works on a single model.
            model_name = models if isinstance(models, str) else ', '.join(models or [])
            y_title = "accuracy"
            plot_title = f"{model_name} performs worse than its (mean - std) on these {vis_category}s"
            models = [models] if isinstance(models, str) else list(models or [])
        else:
            models = [models] if isinstance(models, str) else list(models or [])
            model_str = ', '.join(models)
            y_title = f"{model_agg} accuracy" if len(models) > 0 else "accuracy"
            if select_category == "index":
                suffix = f"on these tasks (by {vis_category})"
            else:
                suffix = f"on these {vis_category}s"
            if query_type == "top k":
                extreme = 'highest' if rank == 'top' else 'lowest'
                plot_title = f"The {model_agg} accuracy of {model_str} is the {extreme} " + suffix
            elif query_type == "threshold":
                plot_title = f"The {model_agg} accuracy of {model_str} is {direction} {threshold} " + suffix
            else:
                # Keep the title defined for an unexpected query type.
                plot_title = f"The {model_agg} accuracy of {model_str} " + suffix

        if select_tasks:
            # Plot the raw per-model rows of the selected tasks.
            expand_df = pd.read_csv(data_path)
            task_ids = list(df['index'].unique())
            df = expand_df[(expand_df['model'].isin(models)) & (expand_df['task id'].isin(task_ids))]

        num_columns = df[vis_category].nunique(dropna=False)
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{vis_category}:N',
                  sort=alt.EncodingSortField(field='score', order='ascending', op=task_agg),
                  axis=alt.Axis(labelAngle=-20)),
            alt.Y(y_val, scale=alt.Scale(zero=True), title=y_title),
            alt.Color(f'{vis_category}:N').legend(None),
        ).properties(
            width=num_columns * 30,
            height=200,
            title=plot_title
        )

    chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
        labelFontSize=20,
        titleFontSize=20,
    ).configure_legend(
        labelFontSize=20,
        titleFontSize=20,
        labelLimit=200,
    )
    return chart
def _task_plan_columns(domain_folder, partition):
    """Return the column names of the pickled task plan, or None when the file is missing."""
    data_path = f"{BASE_DIR}/{domain_folder}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return None
    # NOTE(review): pickle can execute arbitrary code on load; this is only safe
    # as long as the files under BASE_DIR are trusted.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    return list(data.columns)


def sync_vis_category(domain, partition, category):
    """Keep the two "by task metadata" dropdowns consistent with the query category.

    When a concrete metadata category is selected, both dropdowns are pinned to
    it and made read-only. Otherwise (no category, or "task id") they offer
    every column of the partition's task plan.

    Args:
        domain: scenario name; translated to a folder through `domain2folder`.
        partition: sub-folder of the scenario that holds `task_plan.pkl`.
        category: currently selected query category, possibly None.

    Returns:
        A list of two `gr.Dropdown` updates, or `[None, None]` when the task
        plan file does not exist.
    """
    domain = domain2folder[domain]
    if category and category != "task id":
        return [
            gr.Dropdown([category], value=category, label="by task metadata", interactive=False)
            for _ in range(2)
        ]

    categories = _task_plan_columns(domain, partition)
    if categories is None:
        return [None, None]
    # A task plan without columns yields an empty dropdown rather than an IndexError.
    default = categories[0] if categories else None
    return [
        gr.Dropdown(categories, value=default, label="by task metadata", interactive=True)
        for _ in range(2)
    ]


def hide_fpm_and_dist_components(domain, partition, category):
    """Show or hide the pattern-mining and task-distribution widgets.

    The widgets are hidden when a concrete metadata category is selected and
    shown (interactive) when the query selects whole tasks ("task id") or no
    category is chosen.

    Args:
        domain: scenario name; translated to a folder through `domain2folder`.
        partition: sub-folder of the scenario that holds `task_plan.pkl`.
        category: currently selected query category, possibly None.

    Returns:
        `[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]`
        component updates, in that order.
    """
    domain = domain2folder[domain]
    if category and category != "task id":
        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", visible=False)
        btn_pattern = gr.Button(value="Find patterns among tasks", visible=False)
        table = gr.DataFrame({}, height=250, visible=False)
        dist_chart = Plot(visible=False)
        col_name = gr.Dropdown([], value=None, label="by task metadata", visible=False)
        btn_dist = gr.Button(value="Visualize task distribution", visible=False)
    else:
        categories = _task_plan_columns(domain, partition)
        if categories is not None:
            default = categories[0] if categories else None
            col_name = gr.Dropdown(categories, value=default, label="by task metadata",
                                   interactive=True, visible=True)
        else:
            col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True, visible=True)
        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns",
                                 interactive=True, visible=True)
        btn_pattern = gr.Button(value="Find patterns among tasks", interactive=True, visible=True)
        table = gr.DataFrame({}, height=250, interactive=True, visible=True)
        dist_chart = Plot(visible=True)
        btn_dist = gr.Button(value="Visualize task distribution", interactive=True, visible=True)
    return [num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]
# ---------------------------------------------------------------------------
# UI definition: theme, layout and event wiring of the TaskVerse-UI dashboard.
# NOTE(review): the grouping of widgets into rows below follows the statement
# order of the script; confirm it matches the intended layout.
# ---------------------------------------------------------------------------
theme = gr.Theme.from_hub('sudeepshouche/minimalist')
theme.font = [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"]
theme.text_size = gr.themes.sizes.text_lg

demo = gr.Blocks(theme=theme, title="TaskVerse-UI")

with demo:
    # Header: title centred between two empty columns.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                r""
            )
        with gr.Column(scale=1):
            gr.Markdown(
                r"""

Welcome to TaskVerse-UI!

"""
            )
        with gr.Column(scale=1):
            gr.Markdown(
                r""
            )

    with gr.Tab("📊 Overview"):
        gr.Markdown(
            r"""

📊 Visualize the overall task distribution and model performance

"""
        )

        with gr.Row():
            domain = gr.Radio(domains, label="scenario", scale=2)
            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)

        gr.Markdown(
            r"""

Overall task metadata distribution

"""
        )
        with gr.Row():
            category = gr.Dropdown([], value=None, label="task metadata")
            partition.change(fn=update_category, inputs=[domain, partition], outputs=category)

        with gr.Row():
            output = Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_task_distribution, [domain, partition, category], output)

        gr.Markdown(
            r"""

Models' overall performance by task metadata

"""
        )

        with gr.Row():
            with gr.Column(scale=2):
                models = gr.CheckboxGroup(MODELS, label="model(s)", value=MODELS)
            with gr.Column(scale=1):
                aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean",
                                     label="aggregate models' accuracy by")

        with gr.Row():
            category1 = gr.Dropdown([], value=None, label="task metadata", interactive=True)
            category2 = gr.Dropdown([], value=None, label="Optional: second task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=category1)
            category1.change(fn=update_category2, inputs=[domain, partition, category1], outputs=category2)

        # Choosing a scenario refreshes both the partition list and the model choices.
        domain.change(fn=update_partition_and_models, inputs=domain, outputs=[partition, models])

        with gr.Row():
            output = Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_all, [domain, partition, models, category1, category2, aggregate], output)

    with gr.Tab("✨ Embedding"):
        gr.Markdown(
            r"""

✨ Visualize the tasks' embeddings in the 2D space

"""
        )
        with gr.Row():
            domain2 = gr.Radio(domains, label="scenario", scale=2)
            partition2 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
            category2 = gr.Dropdown([], value=None, label="colored by task metadata", scale=1)
            domain2.change(fn=update_partition, inputs=domain2, outputs=partition2)
            partition2.change(fn=update_category, inputs=[domain2, partition2], outputs=category2)

        with gr.Row():
            output2 = Plot()
        with gr.Row():
            btn = gr.Button(value="Run")
            btn.click(plot_embedding, [domain2, partition2, category2], output2)

    with gr.Tab("❓ Query"):
        gr.Markdown(
            r"""

❓ Find out the answers to your queries by finding and visualizing the relevant tasks and models' performance

"""
        )
        with gr.Row(equal_height=True):
            domain = gr.Radio(domains, label="scenario", scale=2)
            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)

        with gr.Row():
            query1 = "top k"
            query2 = "threshold"
            query3 = "model debugging"
            query4 = "model comparison"
            query_type = gr.Radio([query1, query2, query3, query4], value="top k", label=r"query type")

        with gr.Row():
            with gr.Accordion("See more details about the query type"):
                gr.Markdown(
                    r""
                )

        with gr.Row():
            gr.Markdown(r"""

Help me find the

""")
        with gr.Row(equal_height=True):
            rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
            k = gr.Slider(1, 10, 5 // 2, step=1.0, label="k", interactive=True, visible=True)
            category = gr.Dropdown([], value=None, label="tasks / task metadata", interactive=True)

        with gr.Row():
            md1 = gr.Markdown(r"""

ranked by the

""")
        with gr.Row(equal_height=True):
            model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ",
                                          interactive=True, visible=True, scale=1)
            model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)", multiselect=True,
                                interactive=True, visible=True, scale=2)

        with gr.Row():
            md3 = gr.Markdown(r"")
        with gr.Row(equal_height=True):
            baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ",
                                             interactive=True, visible=False, scale=1)
            baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)'", visible=False, scale=2)

        with gr.Row():
            md2 = gr.Markdown(r"""

accuracy

""")
        with gr.Row():
            direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
            threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)

        # Widgets whose state depends on the selected query type.
        widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate,
                   md1, md2, md3]
        partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
        query_type.change(update_widgets, [domain, partition, category, query_type], widgets)
        domain.change(fn=update_partition_and_models_and_baselines, inputs=domain,
                      outputs=[partition, model, baseline])

        with gr.Row():
            df = gr.DataFrame({}, height=200)
        btn = gr.Button(value="Find tasks / task metadata")
        # `aggregate` is the radio defined on the Overview tab.
        btn.click(select_tasks,
                  [domain, partition, category, query_type, aggregate, model, model_aggregate, rank, k,
                   direction, threshold, baseline, baseline_aggregate],
                  df)

        with gr.Row():
            plot = Plot()
        with gr.Row():
            col_name2 = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name2)
            btn_plot = gr.Button(value="Plot model performance", interactive=True)
            btn_plot.click(plot_performance_for_selected_tasks,
                           [domain, partition, df, query_type, model, baseline, category, col_name2,
                            aggregate, model_aggregate, baseline_aggregate, rank, direction, threshold],
                           plot)

        with gr.Row():
            dist_chart = Plot()
        with gr.Row():
            col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name)
            btn_dist = gr.Button(value="Visualize task distribution", interactive=True)
            btn_dist.click(visualize_task_distribution, [df, col_name, model, baseline], dist_chart)

        with gr.Row():
            table = gr.DataFrame({}, height=250)
        with gr.Row():
            num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns")
            btn_pattern = gr.Button(value="Find patterns among tasks")
            # find_patterns takes six arguments; the two aggregate dropdowns
            # supply model_agg and baseline_agg.
            btn_pattern.click(find_patterns,
                              [df, num_patterns, model, baseline, model_aggregate, baseline_aggregate],
                              table)

        # The category choice drives widget visibility, the plot grouping and the range of k.
        category.change(fn=hide_fpm_and_dist_components, inputs=[domain, partition, category],
                        outputs=[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart])
        category.change(fn=sync_vis_category, inputs=[domain, partition, category],
                        outputs=[col_name, col_name2])
        category.change(fn=update_k, inputs=[domain, partition, category], outputs=k)

    with gr.Tab("😮 Surprisingness"):
        gr.Markdown(r"""

😮 Find out the tasks a model is surprisingly good or bad at compared to similar tasks

""")
        with gr.Row():
            domain3 = gr.Radio(domains, label="scenario", scale=2)
            partition3 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
        with gr.Row():
            model3 = gr.Dropdown(MODELS, value=MODELS[0], label="model", interactive=True, visible=True)
            k3 = gr.Slider(1, 100, 50, step=1.0, label="number of surprising tasks", interactive=True)
            num_neighbors = gr.Slider(1, 100, 50, step=1.0, label="number of neighbors", interactive=True)
            rank3 = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
            domain3.change(fn=update_partition_and_models, inputs=domain3, outputs=[partition3, model3])

        with gr.Row():
            output3 = Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_surprisingness, [domain3, partition3, model3, rank3, k3, num_neighbors], output3)

# if __name__ == "__main__":
demo.launch(share=True)