# Spaces: Running
import os
import pickle
from copy import deepcopy

import altair as alt
import gradio as gr
import numpy as np
import pandas as pd

from dynabench.task_evaluator import *

alt.data_transformers.enable("vegafusion")
# Root folder of the on-disk database; the functions below read
# f"{BASE_DIR}/<domain folder>/<partition>/..." from it.
BASE_DIR = "../db"

# Model names offered in the dropdowns for non-video domains.
MODELS = [
    'qwenvl-chat',
    'qwenvl',
    'llava15-7b',
    'llava15-13b',
    'instructblip-vicuna13b',
    'instructblip-vicuna7b',
]

# Model names offered in the dropdowns for domains whose folder name contains "video".
VIDEO_MODELS = [
    'video-chat2-7b',
    'video-llama2-7b',
    'video-llama2-13b',
    'chat-univi-7b',
    'chat-univi-13b',
    'video-llava-7b',
    'video-chatgpt-7b',
]

# Domain names as shown to the user.
domains = [
    "imageqa-2d-sticker",
    "imageqa-3d-tabletop",
    "imageqa-scene-graph",
    "videoqa-3d-tabletop",
    "videoqa-scene-graph",
]

# Maps a domain name to its sub-folder under BASE_DIR.
# A missing selection (None) falls back to the "2d" folder.
domain2folder = {
    "imageqa-2d-sticker": "2d",
    "imageqa-3d-tabletop": "3d",
    "imageqa-scene-graph": "sg",
    "videoqa-3d-tabletop": "video-3d",
    "videoqa-scene-graph": "video-sg",
    None: '2d',
}
def update_partition_and_models(domain):
    """Build the partition and model dropdowns for the selected domain.

    Args:
        domain: A key of ``domain2folder`` (a domain name, or ``None``).

    Returns:
        ``[partition_dropdown, model_dropdown]``. Both dropdowns are empty
        when the domain folder does not exist under ``BASE_DIR``.
    """
    folder = domain2folder[domain]
    path = f"{BASE_DIR}/{folder}"
    partition_label = "task space of the following task generator"

    if not os.path.exists(path):
        return [
            gr.Dropdown([], value=None, label=partition_label),
            gr.Dropdown([], value=None, label="model"),
        ]

    partitions = list_directories(path)
    # Video domains are evaluated with a different set of models.
    model_choices = VIDEO_MODELS if "video" in folder else MODELS
    model = gr.Dropdown(model_choices, value=model_choices[0], label="model")
    # The folder may exist but hold no partition yet; do not index into an
    # empty list in that case.
    default_partition = partitions[0] if partitions else None
    partition = gr.Dropdown(partitions, value=default_partition, label=partition_label)
    return [partition, model]
def update_partition_and_models_and_baselines(domain):
    """Build the partition, model and baseline dropdowns for the selected domain.

    Args:
        domain: A key of ``domain2folder`` (a domain name, or ``None``).

    Returns:
        ``[partition_dropdown, model_dropdown, baseline_dropdown]``. All three
        dropdowns are empty when the domain folder does not exist under
        ``BASE_DIR``.
    """
    folder = domain2folder[domain]
    path = f"{BASE_DIR}/{folder}"
    partition_label = "task space of the following task generator"

    if not os.path.exists(path):
        return [
            gr.Dropdown([], value=None, label=partition_label),
            gr.Dropdown([], value=None, label="model"),
            gr.Dropdown([], value=None, label="baseline"),
        ]

    partitions = list_directories(path)
    # Video domains are evaluated with a different set of models.
    model_choices = VIDEO_MODELS if "video" in folder else MODELS
    model = gr.Dropdown(model_choices, value=model_choices[0], label="model")
    baseline = gr.Dropdown(model_choices, value=model_choices[0], label="baseline")
    # The folder may exist but hold no partition yet; do not index into an
    # empty list in that case.
    default_partition = partitions[0] if partitions else None
    partition = gr.Dropdown(partitions, value=default_partition, label=partition_label)
    return [partition, model, baseline]
def get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline):
    """Return the row labels of ``merged_data.csv`` that pass the model filter.

    With a ``baseline``, a row is kept when every model in ``models`` scores at
    least as high as the baseline on it. Without one, rows are first filtered
    by ``threshold`` (``>=`` for rank "top", ``<=`` otherwise), then sorted by
    the mean score over ``models`` and cut to the top / bottom ``k`` rows.

    Args:
        domain: A key of ``domain2folder``; an already-mapped folder name is
            accepted as well and used as-is.
        partition: Partition sub-folder name.
        models: List of column names in ``merged_data.csv`` to filter on.
        rank: "top" selects the best rows; any other value selects the worst.
        k: Number of rows to keep when no baseline is given.
        threshold: Score threshold used when no baseline is given.
        baseline: Column name to compare against, or a falsy value.

    Returns:
        List of index labels of the selected rows; ``[]`` when the CSV is missing.
    """
    # Callers that already translated the domain (folder names are not keys of
    # domain2folder) must not fail with a KeyError on a second lookup.
    folder = domain2folder.get(domain, domain)
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not os.path.exists(data_path):
        return []

    merged_df = pd.read_csv(data_path)
    # Older exports use the legacy LLaVA column names.
    merged_df = merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'})

    df = merged_df
    select_top = rank == "top"
    for model in models:
        if baseline:
            df = df[df[model] >= df[baseline]]
        elif select_top:
            df = df[df[model] >= threshold]
        else:
            df = df[df[model] <= threshold]

    if not baseline:
        # assign() returns a new frame, so no column is written into a
        # filtered slice of merged_df (chained-assignment warning).
        df = df.assign(**{'mean score': df[models].mean(axis=1)})
        df = df.sort_values(by='mean score', ascending=False)
        k = int(k)  # slider values may arrive as floats; iloc needs an int
        df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]

    return list(df.index)
def plot_patterns(domain, partition, models, rank, k, threshold, baseline, pattern, order):
    """Bar chart of mean model scores on the filtered tasks that match a pattern.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        models: List of model names to plot.
        rank, k, threshold, baseline: Forwarded to ``get_filtered_task_ids``.
        pattern: String form of a ``(freq, cols)`` pair, where ``cols`` is an
            iterable of ``(column name, value)`` pairs.
        order: Sort order of the model axis ("ascending" / "descending").

    Returns:
        An Altair chart, or ``None`` when the data file is missing or no
        pattern is selected.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path) or not pattern:
        return None

    # Pass the ORIGINAL domain key: get_filtered_task_ids performs its own
    # domain2folder lookup, and the mapped folder name is not a key of that dict.
    task_ids = get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline)

    expand_df = pd.read_csv(data_path)
    shown_models = (models + [baseline]) if baseline else models
    chart_df = expand_df[expand_df['model'].isin(shown_models)]
    chart_df = chart_df[chart_df['task id'].isin(task_ids)]

    # SECURITY NOTE(review): eval() executes arbitrary code. `pattern` comes
    # from a UI dropdown; confirm it can only ever hold values produced by
    # this app, or switch to a safe parser.
    freq, cols = eval(pattern)

    df = chart_df
    conditions = []
    for col_name, col_val in cols:
        try:
            col_val = int(col_val)
        except (TypeError, ValueError, OverflowError):
            pass  # non-numeric value: compare as-is
        df = df[df[col_name] == col_val]
        conditions.append(f"{col_name} = {col_val}")
    pattern_str = ", ".join(conditions)

    if baseline:
        model_str = ', '.join(models)
        phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs'
        title = f"{phrase} better than {baseline} on {freq} tasks where {pattern_str}"
    else:
        title = f"Models are {'best' if rank == 'top' else 'worst'} at {freq} tasks where {pattern_str}"

    chart = alt.Chart(df).mark_bar().encode(
        alt.X('model:N',
              sort=alt.EncodingSortField(field='score', order=order, op="mean"),
              axis=alt.Axis(labels=False, tickSize=0)),
        alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
        alt.Color('model:N').legend(),
    ).properties(
        width=400,
        height=300,
        title=title
    )
    return chart
def plot_embedding(domain, partition, category):
    """Scatter plot of the ``x`` / ``y`` columns of ``merged_data.csv``.

    Points are coloured by ``category``; the tooltip shows the question, the
    answer and, when the CSV has an ``image`` column, the image.

    Returns:
        An interactive Altair chart, or ``None`` when the CSV does not exist.
    """
    folder = domain2folder[domain]
    csv_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not os.path.exists(csv_path):
        return None

    tasks = pd.read_csv(csv_path)

    hover_fields = ['question', 'answer']
    if 'image' in tasks:
        hover_fields = hover_fields + ['image']

    points = alt.Chart(tasks).mark_point(size=30, filled=True)
    scatter = points.encode(
        alt.OpacityValue(0.5),
        alt.X('x:Q'),
        alt.Y('y:Q'),
        alt.Color(f'{category}:N'),
        tooltip=hover_fields,
    )
    sized = scatter.properties(
        width=800,
        height=800,
        title="UMAP Projected Task Embeddings",
    )
    styled = sized.configure_axis(labelFontSize=25, titleFontSize=25)
    styled = styled.configure_title(fontSize=40)
    styled = styled.configure_legend(labelFontSize=25, titleFontSize=25)
    return styled.interactive()
def plot_multi_models(domain, partition, category, cat_options, models, order, pattern, aggregate="mean"):
    """Bar chart of mean model scores, either for a pattern or across a category.

    When ``pattern`` is given, every model in ``expanded_data.csv`` is plotted
    on the rows matching the pattern. Otherwise the rows are restricted to
    ``models`` and ``cat_options`` and plotted per ``category`` value.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        category: Column of ``expanded_data.csv`` to group by.
        cat_options: Values of ``category`` to keep.
        models: List of model names to keep.
        order: Sort order of the x axis ("ascending" / "descending").
        pattern: ``None``, or the string form of a ``(freq, cols)`` pair where
            ``cols`` is an iterable of ``(column name, value)`` pairs.
        aggregate: Currently unused; kept for interface compatibility.

    Returns:
        An Altair chart, or ``None`` when the data file is missing or no model
        is selected in category mode.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path):
        return None
    expand_df = pd.read_csv(data_path)

    if pattern is not None:
        # SECURITY NOTE(review): eval() executes arbitrary code. `pattern`
        # comes from a UI dropdown; confirm it can only hold values produced
        # by this app, or switch to a safe parser.
        freq, cols = eval(pattern)
        df = expand_df
        conditions = []
        for col_name, col_val in cols:
            try:
                col_val = int(col_val)
            except (TypeError, ValueError, OverflowError):
                pass  # non-numeric value: compare as-is
            df = df[df[col_name] == col_val]
            conditions.append(f"{col_name} = {col_val}")
        pattern_str = ", ".join(conditions)
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order='ascending', op="mean"),
                  axis=alt.Axis(labels=False, tickSize=0)),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color('model:N').legend(),
        ).properties(
            width=200,
            height=100,
            title=f"How do models perform on tasks where {pattern_str} (N={freq})?"
        )
        return chart

    if not models:
        # Nothing selected: there is no model to put in the chart or its title.
        return None

    df = expand_df[(expand_df['model'].isin(models)) & (expand_df[category].isin(cat_options))]
    if len(models) > 1:
        # Several models: one small bar group per category value.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order=order, op="mean"),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color('model:N').legend(),
            alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
        ).properties(
            width=200,
            height=100,
            title=f"How do models perform across {category}?"
        )
    else:
        # Single model: category values go on the x axis.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{category}:N', sort=alt.EncodingSortField(field='score', order=order, op="mean")),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color(f'{category}:N').legend(None),
        ).properties(
            width=200,
            height=100,
            title=f"How does {models[0]} perform across {category}?"
        )
    chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
    return chart
def plot(domain, partition, models, rank, k, threshold, baseline, order, category, cat_options):
    """Bar chart of mean scores per ``category`` on the filtered tasks.

    Tasks are selected from ``merged_data.csv``: with a ``baseline``, tasks on
    which every model in ``models`` scores at least as high as the baseline;
    otherwise tasks passing ``threshold``, ranked by mean score and cut to the
    top / bottom ``k``. The scores of the selected tasks are then read from
    ``expanded_data.csv`` and plotted.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        models: List of model names. Not modified by this function.
        rank: "top" selects the best tasks; any other value the worst.
        k: Number of tasks to keep when no baseline is given.
        threshold: Score threshold used when no baseline is given.
        baseline: Model name to compare against, or a falsy value.
        order: Sort order of the x axis ("ascending" / "descending").
        category: Column of ``expanded_data.csv`` to group by.
        cat_options: Values of ``category`` to keep; falsy keeps all.

    Returns:
        An Altair chart, or ``None`` when a data file is missing or there is
        no model to plot.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    expand_data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path) or not os.path.exists(expand_data_path):
        return None

    merged_df = pd.read_csv(data_path)
    # Older exports use the legacy LLaVA column names.
    merged_df = merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'})
    expand_df = pd.read_csv(expand_data_path)

    df = merged_df
    select_top = rank == "top"
    for model in models:
        if baseline:
            df = df[df[model] >= df[baseline]]
        elif select_top:
            df = df[df[model] >= threshold]
        else:
            df = df[df[model] <= threshold]
    if not baseline:
        # assign() avoids writing a column into a filtered slice.
        df = df.assign(**{'mean score': df[models].mean(axis=1)})
        df = df.sort_values(by='mean score', ascending=False)
        k = int(k)  # slider values may arrive as floats; iloc needs an int
        df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
    task_ids = list(df.index)

    # Build a NEW list rather than `models += [baseline]`: the in-place form
    # mutates the caller's list and makes the title read
    # "A, B perform better than B".
    chart_models = (models + [baseline]) if baseline else list(models)
    if not chart_models:
        return None

    chart_df = expand_df[expand_df['model'].isin(chart_models)]
    chart_df = chart_df[chart_df['task id'].isin(task_ids)]
    df = chart_df[chart_df[category].isin(cat_options)] if cat_options else chart_df

    if baseline:
        model_str = ', '.join(models)
        phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs'
        title = f"Are there any tasks where {phrase} better than {baseline} (by {category})?"
    else:
        title = f"What tasks are models {'best' if select_top else 'worst'} at by {category}?"

    if len(chart_models) > 1:
        # Several models: one small bar group per category value.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order=order, op="mean"),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color('model:N').legend(),
            alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
        ).properties(
            width=200,
            height=100,
            title=title
        )
    else:
        # Single model: category values go on the x axis.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{category}:N', sort=alt.EncodingSortField(field='score', order=order, op="mean")),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color(f'{category}:N').legend(None),
        ).properties(
            width=200,
            height=100,
            title=f"What tasks is model {chart_models[0]} {'best' if select_top else 'worst'} at by {category}?"
        )
    chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
    return chart
def get_frequent_patterns(task_plan, scores):
    """Return the 10 most frequent patterns found in ``task_plan``.

    Thin wrapper around ``find_frequent_patterns`` with ``k=10``. The result
    is returned to the caller; it was previously computed and then discarded.
    """
    return find_frequent_patterns(k=10, df=task_plan, scores=scores)
def list_directories(path):
    """Return the names of the entries directly under ``path`` that are directories."""
    directory_names = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            directory_names.append(entry)
    return directory_names
def update_category(domain, partition):
    """Build the "task metadata" dropdown from the columns of ``task_plan.pkl``.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.

    Returns:
        A dropdown listing every column of the task plan plus "task id", or an
        empty dropdown when the pickle file does not exist.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return gr.Dropdown([], value=None, label="task metadata")

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    categories = list(data.columns)
    return gr.Dropdown(categories + ["task id"], value=None, label="task metadata", interactive=True)
def update_category2(domain, partition, existing_category):
    """Build the optional second "task metadata" dropdown.

    The choices are the columns of ``task_plan.pkl`` without
    ``existing_category`` (the value already picked in the first dropdown).

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        existing_category: Column name to exclude, or a falsy value.

    Returns:
        The dropdown, or an empty one when the pickle file does not exist.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return gr.Dropdown([], value=None, label="task metadata")

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    categories = [col for col in data.columns if not (existing_category and col == existing_category)]
    return gr.Dropdown(categories, value=None, label="Optional: second task metadata", interactive=True)
def update_partition(domain):
    """Build the partition dropdown for the selected domain.

    Args:
        domain: A key of ``domain2folder``.

    Returns:
        A dropdown listing the sub-directories of the domain folder, with the
        first one preselected; an empty dropdown when the folder is missing.
    """
    folder = domain2folder[domain]
    path = f"{BASE_DIR}/{folder}"
    label = "task space of the following task generator"
    if not os.path.exists(path):
        return gr.Dropdown([], value=None, label=label)

    partitions = list_directories(path)
    # The folder may exist but hold no partition yet; do not index into an
    # empty list in that case.
    default_partition = partitions[0] if partitions else None
    return gr.Dropdown(partitions, value=default_partition, label=label)
def update_k(domain, partition, category=None):
    """Build the "k" slider sized to the selected data.

    The upper bound is the number of distinct values of ``category`` in
    ``merged_data.csv``, or the number of rows when ``category`` is falsy or
    "task id". The default value is half of the upper bound.

    Returns:
        A slider over ``[1, max_k]``; ``[1, 1]`` when the CSV is missing.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not os.path.exists(data_path):
        return gr.Slider(1, 1, 1, step=1.0, label="k")

    data = pd.read_csv(data_path)
    if category and category != "task id":
        max_k = len(data[category].unique())
    else:
        max_k = len(data)
    # Keep the slider valid for tiny or empty data: the maximum must not drop
    # below the minimum (1), and the default must stay inside the range.
    max_k = max(1, max_k)
    default_k = max(1, max_k // 2)
    return gr.Slider(1, max_k, default_k, step=1.0, label="k")
# def update_category_values(domain, partition, category): | |
# data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" | |
# if os.path.exists(data_path) and category is not None: | |
# data = pd.read_csv(data_path) | |
# uni_cats = list(data[category].unique()) | |
# return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values") | |
# else: | |
# return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values") | |
# def update_category_values(domain, partition, models, rank, k, threshold, baseline, category): | |
# data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" | |
# if not os.path.exists(data_path): | |
# return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values") | |
# else: | |
# merged_df = pd.read_csv(data_path) | |
# merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True) | |
# df = merged_df | |
# select_top = rank == "top" | |
# # Model X is good / bad at | |
# for model in models: | |
# if baseline: | |
# df = df[df[model] >= df[baseline]] | |
# else: | |
# if select_top: | |
# df = df[df[model] >= threshold] | |
# else: | |
# df = df[df[model] <= threshold] | |
# if not baseline: | |
# df['mean score'] = df[models].mean(axis=1) | |
# df = df.sort_values(by='mean score', ascending=False) | |
# df = df.iloc[:k, :] if select_top else df.iloc[-k:, :] | |
# uni_cats = list(df[category].unique()) | |
# return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values") | |
def update_tasks(domain, partition, find_pattern):
    """Toggle the pattern widgets and the task-metadata dropdown.

    With ``find_pattern == "yes"`` the k slider and the pattern dropdown are
    enabled and the metadata dropdown is disabled; otherwise the reverse, and
    the metadata dropdown is filled from the columns of ``merged_data.csv``.

    Returns:
        ``[k_slider, pattern_dropdown, category_dropdown]``.
    """
    folder = domain2folder[domain]
    if find_pattern == "yes":
        k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=True)
        pattern = gr.Dropdown([], value=None, interactive=True, label="pattern")
        category1 = gr.Dropdown([], value=None, interactive=False, label="task metadata")
        return [k1, pattern, category1]

    k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
    pattern = gr.Dropdown([], value=None, interactive=False, label="pattern")
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if os.path.exists(data_path):
        data = pd.read_csv(data_path)
        # Model score columns and the raw question / answer text are not task
        # metadata. Video domains use VIDEO_MODELS, so exclude those as well.
        non_columns = set(MODELS) | set(VIDEO_MODELS) | {'question', 'answer'}
        categories = [cat for cat in data.columns if cat not in non_columns]
        # Guard against a CSV that has no metadata column at all.
        default_category = categories[0] if categories else None
        category1 = gr.Dropdown(categories, value=default_category, interactive=True, label="task metadata")
    else:
        category1 = gr.Dropdown([], value=None, label="task metadata")
    return [k1, pattern, category1]
def update_pattern(domain, partition, k):
    """Build the pattern dropdown from the first ``k`` entries of ``patterns.pkl``.

    The pickle is expected to hold a sequence whose first element is the list
    of patterns; each pattern is shown in its ``str()`` form.

    Returns:
        The dropdown, or a disabled empty one when the pickle file is missing.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/patterns.pkl"
    if not os.path.exists(data_path):
        return gr.Dropdown([], value=None, interactive=False, label="pattern")

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(data_path, 'rb') as f:
        results = pickle.load(f)
    patterns = [str(p) for p in results[0]]
    # Slider values may arrive as floats; slice bounds must be integers.
    return gr.Dropdown(patterns[:int(k)], value=None, interactive=True, label="pattern")
def update_threshold(domain, partition, baseline):
    """Enable or disable the rank / k / threshold widgets.

    When a ``baseline`` is selected the three widgets are disabled. Otherwise
    they are enabled and the k slider is bounded by the number of rows in
    ``merged_data.csv``.

    Returns:
        ``[rank_radio, k_slider, threshold_slider]``.
    """
    folder = domain2folder[domain]
    if baseline:
        rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=False)
        k = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=False)
        return [rank, k, threshold]

    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if os.path.exists(data_path):
        # Keep the slider valid for small data: the maximum must be at least
        # the minimum (1) and the default must not exceed the maximum.
        max_k = max(1, len(pd.read_csv(data_path)))
        k = gr.Slider(1, max_k, min(10, max_k), step=1.0, label="k", interactive=True)
    else:
        k = gr.Slider(1, 1, 1, step=1.0, label="k")
    rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=True)
    threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True)
    return [rank, k, threshold]
def calc_surprisingness(model, scores, embeddings, k):
    """Compare each task's score with the scores of its ``k`` most similar tasks.

    Similarity is the dot product between rows of ``embeddings``. For every
    task the ``k`` most similar tasks are selected and the score difference to
    each is weighted by that similarity.

    Args:
        model: Column of ``scores`` to analyse.
        scores: DataFrame with one score column per model; rows must line up
            with the rows of ``embeddings``.
        embeddings: 2-D array with one row per task.
            NOTE(review): a task is its own nearest neighbour only if the rows
            are comparably normalised — confirm with the producer.
        k: Number of neighbours kept per task.

    Returns:
        Dict of arrays with keys 'similarity', 'task index',
        'score difference', 'all surprisingness' (each of shape
        ``(num_tasks, k)``) and 'mean surprisingness' (shape ``(num_tasks,)``).
    """
    model_scores = scores[model].to_numpy()
    sim = embeddings @ embeddings.T
    # Most similar first: sort the negated similarities ascending.
    indices = np.argsort(-sim)[:, :k]
    score_diff = model_scores[:, None] - model_scores[indices]
    # Keep only the similarities of the selected neighbours.
    sim = sim[np.arange(len(model_scores))[:, None], indices]
    all_surprisingness = score_diff * sim
    # Reuse the product above instead of computing it a second time.
    mean_surprisingness = np.mean(all_surprisingness, axis=1)
    return {
        'similarity': sim,
        'task index': indices,
        'score difference': score_diff,
        'all surprisingness': all_surprisingness,
        'mean surprisingness': mean_surprisingness,
    }
def plot_surprisingness(domain, partition, model, rank, k, num_neighbors):
    """Plot the tasks a model is surprisingly good / bad at.

    Reads the precomputed ``<model>_surprise.pkl`` (a dict with the keys
    'task index', 'score difference', 'all surprisingness' and 'similarity',
    each a 2-D array indexed by ``[task, neighbor rank]``) and
    ``merged_data.csv``. The ``k`` tasks with the highest (rank "top") or
    lowest mean surprisingness over their first ``num_neighbors`` neighbours
    are shown as a bar chart, next to a scatter plot of each selected task's
    neighbours.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        model: Model name; "-" is replaced by "_" to build the pickle name.
        rank: "top" for surprisingly good tasks, anything else for bad.
        k: Number of tasks to show.
        num_neighbors: Number of neighbours per task to consider.

    Returns:
        A horizontally concatenated Altair chart, or ``None`` when an input
        file is missing or there is nothing to plot.
    """
    folder = domain2folder[domain]
    model_str = model.replace("-", "_")
    sp_pkl = f"{BASE_DIR}/{folder}/{partition}/{model_str}_surprise.pkl"
    merged_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not (os.path.exists(sp_pkl) and os.path.exists(merged_path)):
        print(sp_pkl, merged_path)
        return None

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(sp_pkl, 'rb') as f:
        res = pickle.load(f)

    total_num_task, stored_neighbors = res['task index'].shape[:2]
    # Slider values may arrive as floats, and asking for more neighbours than
    # were stored would index past the end of the arrays.
    k = int(k)
    num_neighbors = min(int(num_neighbors), stored_neighbors)

    all_records = []
    for i in range(total_num_task):
        mean_surprisingness = np.mean(res['all surprisingness'][i, :num_neighbors])
        for j in range(num_neighbors):
            all_records.append({
                "task id": i,
                "neighbor rank": j,
                "neighbor id": res['task index'][i, j],
                "score difference": res['score difference'][i, j],
                "surprisingness": res['all surprisingness'][i, j],
                "mean surprisingness": mean_surprisingness,
                "similarity": res['similarity'][i, j],
            })
    if not all_records:
        return None

    sp_df = pd.DataFrame.from_records(all_records)
    sp_df = sp_df.sort_values(by="mean surprisingness", ascending=False)
    # Every task contributes num_neighbors rows sharing the same mean.
    num_rows = k * num_neighbors
    selected = sp_df.iloc[:num_rows, :] if rank == "top" else sp_df.iloc[-num_rows:, :]
    # Work on a copy so the new columns are not written into a slice of sp_df.
    df = selected.copy()
    df['is target'] = (df['task id'] == df['neighbor id']).astype(int)

    merged_df = pd.read_csv(merged_path)
    # Older exports use the legacy LLaVA column names (same rename as in the
    # other readers of merged_data.csv).
    merged_df = merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'})
    # Attach each neighbour's row of merged_data in one positional lookup
    # instead of one DataFrame.apply per column.
    neighbor_rows = merged_df.iloc[df['neighbor id'].astype(int).to_numpy()]
    for col in merged_df.columns:
        df[col] = neighbor_rows[col].to_numpy()

    # Not every dataset has an image column (plot_embedding checks the same).
    tooltips = ['neighbor id'] + (['image'] if 'image' in merged_df else []) + ['question', 'answer', model]

    pts = alt.selection_point(encodings=['x'])
    embeds = alt.Chart(df).mark_point(size=30, filled=True).encode(
        alt.OpacityValue(0.5),
        alt.X('x:Q', scale=alt.Scale(zero=False)),
        alt.Y('y:Q', scale=alt.Scale(zero=False)),
        alt.Color(f'{model}:Q'),
        alt.Size("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=[300, 500])),
        alt.Shape("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=['circle', 'triangle'])),
        alt.Order("is target:N"),
        tooltip=tooltips,
    ).properties(
        width=400,
        height=400,
        title=f"What are the tasks {model} is surprisingly {'good' if rank == 'top' else 'bad'} at compared to {num_neighbors} similar tasks?"
    ).transform_filter(
        pts
    )
    bar = alt.Chart(df).mark_bar().encode(
        alt.Y('mean(mean surprisingness):Q'),
        alt.X('task id:N', sort=alt.EncodingSortField(field='mean surprisingness', order='descending')),
        color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey")),
    ).add_params(pts).properties(
        width=400,
        height=200,
    )
    chart = alt.hconcat(
        bar,
        embeds
    ).resolve_legend(
        color="independent",
        size="independent"
    ).configure_title(
        fontSize=20
    ).configure_legend(
        labelFontSize=10,
        titleFontSize=10,
    )
    return chart
def plot_task_distribution(domain, partition, category):
    """Pie chart of how the tasks in ``task_plan.pkl`` distribute over ``category``.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        category: Column of the task plan to count by.

    Returns:
        An Altair arc chart, or ``None`` when the pickle file does not exist
        (matching the other plot functions of this module).
    """
    folder = domain2folder[domain]
    plan_path = f"{BASE_DIR}/{folder}/{partition}/task_plan.pkl"
    if not os.path.exists(plan_path):
        return None

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(plan_path, "rb") as f:
        task_plan = pickle.load(f)
    # reset_index() materialises the index as an 'index' column to count on.
    task_plan = task_plan.reset_index()

    col_name = category
    task_plan_cnt = task_plan.groupby(col_name)['index'].count().reset_index()
    task_plan_cnt = task_plan_cnt.rename(columns={'index': 'count'})
    task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(task_plan) * 100, 2)

    base = alt.Chart(task_plan_cnt).encode(
        alt.Theta("count:Q").stack(True),
        alt.Color(f"{col_name}:N").legend(),
        tooltip=[col_name, 'count', 'frequency (%)']
    )
    return base.mark_arc(outerRadius=120)
def plot_all(domain, partition, models, category1, category2, agg):
    """Chart the aggregated score of ``models`` over one or two metadata columns.

    With ``category2`` set, a heat map of ``category1`` x ``category2`` is
    drawn. Otherwise a bar chart over ``category1`` is drawn ("task id" maps to
    the ``index`` column): grouped by model for several models, one bar per
    category value for a single model.

    Returns:
        The configured Altair chart, or ``None`` when ``expanded_data.csv``
        does not exist.
    """
    folder = domain2folder[domain]
    csv_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(csv_path):
        return None

    all_scores = pd.read_csv(csv_path)
    scores = all_scores[all_scores['model'].isin(models)]
    aggregated_score = f'{agg}(score):Q'

    if category2:
        # Two metadata columns: heat map coloured by the aggregated score.
        x_sort = alt.EncodingSortField(field='score', order='ascending', op=agg)
        y_sort = alt.EncodingSortField(field='score', order='descending', op=agg)
        heatmap = alt.Chart(scores).mark_rect().encode(
            alt.X(f'{category1}:N', sort=x_sort),
            alt.Y(f'{category2}:N', sort=y_sort),
            alt.Color(aggregated_score),
            alt.Tooltip('score', aggregate=agg, title=f"{agg} score"),
        )
        chart = heatmap.properties(width=800, height=200)
    else:
        category = category1 if category1 != "task id" else "index"
        ascending_by_score = alt.EncodingSortField(field='score', order='ascending', op=agg)
        bars = alt.Chart(scores).mark_bar()
        if len(models) <= 1:
            # Single model: category values go on the x axis.
            encoded = bars.encode(
                alt.X(f'{category}:N', sort=ascending_by_score),
                alt.Y(aggregated_score, scale=alt.Scale(zero=True)),
                alt.Color(f'{category}:N').legend(None),
            )
            chart = encoded.properties(
                width=200,
                height=100,
                title=f"How does {models[0]} perform across {category}?",
            )
        else:
            # Several models: one small bar group per category value.
            encoded = bars.encode(
                alt.X('model:N',
                      sort=ascending_by_score,
                      axis=alt.Axis(labels=False, tickSize=0, title=None)),
                alt.Y(aggregated_score, scale=alt.Scale(zero=True)),
                alt.Color('model:N').legend(),
                alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom')),
            )
            chart = encoded.properties(
                width=200,
                height=100,
                title=f"How do models perform across {category}?",
            )

    chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle')
    chart = chart.configure_axis(labelFontSize=20, titleFontSize=20)
    chart = chart.configure_legend(labelFontSize=15, titleFontSize=15)
    return chart
def update_widgets(domain, partition, category, query_type):
    """Configure the query-builder widgets for the chosen query type.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        category: Selected task-metadata column, "task id", or a falsy value.
        query_type: One of "top k", "threshold", "model comparison" or
            "model debugging".

    Returns:
        A list of 11 items in the order ``[rank, k, direction, threshold,
        model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]``.
        The list holds 11 ``None`` values when ``expanded_data.csv`` is missing
        or ``query_type`` is not recognised.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path):
        return [None] * 11

    df = pd.read_csv(data_path)
    max_k = len(df[category].unique()) if category and category != "task id" else len(df)
    # Keep the k slider valid for tiny or empty data (minimum is 1).
    max_k = max(1, max_k)
    default_k = max(1, max_k // 2)
    aggregates = ['mean', 'median', 'min', 'max']

    # Widgets whose configuration only differs by whether they are shown.
    def rank_radio(visible):
        extra = {"interactive": True} if visible else {}
        return gr.Radio(['top', 'bottom'], value='top', label=" ", visible=visible, **extra)

    def k_slider(visible):
        extra = {"interactive": True} if visible else {}
        return gr.Slider(1, max_k, default_k, step=1.0, label="k", visible=visible, **extra)

    def direction_radio(visible):
        extra = {"interactive": True} if visible else {}
        return gr.Radio(['above', 'below'], value='above', label=" ", visible=visible, **extra)

    def threshold_slider(visible):
        extra = {"interactive": True} if visible else {}
        return gr.Slider(0, 1, 0.0, label="threshold", visible=visible, **extra)

    def aggregate_dropdown():
        return gr.Dropdown(aggregates, value="mean", label=" ", interactive=True, visible=True)

    def hidden_baseline():
        return gr.Dropdown(MODELS, value=None, label="baseline", visible=False)

    def hidden_baseline_aggregate():
        return gr.Radio(aggregates, value="mean", label="baseline aggregate", visible=False)

    if query_type == "top k":
        rank = rank_radio(True)
        k = k_slider(True)
        model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)'", multiselect=True, interactive=True, visible=True)
        model_aggregate = aggregate_dropdown()
        baseline = hidden_baseline()
        direction = direction_radio(False)
        threshold = threshold_slider(False)
        baseline_aggregate = hidden_baseline_aggregate()
        md1 = gr.Markdown("<h2>ranked by the </h2>")
        md2 = gr.Markdown("<h2>accuracy</h2>")
        md3 = gr.Markdown("")
    elif query_type == "threshold":
        model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)'", multiselect=True, interactive=True, visible=True)
        direction = direction_radio(True)
        threshold = threshold_slider(True)
        model_aggregate = aggregate_dropdown()
        rank = rank_radio(False)
        k = k_slider(False)
        baseline = hidden_baseline()
        baseline_aggregate = hidden_baseline_aggregate()
        md1 = gr.Markdown("<h2>where the</h2>")
        md2 = gr.Markdown("<h2>accuracy is</h2>")
        md3 = gr.Markdown("")
    elif query_type == "model comparison":
        model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)' accuracy", multiselect=True, interactive=True, visible=True)
        baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)' accuracy", multiselect=True, interactive=True, visible=True)
        direction = direction_radio(True)
        threshold = threshold_slider(True)
        model_aggregate = aggregate_dropdown()
        baseline_aggregate = aggregate_dropdown()
        rank = rank_radio(False)
        k = k_slider(False)
        md1 = gr.Markdown("<h2>where the difference between the </h2>")
        md2 = gr.Markdown("<h2>is </h2>")
        md3 = gr.Markdown("<h2>and the</h2>")
    elif query_type == "model debugging":
        model = gr.Dropdown(MODELS, value=MODELS[0], label="model's", multiselect=False, interactive=True, visible=True)
        baseline = hidden_baseline()
        direction = direction_radio(False)
        threshold = threshold_slider(False)
        rank = rank_radio(False)
        k = k_slider(False)
        model_aggregate = gr.Radio(aggregates, value="mean", label="task category aggregate (over models)", visible=False)
        baseline_aggregate = hidden_baseline_aggregate()
        md1 = gr.Markdown("<h2>where </h2>")
        md2 = gr.Markdown("<h2>mean accuracy is below its overall mean accuracy by one standard deviation</h2>")
        md3 = gr.Markdown("")
    else:
        # Unknown query type: return here. Falling through would read widget
        # variables that were never assigned and raise UnboundLocalError.
        return [None] * 11

    return [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
def _aggregate_scores(df, category, models, task_agg, model_agg):
    """Aggregate scores per (category, model) with ``task_agg``, then across models with ``model_agg``."""
    df = df[df['model'].isin(models)]
    df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
    return df.groupby([category])['score'].agg(model_agg).reset_index()


def select_tasks(domain, partition, category, query_type, task_agg, models, model_agg, rank, k, direction, threshold, baselines, baseline_agg):
    """Run the query built in the UI and return the matching tasks or categories.

    Query types:
        "top k": top / bottom ``k`` categories by aggregated score.
        "threshold": categories whose aggregated score is ``>=`` ("above") or
            ``<=`` ("below") ``threshold``.
        "model comparison": categories where the aggregated model score minus
            the aggregated baseline score is above (``>``) or below (``<``)
            ``threshold``, as chosen by ``direction``.
        "model debugging": categories where the model's aggregated score is
            more than one standard deviation below its overall mean score in
            ``merged_data.csv``.

    Args:
        domain: A key of ``domain2folder``.
        partition: Partition sub-folder name.
        category: Column to group by; falsy or "task id" groups by ``index``.
        query_type: See above.
        task_agg: Aggregation applied over tasks within a (category, model) group.
        models: Model name or list of model names.
        model_agg: Aggregation applied across models.
        rank: "bottom" keeps the lowest scores, anything else the highest.
        k: Number of rows to keep for "top k".
        direction: "below" or "above".
        threshold: Numeric threshold.
        baselines: Baseline model name or list of names (model comparison only).
        baseline_agg: Aggregation applied across baselines.

    Returns:
        A ``gr.DataFrame``: the matching rows of the task plan (with score
        columns attached) when grouping by task, otherwise the aggregated
        table. An empty ``gr.DataFrame`` when inputs are missing.
    """
    folder = domain2folder[domain]
    base_path = f"{BASE_DIR}/{folder}/{partition}"
    data_path = f"{base_path}/expanded_data.csv"
    merged_path = f"{base_path}/merged_data.csv"
    plan_path = f"{base_path}/task_plan.pkl"
    if not all(os.path.exists(p) for p in (data_path, merged_path, plan_path)):
        return gr.DataFrame(None)

    # The model widget is single-select for "model debugging" and multi-select
    # otherwise, so accept both a bare name and a list of names.
    model_list = [models] if isinstance(models, str) else list(models or [])
    baseline_list = [baselines] if isinstance(baselines, str) else list(baselines or [])
    if not model_list:
        return gr.DataFrame(None, label="Please select at least one model.")

    df = pd.read_csv(data_path)
    merged_df = pd.read_csv(merged_path)
    # Older exports use the legacy LLaVA column names (same rename as in the
    # other readers of merged_data.csv).
    merged_df = merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'})
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    with open(plan_path, 'rb') as f:
        task_plan = pickle.load(f)
    task_plan = task_plan.reset_index()

    if not category or category == "task id":
        category = 'index'

    exp_score_id = baseline_score_id = None
    if query_type == "top k":
        df = _aggregate_scores(df, category, model_list, task_agg, model_agg)
        df = df.sort_values(by='score', ascending=False)
        k = int(k)  # slider values may arrive as floats; iloc needs an int
        df = df.iloc[-k:, :] if rank == "bottom" else df.iloc[:k, :]
    elif query_type == "threshold":
        df = _aggregate_scores(df, category, model_list, task_agg, model_agg)
        if direction == "below":
            df = df[df['score'] <= threshold]
        else:
            df = df[df['score'] >= threshold]
    elif query_type == "model comparison":
        if not baseline_list:
            return gr.DataFrame(None, label="Please select at least one baseline.")
        model_str = ', '.join(model_list)
        exp_score_id = f'{model_agg}({model_str})' if len(model_list) > 1 else model_str
        baseline_str = ', '.join(baseline_list)
        baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baseline_list) > 1 else baseline_str

        df_model = _aggregate_scores(df, category, model_list, task_agg, model_agg)
        df_model = df_model.sort_values(by=category).rename(columns={'score': exp_score_id})
        df_baseline = _aggregate_scores(df, category, baseline_list, task_agg, baseline_agg)
        df_baseline = df_baseline.sort_values(by=category).rename(columns={'score': baseline_score_id})

        df = pd.merge(df_model, df_baseline, on=category)
        difference = df[exp_score_id] - df[baseline_score_id]
        # Honour the "above" / "below" radio that the UI shows for this query
        # type; previously only "above" was ever applied.
        df = df[difference < threshold] if direction == "below" else df[difference > threshold]
    elif query_type == "model debugging":
        model = model_list[0]
        avg_acc = merged_df[model].mean()
        std = merged_df[model].std()
        cutoff = avg_acc - std
        df = df[df['model'] == model]
        df = df.groupby(['model', category])['score'].agg(task_agg).reset_index()
        # assign() avoids writing columns into a filtered slice.
        df = df[df['score'] < cutoff].assign(mean=round(avg_acc, 4), std=round(std, 4))

    if category != 'index':
        if len(df) == 0:
            return gr.DataFrame(None, label=f"There is no such {category}.")
        return gr.DataFrame(df, label=f"The total number of such {category} is {len(df)}.")

    # Copy so that the score columns are not written into a slice of task_plan.
    selected_tasks = task_plan[task_plan['index'].isin(df['index'])].copy()
    if len(selected_tasks) == 0:
        return gr.DataFrame(None, label="There is no such task.")

    score_cols = [exp_score_id, baseline_score_id] if query_type == "model comparison" else ['score']
    # One lookup table keyed by task index replaces a per-row DataFrame filter;
    # keeping the first row per index mirrors the former `.values[0]`.
    scores_by_task = df.drop_duplicates(subset='index').set_index('index')
    for col in score_cols:
        selected_tasks[col] = selected_tasks['index'].map(scores_by_task[col])
    return gr.DataFrame(selected_tasks, label=f"There are {len(selected_tasks)} (out of {len(task_plan)}) tasks in total.")
def find_patterns(selected_tasks, num_patterns, models, baselines, model_agg="mean", baseline_agg="mean"):
    """Mine frequent attribute patterns among the selected tasks.

    Score columns ('score' and the per-query model / baseline score columns)
    are stripped before mining so that only task attributes take part.

    Args:
        selected_tasks: DataFrame of selected tasks.
        num_patterns: Forwarded to ``find_frequent_patterns`` as ``k``.
        models: Selected model name(s); used to rebuild the name of the
            experiment score column.
        baselines: Selected baseline name(s), or a falsy value when unused.
        model_agg: Aggregator label that prefixes the experiment score column
            when several models are selected. Defaults to "mean" so the
            handler also works when the aggregators are not wired as inputs.
        baseline_agg: Same as ``model_agg`` for the baseline score column.

    Returns:
        A ``gr.DataFrame`` with one row per pattern: 'pattern', 'count' and,
        when the input carries a 'score' column, 'score'.
    """
    if len(selected_tasks) == 0:
        return gr.DataFrame(None)
    print(selected_tasks.head())
    scores = selected_tasks['score'] if 'score' in selected_tasks else None
    print(scores)

    # Rebuild the score column names with the same convention used when the
    # query result was produced: "<agg>(a, b)" for several names, else "a".
    score_cols = ['score']
    for names, agg in ((models, model_agg), (baselines, baseline_agg)):
        if not names:
            continue
        joined = ', '.join(names)
        score_cols.append(f'{agg}({joined})' if len(names) > 1 else joined)

    # De-duplicate (order preserved) so a name shared by the model and the
    # baseline is only dropped once.
    present = [name for name in dict.fromkeys(score_cols) if name in selected_tasks]
    tasks_only = selected_tasks.drop(columns=present)

    results = find_frequent_patterns(k=num_patterns, df=tasks_only, scores=scores)

    def describe(pattern):
        # pattern[1] holds (column, value) pairs; pattern[0] is the count.
        return ', '.join(f"{col_name} = {col_val}" for col_name, col_val in pattern[1])

    if scores is not None:
        # With scores, the miner returns (patterns, pattern_scores).
        records = [
            {'pattern': describe(pattern), 'count': pattern[0], 'score': score}
            for pattern, score in zip(results[0], results[1])
        ]
    else:
        records = [{'pattern': describe(pattern), 'count': pattern[0]} for pattern in results]
    return gr.DataFrame(pd.DataFrame.from_records(records))
def visualize_task_distribution(selected_tasks, col_name, model1, model2):
    """Draw a pie chart of how the selected tasks are distributed over a column.

    Args:
        selected_tasks: DataFrame of selected tasks.
        col_name: Column whose value counts are plotted.
        model1: Unused; kept so existing event wiring keeps working.
        model2: Unused; kept so existing event wiring keeps working.

    Returns:
        An Altair arc chart, or ``None`` when there is nothing to plot (no
        column chosen, empty table, or the column is not in the table).
    """
    if (
        not col_name
        or selected_tasks is None
        or len(selected_tasks) == 0
        or col_name not in selected_tasks
    ):
        return None

    # size() counts rows per group without relying on a separate 'index'
    # column, so grouping by 'index' itself no longer collides on reset_index.
    task_plan_cnt = selected_tasks.groupby(col_name).size().reset_index(name='count')
    task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(selected_tasks) * 100, 2)
    print(task_plan_cnt.head())

    tooltips = [col_name, 'count', 'frequency (%)']
    base = alt.Chart(task_plan_cnt).encode(
        alt.Theta("count:Q").stack(True),
        alt.Color(f"{col_name}:N").legend(),
        tooltip=tooltips,
    )
    return base.mark_arc(outerRadius=120)
def plot_performance_for_selected_tasks(domain, partition, df, query_type, models, baselines, select_category, vis_category, task_agg, model_agg, baseline_agg, rank, direction, threshold):
    """Plot model accuracy for the tasks / task metadata returned by a query.

    Args:
        domain: Scenario name; translated to a folder through ``domain2folder``.
        partition: Sub-folder of the scenario that holds the CSV data.
        df: Query result table shown in the UI.
        query_type: One of "top k", "threshold", "model debugging",
            "model comparison".
        models: Selected model name(s); a single name for "model debugging".
        baselines: Selected baseline name(s); only read for "model comparison".
        select_category: What the query selected ("task id" or a metadata column).
        vis_category: Metadata column used to group the bars.
        task_agg: Ignored; the task-level aggregator is fixed to "mean".
        model_agg: Aggregator label used for the models' score column / titles.
        baseline_agg: Aggregator label used for the baselines' score column.
        rank: "top" or "bottom"; only used in the "top k" title.
        direction: Only used in the "threshold" title.
        threshold: Only used in the "threshold" title.

    Returns:
        A configured Altair bar chart, or ``None`` when the data files are
        missing, the table is empty, or no grouping column was chosen.
    """
    domain = domain2folder[domain]
    task_agg = "mean"  # fixed on purpose: the passed-in value is overridden
    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
    mereged_data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
    if not os.path.exists(data_path) or not os.path.exists(mereged_data_path) or len(df) == 0:
        return None
    if not vis_category:
        # Every chart below groups by vis_category; without it there is
        # nothing meaningful to draw.
        return None

    select_tasks = select_category == "task id"
    # Individual tasks are aggregated inside the chart; metadata rows already
    # carry an aggregated score.
    y_val = f'{task_agg}(score):Q' if select_tasks else 'score:Q'
    if select_tasks:
        select_category = "index"
    print(df.head())

    if query_type == "model comparison":
        # Re-format the data for plotting: one row per (category, model) pair.
        model_str = ', '.join(models)
        exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
        baseline_str = ', '.join(baselines)
        baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str
        print(exp_score_id, baseline_score_id)
        df = df.melt(id_vars=[select_category], value_vars=[exp_score_id, baseline_score_id])
        df.rename(columns={'variable': 'model', 'value': 'score'}, inplace=True)
        print(df.head())
        if select_tasks:
            merged_df = pd.read_csv(mereged_data_path)
            # Look the grouping value up by row position of merged_df in one
            # vectorised pass instead of filtering the whole frame per row.
            df[vis_category] = df['index'].map(merged_df[vis_category])
        num_columns = len(df['model'].unique()) * len(df[f'{vis_category}'].unique())
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order='descending', op=task_agg),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y(y_val, scale=alt.Scale(zero=True), title="accuracy"),
            alt.Color('model:N').legend(),
            alt.Column(f'{vis_category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom', labelFontSize=20, titleFontSize=20,))
        ).properties(
            width=num_columns * 30,
            height=200,
            title=f"How do models perform by {vis_category}?"
        )
        print(num_columns * 50)
    else:
        if query_type == "model debugging":
            y_title = "accuracy"
            plot_title = f"{models} performs worse than its (mean - std) on these {vis_category}s"
            # A single model name is expected here; only wrap it when it is
            # not already a collection so the isin() filter below stays valid.
            models = [models] if isinstance(models, str) else list(models)
        else:
            model_str = ', '.join(models)
            y_title = f"{model_agg} accuracy" if len(models) > 0 else "accuracy"
            suffix = f"on these tasks (by {vis_category})" if select_category == "index" else f"on these {vis_category}s"
            if query_type == "top k":
                plot_title = f"The {model_agg} accuracy of {model_str} is the {'highest' if rank == 'top' else 'lowest'} " + suffix
            elif query_type == "threshold":
                plot_title = f"The {model_agg} accuracy of {model_str} is {direction} {threshold} " + suffix
            else:
                # Fallback so an unexpected query type cannot leave the
                # title undefined.
                plot_title = f"The {model_agg} accuracy of {model_str} " + suffix
        if select_tasks:
            # Plot per-task rows of the selected models from the expanded data.
            expand_df = pd.read_csv(data_path)
            task_ids = list(df['index'].unique())
            df = expand_df[(expand_df['model'].isin(models)) & (expand_df['task id'].isin(task_ids))]
        num_columns = len(df[f'{vis_category}'].unique())
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{vis_category}:N', sort=alt.EncodingSortField(field='score', order='ascending', op=task_agg), axis=alt.Axis(labelAngle=-20)),
            alt.Y(y_val, scale=alt.Scale(zero=True), title=y_title),
            alt.Color(f'{vis_category}:N').legend(None),
        ).properties(
            width=num_columns * 30,
            height=200,
            title=plot_title
        )

    chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
        labelFontSize=20,
        titleFontSize=20,
    ).configure_legend(
        labelFontSize=20,
        titleFontSize=20,
        labelLimit=200,
    )
    return chart
def sync_vis_category(domain, partition, category):
    """Keep the two "by task metadata" dropdowns in step with the queried category.

    Args:
        domain: Scenario name; translated to a folder through ``domain2folder``.
        partition: Sub-folder of the scenario that holds ``task_plan.pkl``.
        category: The queried category ("task id", a metadata column, or empty).

    Returns:
        A list of two dropdown updates. When a concrete metadata category was
        queried both are locked to it; otherwise both offer every column of
        the task plan. ``[None, None]`` when the task plan file is missing.
    """
    domain = domain2folder[domain]
    if category and category != "task id":
        return [
            gr.Dropdown([category], value=category, label="by task metadata", interactive=False)
            for _ in range(2)
        ]

    data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return [None, None]

    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling executes code from the file; only trusted data belongs here.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    categories = list(data.columns)
    default = categories[0] if categories else None
    return [
        gr.Dropdown(categories, value=default, label="by task metadata", interactive=True)
        for _ in range(2)
    ]
def hide_fpm_and_dist_components(domain, partition, category):
    """Show or hide the pattern-mining and distribution widgets.

    The widgets are hidden when a concrete metadata category is queried and
    shown when the query is over "task id" (or no category is set).

    Args:
        domain: Scenario name; translated to a folder through ``domain2folder``.
        partition: Sub-folder of the scenario that holds ``task_plan.pkl``.
        category: The queried category.

    Returns:
        Component updates in the order
        ``[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]``.
    """
    domain = domain2folder[domain]
    print(category)
    if category and category != "task id":
        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", visible=False)
        btn_pattern = gr.Button(value="Find patterns among tasks", visible=False)
        table = gr.DataFrame({}, height=250, visible=False)
        dist_chart = gr.Plot(visible=False)
        col_name = gr.Dropdown([], value=None, label="by task metadata", visible=False)
        btn_dist = gr.Button(value="Visualize task distribution", visible=False)
    else:
        data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
        categories = []
        if os.path.exists(data_path):
            # Close the file deterministically instead of leaking the handle.
            # NOTE: unpickling executes code from the file; only trusted data belongs here.
            with open(data_path, 'rb') as f:
                data = pickle.load(f)
            categories = list(data.columns)
        col_name = gr.Dropdown(
            categories,
            value=categories[0] if categories else None,
            label="by task metadata",
            interactive=True,
            visible=True,
        )
        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", interactive=True, visible=True)
        btn_pattern = gr.Button(value="Find patterns among tasks", interactive=True, visible=True)
        table = gr.DataFrame({}, height=250, interactive=True, visible=True)
        dist_chart = gr.Plot(visible=True)
        btn_dist = gr.Button(value="Visualize task distribution", interactive=True, visible=True)
    return [num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]
# ---------------------------------------------------------------------------
# UI definition: theme, the four tabs, and the event wiring between widgets.
# NOTE(review): the nesting of rows/columns below was reconstructed; confirm
# that each button sits where intended (same row as its control vs. below it).
# ---------------------------------------------------------------------------
theme = gr.Theme.from_hub('sudeepshouche/minimalist')
theme.font = [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"]
theme.text_size = gr.themes.sizes.text_lg

demo = gr.Blocks(theme=theme, title="TaskVerse-UI")
with demo:
    # Page header: title centred between two empty columns.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                r""
            )
        with gr.Column(scale=1):
            gr.Markdown(
                r"<h1>Welcome to TaskVerse-UI! </h1>"
            )
        with gr.Column(scale=1):
            gr.Markdown(
                r""
            )

    with gr.Tab("📊 Overview"):
        gr.Markdown(
            r"<h2>📊 Visualize the overall task distribution and model performance </h2>"
        )
        with gr.Row():
            domain = gr.Radio(domains, label="scenario", scale=2)
            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)

        gr.Markdown(
            r"<h2>Overall task metadata distribution</h2>"
        )
        with gr.Row():
            category = gr.Dropdown([], value=None, label="task metadata")
            partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
        with gr.Row():
            output = gr.Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_task_distribution, [domain, partition, category], output)

        gr.Markdown(
            r"<h2>Models' overall performance by task metadata</h2>"
        )
        with gr.Row():
            with gr.Column(scale=2):
                models = gr.CheckboxGroup(MODELS, label="model(s)", value=MODELS)
            with gr.Column(scale=1):
                aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="aggregate models' accuracy by")
        with gr.Row():
            category1 = gr.Dropdown([], value=None, label="task metadata", interactive=True)
            category2 = gr.Dropdown([], value=None, label="Optional: second task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=category1)
            category1.change(fn=update_category2, inputs=[domain, partition, category1], outputs=category2)
            domain.change(fn=update_partition_and_models, inputs=domain, outputs=[partition, models])
        with gr.Row():
            output = gr.Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_all, [domain, partition, models, category1, category2, aggregate], output)

    with gr.Tab("✨ Embedding"):
        gr.Markdown(
            r"<h2>✨ Visualize the tasks' embeddings in the 2D space </h2>"
        )
        with gr.Row():
            domain2 = gr.Radio(domains, label="scenario", scale=2)
            partition2 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
            category2 = gr.Dropdown([], value=None, label="colored by task metadata", scale=1)
            domain2.change(fn=update_partition, inputs=domain2, outputs=partition2)
            partition2.change(fn=update_category, inputs=[domain2, partition2], outputs=category2)
        with gr.Row():
            output2 = gr.Plot()
        with gr.Row():
            btn = gr.Button(value="Run")
            btn.click(plot_embedding, [domain2, partition2, category2], output2)

    with gr.Tab("❓ Query"):
        gr.Markdown(
            r"<h2>❓ Find out the answers to your queries by finding and visualizing the relevant tasks and models' performance </h2>"
        )
        with gr.Row(equal_height=True):
            domain = gr.Radio(domains, label="scenario", scale=2)
            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
        with gr.Row():
            query1 = "top k"
            query2 = "threshold"
            query3 = "model debugging"
            query4 = "model comparison"
            query_type = gr.Radio([query1, query2, query3, query4], value="top k", label=r"query type")
        with gr.Row():
            with gr.Accordion("See more details about the query type"):
                gr.Markdown(
                    r"<ul><li>Top k: Find the k tasks or task metadata that the model(s) perform the best or worst on</li><li>Threshold: Find the tasks or task metadata where the model(s)' performance is greater or lower than a given threshold t</li><li>Model debugging: Find the tasks or task metadata where a model performs significantly worse than its average performance (by one standard deviation)</li><li>Model comparison: Find the tasks or task metadata where some model(s) perform better or worse than the baseline(s) by a given threshold t</li></ul>"
                )

        # The query is laid out as a sentence:
        # "Help me find the <rank> <k> <category> ranked by the <agg> of <models> ... accuracy".
        with gr.Row():
            gr.Markdown(r"<h2>Help me find the</h2>")
        with gr.Row(equal_height=True):
            rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
            k = gr.Slider(1, 10, 5 // 2, step=1.0, label="k", interactive=True, visible=True)
            category = gr.Dropdown([], value=None, label="tasks / task metadata", interactive=True)
        with gr.Row():
            md1 = gr.Markdown(r"<h2>ranked by the </h2>")
        with gr.Row(equal_height=True):
            model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True, scale=1)
            model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)", multiselect=True, interactive=True, visible=True, scale=2)
        with gr.Row():
            md3 = gr.Markdown(r"")
        with gr.Row(equal_height=True):
            baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=False, scale=1)
            baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)'", visible=False, scale=2)
        with gr.Row():
            md2 = gr.Markdown(r"<h2>accuracy</h2>")
        with gr.Row():
            direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
            threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)

        widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
        partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
        query_type.change(update_widgets, [domain, partition, category, query_type], widgets)
        domain.change(fn=update_partition_and_models_and_baselines, inputs=domain, outputs=[partition, model, baseline])

        with gr.Row():
            df = gr.DataFrame({}, height=200)
        btn = gr.Button(value="Find tasks / task metadata")
        # `aggregate` is the radio defined in the Overview tab.
        btn.click(select_tasks, [domain, partition, category, query_type, aggregate, model, model_aggregate, rank, k, direction, threshold, baseline, baseline_aggregate], df)

        with gr.Row():
            plot = gr.Plot()
        with gr.Row():
            col_name2 = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name2)
            btn_plot = gr.Button(value="Plot model performance", interactive=True)
            btn_plot.click(plot_performance_for_selected_tasks, [domain, partition, df, query_type, model, baseline, category, col_name2, aggregate, model_aggregate, baseline_aggregate, rank, direction, threshold], plot)

        with gr.Row():
            dist_chart = gr.Plot()
        with gr.Row():
            col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name)
            btn_dist = gr.Button(value="Visualize task distribution", interactive=True)
            btn_dist.click(visualize_task_distribution, [df, col_name, model, baseline], dist_chart)

        with gr.Row():
            table = gr.DataFrame({}, height=250)
        with gr.Row():
            num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns")
            btn_pattern = gr.Button(value="Find patterns among tasks")
            # find_patterns takes the two aggregator names as its last two
            # parameters; pass them so the handler receives every argument.
            btn_pattern.click(find_patterns, [df, num_patterns, model, baseline, model_aggregate, baseline_aggregate], table)

        category.change(fn=hide_fpm_and_dist_components, inputs=[domain, partition, category], outputs=[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart])
        category.change(fn=sync_vis_category, inputs=[domain, partition, category], outputs=[col_name, col_name2])
        category.change(fn=update_k, inputs=[domain, partition, category], outputs=k)

    with gr.Tab("😮 Surprisingness"):
        gr.Markdown(r"<h2>😮 Find out the tasks a model is surprisingly good or bad at compared to similar tasks</h2>")
        with gr.Row():
            domain3 = gr.Radio(domains, label="scenario", scale=2)
            partition3 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
        with gr.Row():
            model3 = gr.Dropdown(MODELS, value=MODELS[0], label="model", interactive=True, visible=True)
            k3 = gr.Slider(1, 100, 50, step=1.0, label="number of surprising tasks", interactive=True)
            num_neighbors = gr.Slider(1, 100, 50, step=1.0, label="number of neighbors", interactive=True)
            rank3 = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
            domain3.change(fn=update_partition_and_models, inputs=domain3, outputs=[partition3, model3])
        with gr.Row():
            output3 = gr.Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_surprisingness, [domain3, partition3, model3, rank3, k3, num_neighbors], output3)

# if __name__ == "__main__":
demo.launch(share=True)