import os
import ast
import pickle
from copy import deepcopy

import pandas as pd
import numpy as np
from prefixspan import PrefixSpan
import gradio as gr
import altair as alt

alt.data_transformers.enable("vegafusion")
# from dynabench.task_evaluator import *
BASE_DIR = "db"
MODELS = ['qwenvl-chat', 'qwenvl', 'llava15-7b', 'llava15-13b', 'instructblip-vicuna13b', 'instructblip-vicuna7b']
VIDEO_MODELS = ['video-chat2-7b','video-llama2-7b','video-llama2-13b','chat-univi-7b','chat-univi-13b','video-llava-7b','video-chatgpt-7b']
domains = ["imageqa-2d-sticker", "imageqa-3d-tabletop", "imageqa-scene-graph", "videoqa-3d-tabletop", "videoqa-scene-graph"]
domain2folder = {"imageqa-2d-sticker": "2d",
"imageqa-3d-tabletop": "3d",
"imageqa-scene-graph": "sg",
"videoqa-3d-tabletop": "video-3d",
"videoqa-scene-graph": "video-sg",
None: '2d'}
def find_frequent_patterns(k, df, scores=None):
    if len(df) == 0:
        # keep the return shape consistent with the scored branch below
        return ([], []) if scores is not None else []
df = df.reset_index(drop=True)
cols = df.columns.to_list()
df = df.fillna('').astype('str')
db = [[(c, v) for c, v in zip(cols, d) if v] for d in df.values.tolist()]
ps = PrefixSpan(db)
patterns = ps.topk(k, closed=True)
if scores is None:
return patterns
else:
aggregated_scores = []
scores = np.asarray(scores)
for count, pattern in patterns:
            q = ' and '.join([f"`{c}` == {repr(v)}" for c, v in pattern])
indices = df.query(q).index.to_numpy()
aggregated_scores.append(np.mean(scores[indices]))
return patterns, aggregated_scores
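# Illustrative usage of `find_frequent_patterns` (a sketch; this helper is not
# called by the app, and the toy data below is hypothetical). Each DataFrame row
# becomes a transaction of (column, value) items, and PrefixSpan mines the k most
# frequent closed patterns; with `scores`, each pattern also gets the mean score
# of its matching rows.
def _example_find_frequent_patterns():
    toy = pd.DataFrame({
        'shape': ['cube', 'cube', 'sphere', 'cube'],
        'color': ['red', 'red', 'blue', 'blue'],
    })
    toy_scores = [0.9, 0.8, 0.2, 0.5]
    patterns, agg_scores = find_frequent_patterns(k=3, df=toy, scores=toy_scores)
    for (count, pattern), s in zip(patterns, agg_scores):
        # pattern is a list of (column, value) pairs, e.g. [('shape', 'cube')]
        print(count, pattern, round(s, 3))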
def update_partition_and_models(domain):
domain = domain2folder[domain]
path = f"{BASE_DIR}/{domain}"
if os.path.exists(path):
partitions = list_directories(path)
        if "video" in domain:
model = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="model")
else:
model = gr.Dropdown(MODELS, value=MODELS[0], label="model")
partition = gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator")
return [partition, model]
else:
partition = gr.Dropdown([], value=None, label="task space of the following task generator")
model = gr.Dropdown([], value=None, label="model")
return [partition, model]
def update_partition_and_models_and_baselines(domain):
domain = domain2folder[domain]
path = f"{BASE_DIR}/{domain}"
if os.path.exists(path):
partitions = list_directories(path)
        if "video" in domain:
model = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="model")
baseline = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="baseline")
else:
model = gr.Dropdown(MODELS, value=MODELS[0], label="model")
baseline = gr.Dropdown(MODELS, value=MODELS[0], label="baseline")
partition = gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator")
else:
partition = gr.Dropdown([], value=None, label="task space of the following task generator")
model = gr.Dropdown([], value=None, label="model")
baseline = gr.Dropdown([], value=None, label="baseline")
return [partition, model, baseline]
def get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if not os.path.exists(data_path):
return []
else:
merged_df = pd.read_csv(data_path)
merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True)
df = merged_df
select_top = rank == "top"
# Model X is good / bad at
for model in models:
if baseline:
df = df[df[model] >= df[baseline]]
else:
if select_top:
df = df[df[model] >= threshold]
else:
df = df[df[model] <= threshold]
        if not baseline:
            df = df.copy()  # avoid SettingWithCopyWarning when adding a column to a slice
            df['mean score'] = df[models].mean(axis=1)
df = df.sort_values(by='mean score', ascending=False)
df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
task_ids = list(df.index)
return task_ids
def plot_patterns(domain, partition, models, rank, k, threshold, baseline, pattern, order):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
if not os.path.exists(data_path):
return None
task_ids = get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline)
expand_df = pd.read_csv(data_path)
chart_df = expand_df[expand_df['model'].isin((models + [baseline]) if baseline else models)]
chart_df = chart_df[chart_df['task id'].isin(task_ids)]
print(pattern)
    # pattern is a stringified (count, [(column, value), ...]) tuple;
    # literal_eval parses it without executing arbitrary code
    freq, cols = ast.literal_eval(pattern)
pattern_str = ""
df = chart_df
for col in cols:
col_name, col_val = col
        try:
            col_val = int(col_val)
        except ValueError:
            pass  # keep non-numeric values as strings
df = df[df[col_name] == col_val]
pattern_str += f"{col_name} = {col_val}, "
print(len(df))
if baseline:
model_str = (', '.join(models) if len(models) > 1 else models[0])
phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs'
title = f"{phrase} better than {baseline} on {freq} tasks where {pattern_str[:-2]}"
else:
title = f"Models are {'best' if rank == 'top' else 'worst'} at {freq} tasks where {pattern_str[:-2]}"
chart = alt.Chart(df).mark_bar().encode(
alt.X('model:N',
sort=alt.EncodingSortField(field=f'score', order=order, op="mean"),
axis=alt.Axis(labels=False, tickSize=0)), # no title, no label angle),
alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
alt.Color('model:N').legend(),
).properties(
width=400,
height=300,
title=title
)
return chart
def plot_embedding(domain, partition, category):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if os.path.exists(data_path):
merged_df = pd.read_csv(data_path)
# models = merged_df.columns
has_image = 'image' in merged_df
chart = alt.Chart(merged_df).mark_point(size=30, filled=True).encode(
alt.OpacityValue(0.5),
alt.X('x:Q'),
alt.Y('y:Q'),
alt.Color(f'{category}:N'),
tooltip=['question', 'answer'] + (['image'] if has_image else []),
).properties(
width=800,
height=800,
title="UMAP Projected Task Embeddings"
).configure_axis(
labelFontSize=25,
titleFontSize=25,
).configure_title(
fontSize=40
).configure_legend(
labelFontSize=25,
titleFontSize=25,
).interactive()
return chart
else:
return None
def plot_multi_models(domain, partition, category, cat_options, models, order, pattern, aggregate="mean"):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
if not os.path.exists(data_path):
return None
expand_df = pd.read_csv(data_path)
print(pattern)
if pattern is not None:
df = expand_df
        freq, cols = ast.literal_eval(pattern)  # same stringified tuple format as in plot_patterns
pattern_str = ""
for col in cols:
col_name, col_val = col
            try:
                col_val = int(col_val)
            except ValueError:
                pass  # keep non-numeric values as strings
df = df[df[col_name] == col_val]
pattern_str += f"{col_name} = {col_val}, "
chart = alt.Chart(df).mark_bar().encode(
alt.X('model:N',
sort=alt.EncodingSortField(field=f'score', order='ascending', op="mean"),
axis=alt.Axis(labels=False, tickSize=0)), # no title, no label angle),
alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
alt.Color('model:N').legend(),
).properties(
width=200,
height=100,
title=f"How do models perform on tasks where {pattern_str[:-2]} (N={freq})?"
)
return chart
else:
df = expand_df[(expand_df['model'].isin(models)) & (expand_df[category].isin(cat_options))]
if len(models) > 1:
chart = alt.Chart(df).mark_bar().encode(
alt.X('model:N',
sort=alt.EncodingSortField(field=f'score', order=order, op="mean"),
axis=alt.Axis(labels=False, tickSize=0, title=None)),
alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
alt.Color('model:N').legend(),
alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
).properties(
width=200,
height=100,
title=f"How do models perform across {category}?"
)
else:
chart = alt.Chart(df).mark_bar().encode(
alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean")), # no title, no label angle),
alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
alt.Color(f'{category}:N').legend(None),
).properties(
width=200,
height=100,
title=f"How does {models[0]} perform across {category}?"
)
chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
return chart
def plot(domain, partition, models, rank, k, threshold, baseline, order, category, cat_options):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
expand_data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
# task_plan.reset_index(inplace=True)
if not os.path.exists(data_path) or not os.path.exists(expand_data_path):
return None
else:
merged_df = pd.read_csv(data_path)
merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True)
expand_df = pd.read_csv(expand_data_path)
df = merged_df
select_top = rank == "top"
# Model X is good / bad at
for model in models:
if baseline:
df = df[df[model] >= df[baseline]]
else:
if select_top:
df = df[df[model] >= threshold]
else:
df = df[df[model] <= threshold]
        if not baseline:
            df = df.copy()  # avoid SettingWithCopyWarning when adding a column to a slice
            df['mean score'] = df[models].mean(axis=1)
df = df.sort_values(by='mean score', ascending=False)
df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
task_ids = list(df.index)
        if baseline:
            models = models + [baseline]  # avoid mutating the caller's list
chart_df = expand_df[expand_df['model'].isin(models)]
chart_df = chart_df[chart_df['task id'].isin(task_ids)]
if cat_options:
df = chart_df[chart_df[category].isin(cat_options)]
else:
df = chart_df
if baseline:
model_str = (', '.join(models) if len(models) > 1 else models[0])
phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs'
title = f"Are there any tasks where {phrase} better than {baseline} (by {category})?"
else:
title = f"What tasks are models {'best' if select_top else 'worst'} at by {category}?"
if len(models) > 1:
chart = alt.Chart(df).mark_bar().encode(
alt.X('model:N',
sort=alt.EncodingSortField(field=f'score', order=order, op="mean"),
axis=alt.Axis(labels=False, tickSize=0, title=None)),
alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
alt.Color('model:N').legend(),
alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
).properties(
width=200,
height=100,
title=title
)
else:
chart = alt.Chart(df).mark_bar().encode(
alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean")), # no title, no label angle),
alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
alt.Color(f'{category}:N').legend(None),
).properties(
width=200,
height=100,
title=f"What tasks is model {models[0]} {'best' if select_top else 'worst'} at by {category}?"
)
chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
return chart
def get_frequent_patterns(task_plan, scores):
    return find_frequent_patterns(k=10, df=task_plan, scores=scores)
def list_directories(path):
"""List all directories within a given path."""
return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
def update_category(domain, partition):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
if os.path.exists(data_path):
data = pickle.load(open(data_path, 'rb'))
categories = list(data.columns)
category = gr.Dropdown(categories+["task id"], value=None, label="task metadata", interactive=True)
return category
else:
return gr.Dropdown([], value=None, label="task metadata")
def update_category2(domain, partition, existing_category):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
if os.path.exists(data_path):
data = pickle.load(open(data_path, 'rb'))
categories = list(data.columns)
if existing_category and existing_category in categories:
categories.remove(existing_category)
category = gr.Dropdown(categories, value=None, label="Optional: second task metadata", interactive=True)
return category
else:
return gr.Dropdown([], value=None, label="task metadata")
def update_partition(domain):
domain = domain2folder[domain]
path = f"{BASE_DIR}/{domain}"
if os.path.exists(path):
partitions = list_directories(path)
return gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator")
else:
return gr.Dropdown([], value=None, label="task space of the following task generator")
def update_k(domain, partition, category=None):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if os.path.exists(data_path):
data = pd.read_csv(data_path)
max_k = len(data[category].unique()) if category and category != "task id" else len(data)
mid = max_k // 2
return gr.Slider(1, max_k, mid, step=1.0, label="k")
else:
return gr.Slider(1, 1, 1, step=1.0, label="k")
# def update_category_values(domain, partition, category):
# data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
# if os.path.exists(data_path) and category is not None:
# data = pd.read_csv(data_path)
# uni_cats = list(data[category].unique())
# return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values")
# else:
# return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values")
# def update_category_values(domain, partition, models, rank, k, threshold, baseline, category):
# data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
# if not os.path.exists(data_path):
# return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values")
# else:
# merged_df = pd.read_csv(data_path)
# merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True)
# df = merged_df
# select_top = rank == "top"
# # Model X is good / bad at
# for model in models:
# if baseline:
# df = df[df[model] >= df[baseline]]
# else:
# if select_top:
# df = df[df[model] >= threshold]
# else:
# df = df[df[model] <= threshold]
# if not baseline:
# df['mean score'] = df[models].mean(axis=1)
# df = df.sort_values(by='mean score', ascending=False)
# df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
# uni_cats = list(df[category].unique())
# return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values")
def update_tasks(domain, partition, find_pattern):
domain = domain2folder[domain]
if find_pattern == "yes":
k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=True)
pattern = gr.Dropdown([], value=None, interactive=True, label="pattern")
category1 = gr.Dropdown([], value=None, interactive=False, label="task metadata")
return [k1, pattern, category1]
else:
k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
pattern = gr.Dropdown([], value=None, interactive=False, label="pattern")
data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if os.path.exists(data_path):
data = pd.read_csv(data_path)
non_columns = MODELS + ['question', 'answer']
categories = [cat for cat in list(data.columns) if cat not in non_columns]
category1 = gr.Dropdown(categories, value=categories[0], interactive=True, label="task metadata")
else:
category1 = gr.Dropdown([], value=None, label="task metadata")
return [k1, pattern, category1]
def update_pattern(domain, partition, k):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/patterns.pkl"
if not os.path.exists(data_path):
return gr.Dropdown([], value=None, interactive=False, label="pattern")
else:
results = pickle.load(open(data_path, 'rb'))
patterns = results[0]
patterns = [str(p) for p in patterns]
print(patterns)
return gr.Dropdown(patterns[:k], value=None, interactive=True, label="pattern")
def update_threshold(domain, partition, baseline):
domain = domain2folder[domain]
print(baseline)
if baseline:
rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=False)
k = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=False)
return [rank, k, threshold]
else:
data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if os.path.exists(data_path):
data = pd.read_csv(data_path)
max_k = len(data)
print(max_k)
k = gr.Slider(1, max_k, 10, step=1.0, label="k", interactive=True)
else:
k = gr.Slider(1, 1, 1, step=1.0, label="k")
rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=True)
threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True)
return [rank, k, threshold]
def calc_surprisingness(model, scores, embeddings, k):
scores = scores[model].to_numpy()
sim = embeddings @ embeddings.T
# print("sim values:", sim.shape, sim)
indices = np.argsort(-sim)[:, :k]
# print("indices:", indices.shape, indices)
score_diff = scores[:, None] - scores[indices]
# print("score differences:", score_diff.shape, score_diff)
sim = sim[np.arange(len(scores))[:, None], indices]
# print("top10 sim:", sim.shape, sim)
all_surprisingness = score_diff * sim
# print("all surprisingness:", all_surprisingness.shape, all_surprisingness)
mean_surprisingness = np.mean(score_diff * sim, axis=1)
res = {'similarity': sim,
'task index': indices,
'score difference': score_diff,
'all surprisingness': all_surprisingness,
'mean surprisingness': mean_surprisingness
}
return res
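# A minimal sketch of how `calc_surprisingness` is meant to be used (not called
# by the app; the scores and embeddings below are hypothetical). Surprisingness
# weights the score gap between a task and each of its k most similar tasks (by
# embedding dot product) by that similarity, so a task that scores very
# differently from near-identical tasks stands out.
def _example_calc_surprisingness():
    rng = np.random.default_rng(0)
    toy_scores = pd.DataFrame({'qwenvl': rng.random(5)})
    toy_embeds = rng.normal(size=(5, 4))
    # unit-normalize so the dot product equals cosine similarity
    toy_embeds /= np.linalg.norm(toy_embeds, axis=1, keepdims=True)
    res = calc_surprisingness('qwenvl', toy_scores, toy_embeds, k=3)
    print(res['mean surprisingness'])  # one value per task; large = surprisingly good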
def plot_surprisingness(domain, partition, model, rank, k, num_neighbors):
domain = domain2folder[domain]
# model = model[0]
model_str = model.replace("-", "_")
# sp_path = f"{BASE_DIR}/{domain}/{partition}/surprise_data.csv"
sp_pkl = f"{BASE_DIR}/{domain}/{partition}/{model_str}_surprise.pkl"
merged_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if os.path.exists(sp_pkl) and os.path.exists(merged_path): # and not os.path.exists(sp_path)
# if os.path.exists(sp_path):
# sp_df = pd.read_csv(sp_path)
# # res = calc_surprisingness(model, scores, embeds, num_neighbors)
# # k = 10
# model = 'qwenvl'
# num_neighbors = 10
# if os.path.exists(sp_pkl):
res = pickle.load(open(sp_pkl, 'rb'))
total_num_task = res['task index'].shape[0]
all_records = []
for i in range(total_num_task):
mean_surprisingness = np.mean(res['all surprisingness'][i, :num_neighbors])
for j in range(num_neighbors):
neighbor_id = res['task index'][i, j]
score_diff = res['score difference'][i, j]
surprisingness = res['all surprisingness'][i, j]
similarity = res['similarity'][i, j]
record = {"task id": i,
"neighbor rank": j,
"neighbor id": neighbor_id,
"score difference": score_diff,
"surprisingness": surprisingness,
"mean surprisingness": mean_surprisingness,
"similarity": similarity
}
# print(record)
all_records.append(record)
sp_df = pd.DataFrame.from_records(all_records)
sp_df = sp_df.sort_values(by="mean surprisingness", ascending=False)
num_rows = k * num_neighbors
        df = (sp_df.iloc[:num_rows, :] if rank == "top" else sp_df.iloc[-num_rows:, :]).copy()  # copy so new columns can be added safely
print(len(df))
df['is target'] = df.apply(lambda row: int(row['task id'] == row['neighbor id']), axis=1)
merged_df = pd.read_csv(merged_path)
for col in merged_df.columns:
df[col] = df.apply(lambda row: merged_df.iloc[int(row['neighbor id']), :][col], axis=1)
tooltips = ['neighbor id'] + ['image', 'question', 'answer', model]
print(df.head())
pts = alt.selection_point(encodings=['x'])
embeds = alt.Chart(df).mark_point(size=30, filled=True).encode(
alt.OpacityValue(0.5),
alt.X('x:Q', scale=alt.Scale(zero=False)),
alt.Y('y:Q', scale=alt.Scale(zero=False)),
alt.Color(f'{model}:Q'), #scale=alt.Scale(domain=[1, 0.5, 0], range=['blue', 'white', 'red'], interpolate='rgb')
alt.Size("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=[300, 500])),
alt.Shape("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=['circle', 'triangle'])),
alt.Order("is target:N"),
tooltip=tooltips,
).properties(
width=400,
height=400,
title=f"What are the tasks {model} is surprisingly {'good' if rank == 'top' else 'bad'} at compared to {num_neighbors} similar tasks?"
).transform_filter(
pts
)
bar = alt.Chart(df).mark_bar().encode(
alt.Y('mean(mean surprisingness):Q'),
alt.X('task id:N', sort=alt.EncodingSortField(field='mean surprisingness', order='descending')),
color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey")), #
).add_params(pts).properties(
width=400,
height=200,
)
chart = alt.hconcat(
bar,
embeds
).resolve_legend(
color="independent",
size="independent"
).configure_title(
fontSize=20
).configure_legend(
labelFontSize=10,
titleFontSize=10,
)
return chart
else:
print(sp_pkl, merged_path)
return None
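# A minimal sketch (not called by the app) of the linked-selection pattern used
# in plot_surprisingness above: clicking a bar in one chart filters the points
# shown in the other via a shared alt.selection_point. The toy data is
# hypothetical.
def _example_linked_selection():
    toy = pd.DataFrame({'task id': [0, 1, 2], 'x': [0.1, 0.5, 0.9],
                        'y': [0.2, 0.8, 0.4], 'score': [1.0, 0.0, 0.5]})
    pts = alt.selection_point(encodings=['x'])
    bars = alt.Chart(toy).mark_bar().encode(
        alt.X('task id:N'), alt.Y('score:Q'),
        color=alt.condition(pts, alt.ColorValue('steelblue'), alt.ColorValue('grey')),
    ).add_params(pts)  # clicking a bar updates the shared selection
    points = alt.Chart(toy).mark_point().encode(
        alt.X('x:Q'), alt.Y('y:Q'),
    ).transform_filter(pts)  # only points for the selected task remain
    return bars | points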
def plot_task_distribution(domain, partition, category):
domain = domain2folder[domain]
task_plan = pickle.load(open(f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl", "rb"))
task_plan.reset_index(inplace=True)
col_name = category
task_plan_cnt = task_plan.groupby(col_name)['index'].count().reset_index()
task_plan_cnt.rename(columns={'index': 'count'}, inplace=True)
task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(task_plan) * 100, 2)
base = alt.Chart(task_plan_cnt).encode(
alt.Theta("count:Q").stack(True),
alt.Color(f"{col_name}:N").legend(),
tooltip=[col_name, 'count', 'frequency (%)']
)
pie = base.mark_arc(outerRadius=120)
return pie
def plot_all(domain, partition, models, category1, category2, agg):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
if not os.path.exists(data_path):
return None
expand_df = pd.read_csv(data_path)
chart_df = expand_df[expand_df['model'].isin(models)]
if category2:
color_val = f'{agg}(score):Q'
chart = alt.Chart(chart_df).mark_rect().encode(
alt.X(f'{category1}:N', sort=alt.EncodingSortField(field='score', order='ascending', op=agg)),
alt.Y(f'{category2}:N', sort=alt.EncodingSortField(field='score', order='descending', op=agg)), # no title, no label angle),
alt.Color(color_val),
alt.Tooltip('score', aggregate=agg, title=f"{agg} score"),
).properties(
width=800,
height=200,
)
else:
category = "index" if category1 == "task id" else category1
# cat_options = list(chart_df[category].unique())
# cat_options = cat_options[:5]
y_val = f'{agg}(score):Q'
df = chart_df
# df = chart_df[chart_df[category].isin(cat_options)]
if len(models) > 1:
chart = alt.Chart(df).mark_bar().encode(
alt.X('model:N',
sort=alt.EncodingSortField(field=f'score', order='ascending', op=agg),
axis=alt.Axis(labels=False, tickSize=0, title=None)),
alt.Y(y_val, scale=alt.Scale(zero=True)),
alt.Color('model:N').legend(),
alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
).properties(
width=200,
height=100,
title=f"How do models perform across {category}?"
)
else:
chart = alt.Chart(df).mark_bar().encode(
alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op=agg)), # no title, no label angle),
alt.Y(y_val, scale=alt.Scale(zero=True)),
alt.Color(f'{category}:N').legend(None),
).properties(
width=200,
height=100,
title=f"How does {models[0]} perform across {category}?"
)
chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
labelFontSize=20,
titleFontSize=20,
).configure_legend(
labelFontSize=15,
titleFontSize=15,
)
return chart
def update_widgets(domain, partition, category, query_type):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
if not os.path.exists(data_path):
print("here?")
return [None] * 11
df = pd.read_csv(data_path)
max_k = len(df[category].unique()) if category and category != "task id" else len(df)
widgets = []
if query_type == "top k":
# aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", interactive=True, visible=True)
model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)'", multiselect=True, interactive=True, visible=True)
# model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate", interactive=True, visible=True)
model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False)
direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)
baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False)
md1 = gr.Markdown(r"
ranked by the
")
md2 = gr.Markdown(r"accuracy
")
md3 = gr.Markdown(r"")
elif query_type == "threshold":
# aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task aggregate", interactive=True, visible=True)
# aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)'", multiselect=True, interactive=True, visible=True)
direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True)
threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True)
# model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate", interactive=True, visible=True)
model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)
k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False)
baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False)
baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False)
md1 = gr.Markdown(r"where the
")
md2 = gr.Markdown(r"accuracy is
")
md3 = gr.Markdown(r"")
elif query_type == "model comparison":
model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)' accuracy", multiselect=True, interactive=True, visible=True)
baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)' accuracy", multiselect=True, interactive=True, visible=True)
direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True)
threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True)
model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
# baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over baselines)", interactive=True, visible=True)
baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
# aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task aggregate", interactive=True, visible=False)
rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)
k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False)
md1 = gr.Markdown(r"where the difference between the
")
md2 = gr.Markdown(r"is
")
md3 = gr.Markdown(r"and the
")
elif query_type == "model debugging":
model = gr.Dropdown(MODELS, value=MODELS[0], label="model's", multiselect=False, interactive=True, visible=True)
# aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", visible=False)
baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False)
direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)
rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)
k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False)
model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over models)", visible=False)
baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False)
md1 = gr.Markdown(r"where
")
md2 = gr.Markdown(r"mean accuracy is below its overall mean accuracy by one standard deviation
")
md3 = gr.Markdown(r"")
    else:
        return [None] * 11
    widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
return widgets
def select_tasks(domain, partition, category, query_type, task_agg, models, model_agg, rank, k, direction, threshold, baselines, baseline_agg):
domain = domain2folder[domain]
data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
merged_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
if not os.path.exists(data_path) or not os.path.exists(merged_path):
return gr.DataFrame(None)
df = pd.read_csv(data_path)
merged_df = pd.read_csv(merged_path)
task_plan = pickle.load(open(f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl", 'rb'))
task_plan.reset_index(inplace=True)
if not category or category == "task id":
category = 'index'
if query_type == "top k":
df = df[df['model'].isin(models)]
df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
df = df.groupby([category])['score'].agg(model_agg).reset_index()
df = df.sort_values(by='score', ascending=False)
if rank == "bottom":
df = df.iloc[-k:, :]
else:
df = df.iloc[:k, :]
elif query_type == "threshold":
df = df[df['model'].isin(models)]
df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
df = df.groupby([category])['score'].agg(model_agg).reset_index()
if direction == "below":
df = df[df['score'] <= threshold]
else:
df = df[df['score'] >= threshold]
elif query_type == "model comparison":
# df = merged_df
# df.reset_index(inplace=True)
# df = df.groupby([category])[[model, baseline]].agg(task_agg).reset_index()
# df = df[(df[model] - df[baseline] > threshold)]
df_baseline = deepcopy(df)
df = df[df['model'].isin(models)]
df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
df = df.groupby([category])['score'].agg(model_agg).reset_index()
model_str = ', '.join(models)
exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
df = df.sort_values(by=category)
df_baseline = df_baseline[df_baseline['model'].isin(baselines)]
df_baseline = df_baseline.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
df_baseline = df_baseline.groupby([category])['score'].agg(baseline_agg).reset_index()
model_str = ', '.join(baselines)
baseline_score_id = f'{baseline_agg}({model_str})' if len(baselines) > 1 else model_str
df_baseline = df_baseline.sort_values(by=category)
df.rename(columns={'score': exp_score_id}, inplace=True)
df_baseline.rename(columns={'score': baseline_score_id}, inplace=True)
df = pd.merge(df, df_baseline, on=category)
df = df[(df[exp_score_id] - df[baseline_score_id] > threshold)]
elif query_type == "model debugging":
model = models
print(models)
avg_acc = merged_df[model].mean()
std = merged_df[model].std()
t = avg_acc - std
df = df[df['model'] == model]
df = df.groupby(['model', category])['score'].agg(task_agg).reset_index()
df = df[df['score'] < t]
df['mean'] = round(avg_acc, 4)
df['std'] = round(std, 4)
print(df.head())
if category == 'index':
task_attrs = list(df[category])
selected_tasks = task_plan[task_plan[category].isin(task_attrs)]
if len(selected_tasks) == 0:
return gr.DataFrame(None, label="There is no such task.")
if query_type == "model comparison" and (models and baselines):
# selected_tasks[model] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][model].values[0], axis=1)
# selected_tasks[baseline] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][baseline].values[0], axis=1)
selected_tasks[exp_score_id] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][exp_score_id].values[0], axis=1)
selected_tasks[baseline_score_id] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][baseline_score_id].values[0], axis=1)
else:
selected_tasks['score'] = selected_tasks.apply(lambda row: df[df['index'] == row['index']]['score'].values[0], axis=1)
print(selected_tasks.head())
return gr.DataFrame(selected_tasks, label=f"There are {len(selected_tasks)} (out of {len(task_plan)}) tasks in total.")
else:
if len(df) == 0:
return gr.DataFrame(None, label=f"There is no such {category}.")
else:
return gr.DataFrame(df, label=f"The total number of such {category} is {len(df)}.")
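# A sketch of the two-stage aggregation used in `select_tasks` (not called by
# the app; the toy data is hypothetical): per-task scores are first reduced to
# one value per (category, model) with `task_agg`, then reduced across models
# with `model_agg`, yielding one score per category value.
def _example_two_stage_aggregation(task_agg="mean", model_agg="mean"):
    toy = pd.DataFrame({
        'shape': ['cube', 'cube', 'sphere', 'sphere'],
        'model': ['m1', 'm2', 'm1', 'm2'],
        'score': [0.9, 0.7, 0.4, 0.2],
    })
    per_model = toy.groupby(['shape', 'model'])['score'].agg(task_agg).reset_index()
    per_category = per_model.groupby(['shape'])['score'].agg(model_agg).reset_index()
    print(per_category)  # e.g. cube -> 0.8, sphere -> 0.3 for mean/mean
    return per_category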
def find_patterns(selected_tasks, num_patterns, models, baselines, model_agg, baseline_agg):
if len(selected_tasks) == 0:
return gr.DataFrame(None)
print(selected_tasks.head())
if 'score' in selected_tasks:
scores = selected_tasks['score']
# elif model in selected_tasks:
# scores = selected_tasks[model]
else:
scores = None
print(scores)
model_str = ', '.join(models)
exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
if baselines:
baseline_str = ', '.join(baselines)
baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str
tasks_only = selected_tasks
all_score_cols = ['score', exp_score_id]
if baselines:
all_score_cols += [baseline_score_id]
for name in all_score_cols:
if name in selected_tasks:
tasks_only = tasks_only.drop(name, axis=1)
results = find_frequent_patterns(k=num_patterns, df=tasks_only, scores=scores)
records = []
if scores is not None:
patterns, scores = results[0], results[1]
for pattern, score in zip(patterns, scores):
pattern_str = ""
for t in pattern[1]:
col_name, col_val = t
pattern_str += f"{col_name} = {col_val}, "
record = {'pattern': pattern_str[:-2], 'count': pattern[0], 'score': score} #{model}
records.append(record)
else:
patterns = results
for pattern in patterns:
pattern_str = ""
for t in pattern[1]:
col_name, col_val = t
pattern_str += f"{col_name} = {col_val}, "
record = {'pattern': pattern_str[:-2], 'count': pattern[0]}
records.append(record)
df = pd.DataFrame.from_records(records)
return gr.DataFrame(df)
def visualize_task_distribution(selected_tasks, col_name, model1, model2):
if not col_name:
return None
task_plan_cnt = selected_tasks.groupby(col_name)['index'].count().reset_index()
task_plan_cnt.rename(columns={'index': 'count'}, inplace=True)
task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(selected_tasks) * 100, 2)
print(task_plan_cnt.head())
tooltips = [col_name, 'count', 'frequency (%)']
base = alt.Chart(task_plan_cnt).encode(
alt.Theta("count:Q").stack(True),
alt.Color(f"{col_name}:N").legend(),
tooltip=tooltips
)
pie = base.mark_arc(outerRadius=120)
return pie
def plot_performance_for_selected_tasks(domain, partition, df, query_type, models, baselines, select_category, vis_category, task_agg, model_agg, baseline_agg, rank, direction, threshold):
domain = domain2folder[domain]
task_agg = "mean"
data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
    merged_data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
    if not os.path.exists(data_path) or not os.path.exists(merged_data_path) or len(df) == 0:
return None
select_tasks = select_category == "task id" and vis_category
if select_tasks: # select tasks
y_val = f'{task_agg}(score):Q'
else: # select task categories
y_val = f'score:Q'
if select_category == "task id":
select_category = "index"
print(df.head())
if query_type == "model comparison":
# re-format the data for plotting
model_str = ', '.join(models)
exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
baseline_str = ', '.join(baselines)
baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str
# other_cols = list(df.columns)
# other_cols.remove(select_category)
print(exp_score_id, baseline_score_id)
df = df.melt(id_vars=[select_category], value_vars=[exp_score_id, baseline_score_id])
df.rename(columns={'variable': 'model', 'value': 'score'}, inplace=True)
print(df.head())
if select_tasks:
            merged_df = pd.read_csv(merged_data_path)
df[vis_category] = df.apply(lambda row: merged_df[merged_df.index == row['index']][vis_category].values[0], axis=1)
num_columns = len(df['model'].unique()) * len(df[f'{vis_category}'].unique())
chart = alt.Chart(df).mark_bar().encode(
alt.X('model:N',
sort=alt.EncodingSortField(field=f'score', order='descending', op=task_agg),
axis=alt.Axis(labels=False, tickSize=0, title=None)),
alt.Y(y_val, scale=alt.Scale(zero=True), title="accuracy"),
alt.Color('model:N').legend(),
alt.Column(f'{vis_category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom', labelFontSize=20, titleFontSize=20,))
).properties(
width=num_columns * 30,
height=200,
title=f"How do models perform by {vis_category}?"
)
print(num_columns * 50)
else:
if query_type == "model debugging":
y_title = "accuracy"
plot_title = f"{models} performs worse than its (mean - std) on these {vis_category}s"
models = [models]
else:
model_str = ', '.join(models)
y_title = f"{model_agg} accuracy" if len(models) > 0 else "accuracy"
suffix = f"on these tasks (by {vis_category})" if select_category == "index" else f"on these {vis_category}s"
if query_type == "top k":
plot_title = f"The {model_agg} accuracy of {model_str} is the {'highest' if rank == 'top' else 'lowest'} " + suffix
elif query_type == "threshold":
plot_title = f"The {model_agg} accuracy of {model_str} is {direction} {threshold} " + suffix
if select_tasks:
expand_df = pd.read_csv(data_path)
task_ids = list(df['index'].unique())
# all_models = (models + baselines) if baselines else models
df = expand_df[(expand_df['model'].isin(models)) & (expand_df['task id'].isin(task_ids))]
num_columns = len(df[f'{vis_category}'].unique())
chart = alt.Chart(df).mark_bar().encode(
alt.X(f'{vis_category}:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op=task_agg), axis=alt.Axis(labelAngle=-20)), # no title, no label angle),
alt.Y(y_val, scale=alt.Scale(zero=True), title=y_title),
alt.Color(f'{vis_category}:N').legend(None),
).properties(
width=num_columns * 30,
height=200,
title=plot_title
)
chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
labelFontSize=20,
titleFontSize=20,
).configure_legend(
labelFontSize=20,
titleFontSize=20,
labelLimit=200,
)
return chart
def sync_vis_category(domain, partition, category):
domain = domain2folder[domain]
if category and category != "task id":
return [gr.Dropdown([category], value=category, label="by task metadata", interactive=False), gr.Dropdown([category], value=category, label="by task metadata", interactive=False)]
else:
data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
if os.path.exists(data_path):
data = pickle.load(open(data_path, 'rb'))
categories = list(data.columns)
return [gr.Dropdown(categories, value=categories[0], label="by task metadata", interactive=True), gr.Dropdown(categories, value=categories[0], label="by task metadata", interactive=True)]
else:
return [None, None]
def hide_fpm_and_dist_components(domain, partition, category):
domain = domain2folder[domain]
print(category)
if category and category != "task id":
num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", visible=False)
btn_pattern = gr.Button(value="Find patterns among tasks", visible=False)
table = gr.DataFrame({}, height=250, visible=False)
dist_chart = gr.Plot(visible=False)
col_name = gr.Dropdown([], value=None, label="by task metadata", visible=False)
btn_dist = gr.Button(value="Visualize task distribution", visible=False)
else:
data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
if os.path.exists(data_path):
data = pickle.load(open(data_path, 'rb'))
categories = list(data.columns)
col_name = gr.Dropdown(categories, value=categories[0], label="by task metadata", interactive=True, visible=True)
else:
col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True, visible=True)
num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", interactive=True, visible=True)
btn_pattern = gr.Button(value="Find patterns among tasks", interactive=True, visible=True)
table = gr.DataFrame({}, height=250, interactive=True, visible=True)
dist_chart = gr.Plot(visible=True)
btn_dist = gr.Button(value="Visualize task distribution", interactive=True, visible=True)
return [num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]
# domains = list_directories(BASE_DIR)
theme = gr.Theme.from_hub('sudeepshouche/minimalist')
theme.font = [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"]
theme.text_size = gr.themes.sizes.text_lg
demo = gr.Blocks(theme=theme, title="TaskVerse-UI")
with demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(r"")
        with gr.Column(scale=1):
            gr.Markdown(r"Welcome to TaskVerse-UI!")
        with gr.Column(scale=1):
            gr.Markdown(r"")
with gr.Tab("📊 Overview"):
        gr.Markdown(r"📊 Visualize the overall task distribution and model performance")
with gr.Row():
domain = gr.Radio(domains, label="scenario", scale=2)
partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
# domain.change(fn=update_partition, inputs=domain, outputs=partition)
        gr.Markdown(r"Overall task metadata distribution")
with gr.Row():
category = gr.Dropdown([], value=None, label="task metadata")
partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
with gr.Row():
output = gr.Plot()
with gr.Row():
btn = gr.Button(value="Plot")
btn.click(plot_task_distribution, [domain, partition, category], output)
        gr.Markdown(r"Models' overall performance by task metadata")
with gr.Row():
with gr.Column(scale=2):
models = gr.CheckboxGroup(MODELS, label="model(s)", value=MODELS)
with gr.Column(scale=1):
aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="aggregate models' accuracy by")
with gr.Row():
# with gr.Column(scale=1):
category1 = gr.Dropdown([], value=None, label="task metadata", interactive=True)
category2 = gr.Dropdown([], value=None, label="Optional: second task metadata", interactive=True)
partition.change(fn=update_category, inputs=[domain, partition], outputs=category1)
category1.change(fn=update_category2, inputs=[domain, partition, category1], outputs=category2)
domain.change(fn=update_partition_and_models, inputs=domain, outputs=[partition, models])
with gr.Row():
output = gr.Plot()
with gr.Row():
btn = gr.Button(value="Plot")
btn.click(plot_all, [domain, partition, models, category1, category2, aggregate], output)
# gr.Examples(["hello", "bonjour", "merhaba"], input_textbox)
with gr.Tab("✨ Embedding"):
        gr.Markdown(r"✨ Visualize the task embeddings in 2D space")
with gr.Row():
domain2 = gr.Radio(domains, label="scenario", scale=2)
# domain = gr.Dropdown(domains, value=domains[0], label="scenario")
partition2 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
category2 = gr.Dropdown([], value=None, label="colored by task metadata", scale=1)
domain2.change(fn=update_partition, inputs=domain2, outputs=partition2)
partition2.change(fn=update_category, inputs=[domain2, partition2], outputs=category2)
with gr.Row():
output2 = gr.Plot()
with gr.Row():
btn = gr.Button(value="Run")
btn.click(plot_embedding, [domain2, partition2, category2], output2)
with gr.Tab("❓ Query"):
        gr.Markdown(r"❓ Answer your queries by finding and visualizing the relevant tasks and model performance")
with gr.Row(equal_height=True):
domain = gr.Radio(domains, label="scenario", scale=2)
partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
with gr.Row():
query1 = "top k"
query2 = "threshold"
query3 = "model debugging"
query4 = "model comparison"
query_type = gr.Radio([query1, query2, query3, query4], value="top k", label=r"query type")
with gr.Row():
with gr.Accordion("See more details about the query type"):
            gr.Markdown(
                r"""
- Top k: Find the k tasks or task metadata that the model(s) perform the best or worst on
- Threshold: Find the tasks or task metadata where the model(s)' performance is greater or lower than a given threshold t
- Model debugging: Find the tasks or task metadata where a model performs significantly worse than its average performance (by one standard deviation)
- Model comparison: Find the tasks or task metadata where some model(s) perform better or worse than the baseline(s) by a given threshold t
"""
            )
with gr.Row():
gr.Markdown(r"Help me find the
")
with gr.Row(equal_height=True):
# with gr.Column(scale=1):
rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
# with gr.Column(scale=2):
k = gr.Slider(1, 10, 5 // 2, step=1.0, label="k", interactive=True, visible=True)
# with gr.Column(scale=2):
category = gr.Dropdown([], value=None, label="tasks / task metadata", interactive=True)
with gr.Row():
md1 = gr.Markdown(r"ranked by the
")
with gr.Row(equal_height=True):
# with gr.Column(scale=1, min_width=100):
# model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True, scale=1)
# with gr.Column(scale=8):
model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)", multiselect=True, interactive=True, visible=True, scale=2)
# with gr.Column(scale=1, min_width=100):
# aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True, scale=1)
with gr.Row():
md3 = gr.Markdown(r"")
with gr.Row(equal_height=True):
baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=False, scale=1)
baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)'", visible=False, scale=2)
# aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
# with gr.Column(scale=1, min_width=50):
with gr.Row():
md2 = gr.Markdown(r"accuracy
")
with gr.Row():
# baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over baselines)", visible=False)
direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)
widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
query_type.change(update_widgets, [domain, partition, category, query_type], widgets)
domain.change(fn=update_partition_and_models_and_baselines, inputs=domain, outputs=[partition, model, baseline])
with gr.Row():
df = gr.DataFrame({}, height=200)
btn = gr.Button(value="Find tasks / task metadata")
btn.click(select_tasks, [domain, partition, category, query_type, aggregate, model, model_aggregate, rank, k, direction, threshold, baseline, baseline_aggregate], df)
with gr.Row():
plot = gr.Plot()
with gr.Row():
col_name2 = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name2)
btn_plot = gr.Button(value="Plot model performance", interactive=True)
btn_plot.click(plot_performance_for_selected_tasks, [domain, partition, df, query_type, model, baseline, category, col_name2, aggregate, model_aggregate, baseline_aggregate, rank, direction, threshold], plot)
with gr.Row():
dist_chart = gr.Plot()
with gr.Row():
col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name)
btn_dist = gr.Button(value="Visualize task distribution", interactive=True)
btn_dist.click(visualize_task_distribution, [df, col_name, model, baseline], dist_chart)
with gr.Row():
table = gr.DataFrame({}, height=250)
with gr.Row():
num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns")
btn_pattern = gr.Button(value="Find patterns among tasks")
btn_pattern.click(find_patterns, [df, num_patterns, model, baseline], table)
category.change(fn=hide_fpm_and_dist_components, inputs=[domain, partition, category], outputs=[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart])
category.change(fn=sync_vis_category, inputs=[domain, partition, category], outputs=[col_name, col_name2])
category.change(fn=update_k, inputs=[domain, partition, category], outputs=k)
with gr.Tab("😮 Surprisingness"):
gr.Markdown(r"😮 Find out the tasks a model is surprisingly good or bad at compared to similar tasks
")
with gr.Row():
domain3 = gr.Radio(domains, label="scenario", scale=2)
partition3 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
with gr.Row():
model3 = gr.Dropdown(MODELS, value=MODELS[0], label="model", interactive=True, visible=True)
k3 = gr.Slider(1, 100, 50, step=1.0, label="number of surprising tasks", interactive=True)
num_neighbors = gr.Slider(1, 100, 50, step=1.0, label="number of neighbors", interactive=True)
rank3 = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
domain3.change(fn=update_partition_and_models, inputs=domain3, outputs=[partition3, model3])
# partition3.change(fn=update_k, inputs=[domain3, partition3], outputs=k3)
with gr.Row():
output3 = gr.Plot()
with gr.Row():
btn = gr.Button(value="Plot")
btn.click(plot_surprisingness, [domain3, partition3, model3, rank3, k3, num_neighbors], output3)
if __name__ == "__main__":
    demo.launch(share=True)