import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import random
import plotly.graph_objects as go

# Define constants
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"
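
# Per the MAJOR_/MINOR_ naming above, ">>" marks a strong preference and ">" a slight
# preference for the first- or second-position response; "A=B" marks a tie.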

GA_TRACKING_CODE = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-EVZ0R7014L"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-EVZ0R7014L');
</script>
"""


def is_consistent(rating, reverse_rating):
    """Return True if a judge's two ratings agree once the response positions are flipped.

    `rating` is the pairwise choice with the responses in the original order and
    `reverse_rating` is the choice with the positions swapped, so a consistent judge
    prefers the same underlying response (or ties) in both orderings.
    """
    if rating in {MAJOR_A_WIN, MINOR_A_WIN} and reverse_rating in {
        MAJOR_B_WIN,
        MINOR_B_WIN,
    }:
        return True
    if rating in {MAJOR_B_WIN, MINOR_B_WIN} and reverse_rating in {
        MAJOR_A_WIN,
        MINOR_A_WIN,
    }:
        return True
    if reverse_rating in {MAJOR_A_WIN, MINOR_A_WIN} and rating in {
        MAJOR_B_WIN,
        MINOR_B_WIN,
    }:
        return True
    if reverse_rating in {MAJOR_B_WIN, MINOR_B_WIN} and rating in {
        MAJOR_A_WIN,
        MINOR_A_WIN,
    }:
        return True
    if reverse_rating in {TIE} and rating in {TIE}:
        return True
    if reverse_rating in {TIE} and rating not in {TIE}:
        return False
    if rating in {TIE} and reverse_rating not in {TIE}:
        return False
    return False
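
# Illustrative checks (not tied to any dataset row): a judge is consistent when the same
# underlying response wins, or both orderings tie.
#   is_consistent("A>B", "B>>A")  -> True   (same underlying response wins both times)
#   is_consistent("A>B", "A>>B")  -> False  (the preferred side flips with the positions)
#   is_consistent("A=B", "A=B")   -> True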


# Function to convert PIL image to base64
def pil_to_base64(img):
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return img_str
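
# Example (illustrative): the returned string can be dropped straight into an HTML <img>
# tag, e.g. f'<img src="data:image/png;base64,{pil_to_base64(img)}"/>', which is how the
# analysis figures are centered below.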


def main():
    # Load your dataframes
    df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
    df_responses = pd.read_json("data/responses.jsonl", lines=True)
    df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
    df_leaderboard = (
        pd.read_csv("data/leaderboard_6_11.csv")
        .sort_values("Rank")
        .reset_index(drop=True)
    )
    df_leaderboard = df_leaderboard.rename(
        columns={"EI Score": "Council Arena EI Score (95% CI)"}
    )

    # Prepare the scenario selector options
    df_test_set["scenario_option"] = (
        df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
    )
    scenario_options = df_test_set["scenario_option"].tolist()

    # Prepare the model selector options
    model_options = df_responses["llm_responder"].unique().tolist()

    # Prepare the judge selector options
    judge_options = df_response_judging["llm_judge"].unique().tolist()

    st.set_page_config(
        page_title="Language Model Council", page_icon="🏛️", layout="wide"
    )

    # Custom CSS to center title and header
    center_css = """
    <style>
    h1, h2, h3, h6 {
        text-align: center;
    }
    </style>
    """

    # Add the Google Analytics tracking code to the Streamlit app
    st.markdown(GA_TRACKING_CODE, unsafe_allow_html=True)
    st.markdown(center_css, unsafe_allow_html=True)

    # Title and subtitle.
    st.title("Language Model Council")
    st.markdown(
        "### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
    )
    st.markdown(
        "###### [Justin Zhao](https://www.justinxzhao.com/)¹, [Flor Miriam Plaza-del-Arco](https://fmplaza.github.io/)², [Amanda Cercas Curry](https://amandacurry.github.io/)²"
    )
    st.markdown("###### ¹ Predibase, ² Bocconi University")

    # Create spacer and button columns for the external links.
    _, col1, col2, col3, col4, _ = st.columns([0.3, 0.1, 0.1, 0.1, 0.1, 0.3])
    with col1:
        st.link_button(
            "Data",
            "https://huggingface.co/datasets/llm-council/emotional_application",
            use_container_width=True,
            type="primary",
        )
    with col2:
        st.link_button(
            "Paper",
            "https://arxiv.org/abs/2406.08598",
            use_container_width=True,
            type="primary",
        )
    with col3:
        st.link_button(
            "Github",
            "https://github.com/llm-council/llm-council",
            use_container_width=True,
            type="primary",
        )
    with col4:
        st.link_button(
            "Website",
            "https://llm-council.com/",
            use_container_width=True,
            type="primary",
        )

    # Render hero image.
    with open("img/hero.svg", "r") as file:
        svg_content = file.read()
    left_co, cent_co, last_co = st.columns([0.2, 0.6, 0.2])
    with cent_co:
        st.image(svg_content, use_column_width=True)
    with cent_co.expander("Abstract"):
        st.markdown(
            """The rapid advancement of Large Language Models (LLMs) necessitates robust
and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based
on how well their responses align with human preferences. However, many tasks
such as those related to emotional intelligence, creative writing, or persuasiveness,
are highly subjective and often lack majoritarian human agreement. Judges may
have irreconcilable disagreements about what constitutes a better response. To
address the challenge of ranking LLMs on highly subjective tasks, we propose
a novel benchmarking framework, the Language Model Council (LMC). The
LMC operates through a democratic process to: 1) formulate a test set through
equal participation, 2) administer the test among council members, and 3) evaluate
responses as a collective jury. We deploy a council of 20 newest LLMs on an
open-ended emotional intelligence task: responding to interpersonal dilemmas.
Our results show that the LMC produces rankings that are more separable, robust,
and less biased than those from any individual LLM judge, and is more consistent
with a human-established leaderboard compared to other benchmarks."""
        )

    st.markdown(
        "This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
    )

    # Create horizontal tabs
    tabs = st.tabs(
        [
            "Leaderboard Results",
            "Browse Data",
            "Analysis",
            "About Us",
        ]
    )

    # Define content for each tab
    with tabs[0]:
        _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
        mid_column.markdown("#### Leaderboard Graph")

        df = df_leaderboard.copy()
        # The CI column appears to be formatted like "56.3 (-1.2, +1.5)"; split it into
        # numeric score, lower error, and upper error columns for plotting.
        df["Score"] = df["Council Arena EI Score (95% CI)"].apply(
            lambda x: float(x.split(" ")[0])
        )
        df["Lower"] = df["Council Arena EI Score (95% CI)"].apply(
            lambda x: float(x.split(" ")[1][1:-1])
        )
        df["Upper"] = df["Council Arena EI Score (95% CI)"].apply(
            lambda x: float(x.split(" ")[2][:-1])
        )

        # Sort the DataFrame by Score in descending order
        df = df.sort_values(by="Score", ascending=False)

        # Create the bar chart
        fig = go.Figure()

        # Generate rainbow colors
        num_bars = len(df)
        colors = [
            f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)
        ]

        fig.add_trace(
            go.Bar(
                x=df["Score"],
                y=df["LLM"],
                orientation="h",
                error_x=dict(
                    type="data",
                    array=df["Upper"],
                    arrayminus=-1 * df["Lower"],
                    thickness=0.5,
                    width=3,
                    color="black",
                ),
                marker=dict(color=colors, opacity=0.8),
            )
        )
        fig.update_layout(
            xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True),
            yaxis_title="LLM",
            yaxis=dict(autorange="reversed"),
            template="presentation",
            width=1000,
            height=700,
        )

        # Display the plot in Streamlit
        mid_column.plotly_chart(fig)

        mid_column.divider()
        mid_column.markdown("#### Leaderboard Table")

        # Display the table.
        mid_column.dataframe(df_leaderboard, hide_index=True)

    # HTML and CSS to create a text box with specified color
    def colored_text_box(text, background_color, text_color="black"):
        html_code = f"""
        <div style="
            background-color: {background_color};
            color: {text_color};
            padding: 10px;
            border-radius: 10px;
        ">
            {text}
        </div>
        """
        return html_code

    # Initialize session state variables if they do not exist.
    if "selected_scenario" not in st.session_state:
        st.session_state.selected_scenario = None
    if "selected_model" not in st.session_state:
        st.session_state.selected_model = None
    if "selected_judge" not in st.session_state:
        st.session_state.selected_judge = None

    # Define callback functions to update session state
    def update_scenario():
        st.session_state.selected_scenario = st.session_state.scenario_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def update_judge():
        st.session_state.selected_judge = st.session_state.judge_selector

    def randomize_selection():
        st.session_state.selected_scenario = random.choice(scenario_options)
        st.session_state.selected_model = random.choice(model_options)
        st.session_state.selected_judge = random.choice(judge_options)
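
    # Note on state handling: each selectbox below registers a `key` plus an `on_change`
    # callback that copies the widget value into st.session_state, while the `index=...`
    # argument re-seeds the widget from session state. This is what lets the Randomize!
    # button drive all three selectors at once.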

    with tabs[1]:
        # Add a randomize button at the top of the tab.
        _, mid_column, _ = st.columns([0.4, 0.2, 0.4])
        mid_column.button(
            ":game_die: Randomize!",
            on_click=randomize_selection,
            type="primary",
            use_container_width=True,
        )

        st.markdown("#### 1. Select a scenario.")

        # Create the selectors
        st.session_state.selected_scenario = st.selectbox(
            "Select Scenario",
            scenario_options,
            label_visibility="hidden",
            key="scenario_selector",
            on_change=update_scenario,
            index=(
                scenario_options.index(st.session_state.selected_scenario)
                if st.session_state.selected_scenario
                else 0
            ),
        )

        # Get the selected scenario details
        if st.session_state.selected_scenario:
            selected_emobench_id = int(
                st.session_state.selected_scenario.split(": ")[0]
            )
            scenario_details = df_test_set[
                df_test_set["emobench_id"] == selected_emobench_id
            ].iloc[0]

            # Display the detailed dilemma and additional information
            st.markdown(
                colored_text_box(
                    scenario_details["detailed_dilemma"],
                    "#01204E",
                    "white",
                ),
                unsafe_allow_html=True,
            )
            with st.expander("Additional Information"):
                st.write(
                    {
                        "LLM Author": scenario_details["llm_author"],
                        "Problem": scenario_details["problem"],
                        "Relationship": scenario_details["relationship"],
                        "Scenario": scenario_details["scenario"],
                    }
                )

        st.divider()

        st.markdown("#### 2. View responses.")

        # Create two columns for model selectors
        col1, col2 = st.columns(2)
        with col1:
            fixed_model = "qwen1.5-32B-Chat"
            st.selectbox(
                "Select Model",
                [fixed_model],
                key="fixed_model",
                label_visibility="hidden",
            )

            # Get the response string for the fixed model
            if st.session_state.selected_scenario:
                response_details_fixed = df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ].iloc[0]

                # Display the response string
                st.markdown(
                    colored_text_box(
                        response_details_fixed["response_string"],
                        "#028391",
                        "white",
                    ),
                    unsafe_allow_html=True,
                )

        with col2:
            st.session_state.selected_model = st.selectbox(
                "Select Model",
                model_options,
                key="model_selector",
                on_change=update_model,
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model
                    else 0
                ),
            )

            # Get the response string for the selected model
            if st.session_state.selected_model and st.session_state.selected_scenario:
                response_details_dynamic = df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (
                        df_responses["llm_responder"]
                        == st.session_state.selected_model
                    )
                ].iloc[0]

                # Display the response string
                st.markdown(
                    colored_text_box(
                        response_details_dynamic["response_string"],
                        "#028391",
                        "white",
                    ),
                    unsafe_allow_html=True,
                )

        st.divider()

        st.markdown("#### 3. Response judging.")
        st.markdown("##### All council members")

        col1, col2 = st.columns(2)
        with col1:
            st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**")
            pairwise_counts_left = df_response_judging[
                (df_response_judging["first_completion_by"] == fixed_model)
                & (
                    df_response_judging["second_completion_by"]
                    == st.session_state.selected_model
                )
            ]["pairwise_choice"].value_counts()
            st.bar_chart(pairwise_counts_left)

        with col2:
            st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**")
            pairwise_counts_right = df_response_judging[
                (
                    df_response_judging["first_completion_by"]
                    == st.session_state.selected_model
                )
                & (df_response_judging["second_completion_by"] == fixed_model)
            ]["pairwise_choice"].value_counts()
            st.bar_chart(pairwise_counts_right)

        # Create the llm_judge selector
        st.markdown("##### Individual LLM judges")
        st.session_state.selected_judge = st.selectbox(
            "Select Judge",
            judge_options,
            label_visibility="hidden",
            key="judge_selector",
            on_change=update_judge,
            index=(
                judge_options.index(st.session_state.selected_judge)
                if st.session_state.selected_judge
                else 0
            ),
        )

        # Get the judging details for the selected judge and models
        if st.session_state.selected_judge and st.session_state.selected_scenario:
            col1, col2 = st.columns(2)

            judging_details_left = df_response_judging[
                (df_response_judging["llm_judge"] == st.session_state.selected_judge)
                & (df_response_judging["first_completion_by"] == fixed_model)
                & (
                    df_response_judging["second_completion_by"]
                    == st.session_state.selected_model
                )
            ].iloc[0]
            judging_details_right = df_response_judging[
                (df_response_judging["llm_judge"] == st.session_state.selected_judge)
                & (
                    df_response_judging["first_completion_by"]
                    == st.session_state.selected_model
                )
                & (df_response_judging["second_completion_by"] == fixed_model)
            ].iloc[0]

            # Render consistency.
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success(
                    f"{st.session_state.selected_judge} as a judge was consistent on this example with positions flipped.",
                    icon="✅",
                )
            else:
                st.warning(
                    f"{st.session_state.selected_judge} as a judge was inconsistent on this example with positions flipped.",
                    icon="⚠️",
                )

            # Display the judging details
            with col1:
                if not judging_details_left.empty:
                    st.write(
                        f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                    )
                    st.markdown(
                        colored_text_box(
                            judging_details_left["judging_response_string"],
                            "#FEAE6F",
                            "black",
                        ),
                        unsafe_allow_html=True,
                    )
                else:
                    st.write("No judging details found for the selected combination.")

            with col2:
                if not judging_details_right.empty:
                    st.write(
                        f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                    )
                    st.markdown(
                        colored_text_box(
                            judging_details_right["judging_response_string"],
                            "#FEAE6F",
                            "black",
                        ),
                        unsafe_allow_html=True,
                    )
                else:
                    st.write("No judging details found for the selected combination.")

    with tabs[2]:
        st.markdown("### Battles (Respondent vs. Respondent)")
        st.markdown("###### Expected win rates based on Bradley-Terry coefficients")
        image = Image.open("img/llm_vs_llm_win_rates.png")
        img_base64 = pil_to_base64(image)
        centered_image_html = f"""
        <div style="text-align: center;">
            <img src="data:image/png;base64,{img_base64}" width="1000"/>
        </div>
        """
        st.markdown(centered_image_html, unsafe_allow_html=True)

        st.divider()

        st.markdown("### Affinities (Judge vs. Respondent)")
        st.markdown("###### Raw affinities")
        image = Image.open("img/raw.png")
        img_base64 = pil_to_base64(image)
        centered_image_html = f"""
        <div style="text-align: center;">
            <img src="data:image/png;base64,{img_base64}" width="1000"/>
        </div>
        """
        st.markdown(centered_image_html, unsafe_allow_html=True)

        # Some extra space.
        st.text("")
        st.text("")
        st.text("")

        st.markdown("###### Council-normalized affinities")
        image = Image.open("img/council_normalized.png")
        img_base64 = pil_to_base64(image)
        centered_image_html = f"""
        <div style="text-align: center;">
            <img src="data:image/png;base64,{img_base64}" width="1000"/>
        </div>
        """
        st.markdown(centered_image_html, unsafe_allow_html=True)

        st.divider()

        st.markdown("### Agreement (Judge vs. Judge)")
        st.markdown("###### Sidewise Cohen's kappa")
        image = Image.open("img/judge_agreement.sidewise_cohen_kappa.png")
        img_base64 = pil_to_base64(image)
        centered_image_html = f"""
        <div style="text-align: center;">
            <img src="data:image/png;base64,{img_base64}" width="1000"/>
        </div>
        """
        st.markdown(centered_image_html, unsafe_allow_html=True)

        st.write("Check out the paper for more detailed analysis!")

    with tabs[-1]:
        st.markdown(
            """**Motivation**:

Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often make their own claims about being the best at something, citing their position on a benchmark or a leaderboard. But what if we let the models themselves decide who's the best?

**Main collaborators**:

- [Justin Zhao](https://x.com/justinxzhao)
- [Flor Plaza](https://x.com/florplaza22)
- [Sam Paech](https://x.com/sam_paech)
- [Federico Bianchi](https://x.com/federicobianchy)
- [Sahand Sabour](https://x.com/SahandSabour)
- [Amanda Cercas Curry](https://x.com/CurriedAmanda)
"""
        )

        # st.markdown("#### Citation")
        with st.expander("Citation"):
            st.write(
                "Please cite the following paper if you find our leaderboard, dataset, or framework helpful."
            )
            st.code(
                """@misc{zhao2024council,
    Title = {Language Model Council: Benchmarking Foundation Models on Highly Subjective Tasks by Consensus},
    Author = {Justin Zhao and Flor Miriam Plaza-del-Arco and Amanda Cercas Curry},
    Year = {2024},
    Eprint = {arXiv:2406.08598},
}"""
            )


if __name__ == "__main__":
    main()