import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import random
import plotly.graph_objects as go

# Define constants.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

GA_TRACKING_CODE = """
"""


def is_consistent(rating, reverse_rating):
    """Return True if a judge's rating is consistent when the response positions are flipped."""
    if rating in {MAJOR_A_WIN, MINOR_A_WIN} and reverse_rating in {
        MAJOR_B_WIN,
        MINOR_B_WIN,
    }:
        return True
    if rating in {MAJOR_B_WIN, MINOR_B_WIN} and reverse_rating in {
        MAJOR_A_WIN,
        MINOR_A_WIN,
    }:
        return True
    if rating == TIE and reverse_rating == TIE:
        return True
    return False


# Function to convert a PIL image to a base64-encoded string.
def pil_to_base64(img):
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return img_str


def main():
    # Load the dataframes.
    df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
    df_responses = pd.read_json("data/responses.jsonl", lines=True)
    df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
    df_leaderboard = (
        pd.read_csv("data/leaderboard_6_11.csv")
        .sort_values("Rank")
        .reset_index(drop=True)
    )
    df_leaderboard = df_leaderboard.rename(
        columns={"EI Score": "Council Arena EI Score (95% CI)"}
    )

    # Prepare the scenario selector options.
    df_test_set["scenario_option"] = (
        df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
    )
    scenario_options = df_test_set["scenario_option"].tolist()

    # Prepare the model selector options.
    model_options = df_responses["llm_responder"].unique().tolist()

    # Prepare the judge selector options.
    judge_options = df_response_judging["llm_judge"].unique().tolist()

    st.set_page_config(
        page_title="Language Model Council", page_icon="🏛️", layout="wide"
    )

    # Custom CSS to center the title and header.
    center_css = """
    """

    # Add the Google Analytics tracking code to the Streamlit app.
    st.markdown(GA_TRACKING_CODE, unsafe_allow_html=True)
    st.markdown(center_css, unsafe_allow_html=True)

    # Title and subtitle.
    st.title("Language Model Council")
    st.markdown(
        "### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
    )
    st.markdown(
        "###### [Justin Zhao](https://www.justinxzhao.com/)¹, [Flor Miriam Plaza-del-Arco](https://fmplaza.github.io/)², [Amanda Cercas Curry](https://amandacurry.github.io/)²"
    )
    st.markdown("###### ¹ Predibase, ² Bocconi University")

    # Create a centered row of link buttons.
    _, col1, col2, col3, col4, _ = st.columns([0.3, 0.1, 0.1, 0.1, 0.1, 0.3])
    with col1:
        st.link_button(
            "Data",
            "https://huggingface.co/datasets/llm-council/emotional_application",
            use_container_width=True,
            type="primary",
        )
    with col2:
        st.link_button(
            "Paper",
            "https://arxiv.org/abs/2406.08598",
            use_container_width=True,
            type="primary",
        )
    with col3:
        st.link_button(
            "Github",
            "https://github.com/llm-council/llm-council",
            use_container_width=True,
            type="primary",
        )
    with col4:
        st.link_button(
            "Website",
            "https://llm-council.com/",
            use_container_width=True,
            type="primary",
        )

    # Render hero image.
with open("img/hero.svg", "r") as file: svg_content = file.read() left_co, cent_co, last_co = st.columns([0.2, 0.6, 0.2]) with cent_co: st.image(svg_content, use_column_width=True) with cent_co.expander("Abstract"): st.markdown( """The rapid advancement of Large Language Models (LLMs) necessitates robust and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based on how well their responses align with human preferences. However, many tasks such as those related to emotional intelligence, creative writing, or persuasiveness, are highly subjective and often lack majoritarian human agreement. Judges may have irreconcilable disagreements about what constitutes a better response. To address the challenge of ranking LLMs on highly subjective tasks, we propose a novel benchmarking framework, the Language Model Council (LMC). The LMC operates through a democratic process to: 1) formulate a test set through equal participation, 2) administer the test among council members, and 3) evaluate responses as a collective jury. We deploy a council of 20 newest LLMs on an open-ended emotional intelligence task: responding to interpersonal dilemmas. Our results show that the LMC produces rankings that are more separable, robust, and less biased than those from any individual LLM judge, and is more consistent with a human-established leaderboard compared to other benchmarks.""" ) st.markdown( "This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**." ) # Create horizontal tabs tabs = st.tabs( [ "Leaderboard Results", "Browse Data", "Analysis", "About Us", ] ) # Define content for each tab with tabs[0]: _, mid_column, _ = st.columns([0.2, 0.6, 0.2]) mid_column.markdown("#### Leaderboard Graph") df = df_leaderboard.copy() df["Score"] = df["Council Arena EI Score (95% CI)"].apply( lambda x: float(x.split(" ")[0]) ) df["Lower"] = df["Council Arena EI Score (95% CI)"].apply( lambda x: float(x.split(" ")[1][1:-1]) ) df["Upper"] = df["Council Arena EI Score (95% CI)"].apply( lambda x: float(x.split(" ")[2][:-1]) ) # Sort the DataFrame by Score in descending order df = df.sort_values(by="Score", ascending=False) # Create the bar chart fig = go.Figure() # Generate rainbow colors num_bars = len(df) colors = [f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)] fig.add_trace( go.Bar( x=df["Score"], y=df["LLM"], orientation="h", error_x=dict( type="data", array=df["Upper"], arrayminus=-1 * df["Lower"], thickness=0.5, width=3, color="black", ), marker=dict(color=colors, opacity=0.8), ) ) fig.update_layout( xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True), yaxis_title="LLM", yaxis=dict(autorange="reversed"), template="presentation", width=1000, height=700, ) # Display the plot in Streamlit mid_column.plotly_chart(fig) mid_column.divider() mid_column.markdown("#### Leaderboard Table") # Display the table. mid_column.dataframe(df_leaderboard, hide_index=True) # HTML and CSS to create a text box with specified color def colored_text_box(text, background_color, text_color="black"): html_code = f"""
        <div style="background-color: {background_color}; color: {text_color}; padding: 15px; border-radius: 8px;">
            {text}
        </div>
        """
        return html_code

    # Initialize session state variables if they do not exist.
    if "selected_scenario" not in st.session_state:
        st.session_state.selected_scenario = None
    if "selected_model" not in st.session_state:
        st.session_state.selected_model = None
    if "selected_judge" not in st.session_state:
        st.session_state.selected_judge = None

    # Define callback functions to update session state.
    def update_scenario():
        st.session_state.selected_scenario = st.session_state.scenario_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def update_judge():
        st.session_state.selected_judge = st.session_state.judge_selector

    def randomize_selection():
        st.session_state.selected_scenario = random.choice(scenario_options)
        st.session_state.selected_model = random.choice(model_options)
        st.session_state.selected_judge = random.choice(judge_options)

    with tabs[1]:
        # Add a randomize button at the top of the tab.
        _, mid_column, _ = st.columns([0.4, 0.2, 0.4])
        mid_column.button(
            ":game_die: Randomize!",
            on_click=randomize_selection,
            type="primary",
            use_container_width=True,
        )

        st.markdown("#### 1. Select a scenario.")

        # Create the scenario selector.
        st.session_state.selected_scenario = st.selectbox(
            "Select Scenario",
            scenario_options,
            label_visibility="hidden",
            key="scenario_selector",
            on_change=update_scenario,
            index=(
                scenario_options.index(st.session_state.selected_scenario)
                if st.session_state.selected_scenario
                else 0
            ),
        )

        # Get the selected scenario details.
        if st.session_state.selected_scenario:
            selected_emobench_id = int(
                st.session_state.selected_scenario.split(": ")[0]
            )
            scenario_details = df_test_set[
                df_test_set["emobench_id"] == selected_emobench_id
            ].iloc[0]

            # Display the detailed dilemma and additional information.
            st.markdown(
                colored_text_box(
                    scenario_details["detailed_dilemma"],
                    "#01204E",
                    "white",
                ),
                unsafe_allow_html=True,
            )
            with st.expander("Additional Information"):
                st.write(
                    {
                        "LLM Author": scenario_details["llm_author"],
                        "Problem": scenario_details["problem"],
                        "Relationship": scenario_details["relationship"],
                        "Scenario": scenario_details["scenario"],
                    }
                )

        st.divider()

        st.markdown("#### 2. View responses.")

        # Create two columns for model selectors.
        col1, col2 = st.columns(2)
        with col1:
            fixed_model = "qwen1.5-32B-Chat"
            st.selectbox(
                "Select Model",
                [fixed_model],
                key="fixed_model",
                label_visibility="hidden",
            )

            # Get the response string for the fixed model.
            if st.session_state.selected_scenario:
                response_details_fixed = df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ].iloc[0]

                # Display the response string.
                st.markdown(
                    colored_text_box(
                        response_details_fixed["response_string"],
                        "#028391",
                        "white",
                    ),
                    unsafe_allow_html=True,
                )

        with col2:
            st.session_state.selected_model = st.selectbox(
                "Select Model",
                model_options,
                key="model_selector",
                on_change=update_model,
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model
                    else 0
                ),
            )

            # Get the response string for the selected model.
            if st.session_state.selected_model and st.session_state.selected_scenario:
                response_details_dynamic = df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == st.session_state.selected_model)
                ].iloc[0]

                # Display the response string.
                st.markdown(
                    colored_text_box(
                        response_details_dynamic["response_string"],
                        "#028391",
                        "white",
                    ),
                    unsafe_allow_html=True,
                )

        st.divider()

        st.markdown("#### 3. Response judging.")
Response judging.") st.markdown("##### All council members") col1, col2 = st.columns(2) with col1: st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**") pairwise_counts_left = df_response_judging[ (df_response_judging["first_completion_by"] == fixed_model) & ( df_response_judging["second_completion_by"] == st.session_state.selected_model ) ]["pairwise_choice"].value_counts() st.bar_chart(pairwise_counts_left) with col2: st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**") pairwise_counts_right = df_response_judging[ ( df_response_judging["first_completion_by"] == st.session_state.selected_model ) & (df_response_judging["second_completion_by"] == fixed_model) ]["pairwise_choice"].value_counts() st.bar_chart(pairwise_counts_right) # Create the llm_judge selector st.markdown("##### Individual LLM judges") st.session_state.selected_judge = st.selectbox( "Select Judge", judge_options, label_visibility="hidden", key="judge_selector", on_change=update_judge, index=( judge_options.index(st.session_state.selected_judge) if st.session_state.selected_judge else 0 ), ) # Get the judging details for the selected judge and models if st.session_state.selected_judge and st.session_state.selected_scenario: col1, col2 = st.columns(2) judging_details_left = df_response_judging[ (df_response_judging["llm_judge"] == st.session_state.selected_judge) & (df_response_judging["first_completion_by"] == fixed_model) & ( df_response_judging["second_completion_by"] == st.session_state.selected_model ) ].iloc[0] judging_details_right = df_response_judging[ (df_response_judging["llm_judge"] == st.session_state.selected_judge) & ( df_response_judging["first_completion_by"] == st.session_state.selected_model ) & (df_response_judging["second_completion_by"] == fixed_model) ].iloc[0] # Render consistency. if is_consistent( judging_details_left["pairwise_choice"], judging_details_right["pairwise_choice"], ): st.success( f"{st.session_state.selected_judge} as a judge was consistent on this example with positions flipped.", icon="✅", ) else: st.warning( f"{st.session_state.selected_judge} as a judge was inconsistent on this example with positions flipped.", icon="⚠️", ) # Display the judging details with col1: if not judging_details_left.empty: st.write( f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}" ) st.markdown( colored_text_box( judging_details_left["judging_response_string"], "#FEAE6F", "black", ), unsafe_allow_html=True, ) else: st.write("No judging details found for the selected combination.") with col2: if not judging_details_right.empty: st.write( f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}" ) st.markdown( colored_text_box( judging_details_right["judging_response_string"], "#FEAE6F", "black", ), unsafe_allow_html=True, ) else: st.write("No judging details found for the selected combination.") with tabs[2]: st.markdown("### Battles (Respondent vs. Respondent)") st.markdown("###### Expected win rates based on Terry-Bradley coefficients") image = Image.open("img/llm_vs_llm_win_rates.png") img_base64 = pil_to_base64(image) centered_image_html = f"""
""" st.markdown(centered_image_html, unsafe_allow_html=True) st.divider() st.markdown("### Affinities (Judge vs. Respondent)") st.markdown("###### Raw affinities") image = Image.open("img/raw.png") img_base64 = pil_to_base64(image) centered_image_html = f"""
""" st.markdown(centered_image_html, unsafe_allow_html=True) # Some extra space. st.text("") st.text("") st.text("") st.markdown("###### Council-Normalized") image = Image.open("img/council_normalized.png") img_base64 = pil_to_base64(image) centered_image_html = f"""
""" st.markdown(centered_image_html, unsafe_allow_html=True) st.divider() st.markdown("### Agreement (Judge vs. Judge)") st.markdown("###### Sidewise Cohen's Kappa:") image = Image.open("img/judge_agreement.sidewise_cohen_kappa.png") img_base64 = pil_to_base64(image) centered_image_html = f"""
""" st.markdown(centered_image_html, unsafe_allow_html=True) st.write("Check out the paper for more detailed analysis!") with tabs[-1]: st.markdown( """**Motivation**: Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often make their own claims about being the best at something, often citing its position on a benchmark or a leaderboard. But what if we let the models themselves decide who's the best? **Main collaborators**: - [Justin Zhao](https://x.com/justinxzhao) - [Flor Plaza](https://x.com/florplaza22) - [Sam Paech](https://x.com/sam_paech) - [Federico Bianchi](https://x.com/federicobianchy) - [Sahand Sabour](https://x.com/SahandSabour) - [Amanda Cercas Curry](https://x.com/CurriedAmanda) """ ) # st.markdown("#### Citation") with st.expander("Citation"): st.write( "Please cite the following paper if you find our leaderboard, dataset, or framework helpful." ) st.code( """@misc{zhao2024council, Title = {Language Model Council: Benchmarking Foundation Models on Highly Subjective Tasks by Consensus}, Author = {Justin Zhao and Flor Miriam Plaza-del-Arco and Amanda Cercas Curry}, Year = {2024} Eprint = {arXiv:2406.08598}, }""" ) if __name__ == "__main__": main()