"""Data loaders that turn per-model evaluation JSON into leaderboard tables.

The module holds the display-name/URL lookup tables, a base loader that
derives the keyword ("super group") and model-group structure from the loaded
data, and two concrete loaders that read the ``Default`` and ``SI`` result
directories.
"""

import html
import json
from typing import Any, Callable, Dict, List, Tuple

import pandas as pd

# Internal model identifier -> name shown in the leaderboard.
MODEL_NAME_MAP = {
    "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
    "GPT_4o": "GPT-4o (0513)",
    "Claude_3.5": "Claude-3.5-Sonnet (0622)",
    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
    "InternVL2_76B": "InternVL2-Llama3-76B",
    "Qwen2_VL_72B": "Qwen2-VL-72B",
    "llava_onevision_72B": "Llava-OneVision-72B",
    "NVLM": "NVLM-D-72B",
    "GPT_4o_mini": "GPT-4o mini",
    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
    "Pixtral_12B": "Pixtral 12B",
    "Aria": "Aria-MoE-25B",
    "Qwen2_VL_7B": "Qwen2-VL-7B",
    "InternVL2_8B": "InternVL2-8B",
    "llava_onevision_7B": "Llava-OneVision-7B",
    "Llama_3_2_11B": "Llama-3.2-11B",
    "Phi-3.5-vision": "Phi-3.5-Vision",
    "MiniCPM_v2.6": "MiniCPM-V2.6",
    "Idefics3": "Idefics3-8B-Llama3",
    "Aquila_VL_2B": "Aquila-VL-2B-llava-qwen",
    "POINTS_7B": "POINTS-Qwen2.5-7B",
    "Qwen2_VL_2B": "Qwen2-VL-2B",
    "InternVL2_2B": "InternVL2-2B",
    "Molmo_7B_D": "Molmo-7B-D-0924",
    "Molmo_72B": "Molmo-72B-0924",
}

# Dimension key used in the stats JSON -> dimension name shown in the UI.
DIMENSION_NAME_MAP = {
    "skills": "Skills",
    "input_format": "Input Format",
    "output_format": "Output Format",
    "input_num": "Visual Input Number",
    "app": "Application",
}

# Keyword used in the stats JSON -> short label used as a column header.
KEYWORD_NAME_MAP = {
    # Skills
    "Object Recognition and Classification": "Object Recognition",
    "Text Recognition (OCR)": "OCR",
    "Language Understanding and Generation": "Language",
    "Scene and Event Understanding": "Scene/Event",
    "Mathematical and Logical Reasoning": "Math/Logic",
    "Commonsense and Social Reasoning": "Commonsense",
    "Ethical and Safety Reasoning": "Ethics/Safety",
    "Domain-Specific Knowledge and Skills": "Domain-Specific",
    "Spatial and Temporal Reasoning": "Spatial/Temporal",
    "Planning and Decision Making": "Planning/Decision",
    # Input Format
    "User Interface Screenshots": "UI related",
    "Text-Based Images and Documents": "Documents",
    "Diagrams and Data Visualizations": "Infographics",
    "Videos": "Videos",
    "Artistic and Creative Content": "Arts/Creative",
    "Photographs": "Photographs",
    "3D Models and Aerial Imagery": "3D related",
    # Application
    "Information_Extraction": "Info Extraction",
    "Planning": "Planning",
    "Coding": "Coding",
    "Perception": "Perception",
    "Metrics": "Metrics",
    "Science": "Science",
    "Knowledge": "Knowledge",
    "Mathematics": "Math",
    # Output format
    "contextual_formatted_text": "Contexual",
    "structured_output": "Structured",
    "exact_text": "Exact",
    "numerical_data": "Numerical",
    "open_ended_output": "Open-ended",
    "multiple_choice": "MC",
    # Visual input number
    "6-8 images": "6-8 imgs",
    "1-image": "1 img",
    "2-3 images": "2-3 imgs",
    "4-5 images": "4-5 imgs",
    "9-image or more": "9+ imgs",
    "video": "Video",
}

# Internal model identifier -> page linked from the model's leaderboard entry.
MODEL_URLS = {
    "Claude_3.5_new": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "GPT_4o": "https://platform.openai.com/docs/models/gpt-4o",
    "Claude_3.5": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "Gemini_1.5_pro_002": "https://ai.google.dev/gemini-api/docs/models/gemini",
    "Gemini_1.5_flash_002": "https://ai.google.dev/gemini-api/docs/models/gemini",
    "GPT_4o_mini": "https://platform.openai.com/docs/models#gpt-4o-mini",
    "Qwen2_VL_72B": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct",
    "InternVL2_76B": "https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B",
    "llava_onevision_72B": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-72b-ov-chat",
    "NVLM": "https://huggingface.co/nvidia/NVLM-D-72B",
    "Molmo_72B": "https://huggingface.co/allenai/Molmo-72B-0924",
    "Qwen2_VL_7B": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
    "Pixtral_12B": "https://huggingface.co/mistralai/Pixtral-12B-2409",
    "Aria": "https://huggingface.co/rhymes-ai/Aria",
    "InternVL2_8B": "https://huggingface.co/OpenGVLab/InternVL2-8B",
    "Phi-3.5-vision": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "MiniCPM_v2.6": "https://huggingface.co/openbmb/MiniCPM-V-2_6",
    "llava_onevision_7B": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov",
    "Llama_3_2_11B": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision",
    "Idefics3": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3",
    "Molmo_7B_D": "https://huggingface.co/allenai/Molmo-7B-D-0924",
    "Aquila_VL_2B": "https://huggingface.co/BAAI/Aquila-VL-2B-llava-qwen",
    "POINTS_7B": "https://huggingface.co/WePOINTS/POINTS-Qwen-2-5-7B-Chat",
    "Qwen2_VL_2B": "https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct",
    "InternVL2_2B": "https://huggingface.co/OpenGVLab/InternVL2-2B",
}


class BaseDataLoader:
    """Shared leaderboard logic; subclasses supply the data and score columns.

    Construction loads the per-model keyword statistics and the per-model
    summaries through the ``_load_*`` hooks and derives from them:

    * ``SUPER_GROUPS``: dimension display name -> keyword column headers of
      the form ``"<label>(<task count>)"``.
    * ``keyword_display_map``: keyword column header -> original keyword.
    * ``MODEL_GROUPS``: group name -> models of that group present in the data.
    """

    # Static group membership; filtered against the loaded data at init time.
    BASE_MODEL_GROUPS = {
        "All": list(MODEL_NAME_MAP.keys()),
        "Flagship Models": [
            "Claude_3.5_new", "GPT_4o", "Claude_3.5", "Gemini_1.5_pro_002",
            "Qwen2_VL_72B", "InternVL2_76B", "llava_onevision_72B", "NVLM",
            "Molmo_72B",
        ],
        "Efficiency Models": [
            "Gemini_1.5_flash_002", "GPT_4o_mini", "Qwen2_VL_7B",
            "Pixtral_12B", "Aria", "InternVL2_8B", "Phi-3.5-vision",
            "MiniCPM_v2.6", "llava_onevision_7B", "Llama_3_2_11B", "Idefics3",
            "Molmo_7B_D", "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B",
            "InternVL2_2B",
        ],
        "Proprietary Flagship models": [
            "Claude_3.5_new", "GPT_4o", "Claude_3.5", "Gemini_1.5_pro_002",
        ],
        "Proprietary Efficiency Models": ["Gemini_1.5_flash_002", "GPT_4o_mini"],
        "Open-source Flagship Models": [
            "Qwen2_VL_72B", "InternVL2_76B", "llava_onevision_72B", "NVLM",
            "Molmo_72B",
        ],
        "Open-source Efficiency Models": [
            "Qwen2_VL_7B", "Pixtral_12B", "Aria", "InternVL2_8B",
            "Phi-3.5-vision", "MiniCPM_v2.6", "llava_onevision_7B",
            "Llama_3_2_11B", "Idefics3", "Molmo_7B_D", "Aquila_VL_2B",
            "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B",
        ],
    }

    def __init__(self):
        """Load the data and derive the super groups and model groups."""
        self.MODEL_DATA = self._load_model_data()
        self.SUMMARY_DATA = self._load_summary_data()
        self.SUPER_GROUPS = self._initialize_super_groups()
        self.MODEL_GROUPS = self._initialize_model_groups()

    def _initialize_super_groups(self) -> Dict[str, List[str]]:
        """Build the keyword column headers for every dimension.

        The dimensions, keywords and task counts are read from the first model
        in ``MODEL_DATA`` only. As a side effect ``keyword_display_map`` is
        (re)created, mapping each header back to its original keyword.

        Returns:
            Dimension display name -> headers sorted by descending task count
            (ties broken by header text), in a fixed dimension order. Empty
            when no model data is loaded.

        Raises:
            KeyError: If the data contains a dimension that is missing from
                ``DIMENSION_NAME_MAP``.
        """
        self.keyword_display_map = {}
        if not self.MODEL_DATA:
            # Nothing to sample from; avoid leaking StopIteration from next().
            return {}

        sample_stats = next(iter(self.MODEL_DATA.values()))
        groups = {}
        for dim, keywords in sample_stats.items():
            # (header, task count, original keyword) for each keyword.
            keyword_info = [
                (
                    f"{KEYWORD_NAME_MAP.get(keyword, keyword)}({stats['count']})",
                    stats["count"],
                    keyword,
                )
                for keyword, stats in keywords.items()
            ]
            keyword_info.sort(key=lambda info: (-info[1], info[0]))

            groups[DIMENSION_NAME_MAP[dim]] = [info[0] for info in keyword_info]
            for display_name, _, keyword in keyword_info:
                self.keyword_display_map[display_name] = keyword

        order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
        return {name: groups[name] for name in order if name in groups}

    def _initialize_model_groups(self) -> Dict[str, list]:
        """Restrict ``BASE_MODEL_GROUPS`` to the models present in the data.

        ``"All"`` becomes the sorted list of every loaded model (including
        models unknown to ``MODEL_NAME_MAP``); any other group keeps its
        predefined order and is dropped entirely when none of its models were
        loaded.
        """
        available_models = set(self.MODEL_DATA.keys())

        filtered_groups = {}
        for group_name, models in self.BASE_MODEL_GROUPS.items():
            if group_name == "All":
                filtered_groups[group_name] = sorted(available_models)
                continue
            present = [model for model in models if model in available_models]
            if present:
                filtered_groups[group_name] = present
        return filtered_groups

    @staticmethod
    def _load_json(path: str) -> Dict[str, Any]:
        """Read and parse the JSON file at ``path`` as UTF-8."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _build_df(
        self,
        selected_super_group: str,
        selected_model_group: str,
        summary_columns: Callable[[Dict[str, Any]], Dict[str, Any]],
    ) -> pd.DataFrame:
        """Assemble one row per model and sort the rows by ``"Overall"``.

        Args:
            selected_super_group: Dimension display name (key of ``SUPER_GROUPS``).
            selected_model_group: Key of ``MODEL_GROUPS``.
            summary_columns: Maps a model's summary dict to its leading score
                columns; the result must contain an ``"Overall"`` entry.

        Returns:
            Columns ``"Models"``, then the summary columns, then one column
            per keyword header. A keyword the model has no statistics for is
            stored as ``None``. Rows are sorted by descending ``"Overall"``.
        """
        original_dimension = get_original_dimension(selected_super_group)
        display_names = self.SUPER_GROUPS[selected_super_group]

        rows = []
        for model in self.MODEL_GROUPS[selected_model_group]:
            dimension_stats = self.MODEL_DATA[model].get(original_dimension, {})
            row = {"Models": get_display_model_name(model, as_link=True)}
            row.update(summary_columns(self.SUMMARY_DATA[model]))
            for display_name in display_names:
                stats = dimension_stats.get(self.keyword_display_map[display_name])
                row[display_name] = (
                    None if stats is None else _to_percent(stats["average_score"])
                )
            rows.append(row)

        return pd.DataFrame(rows).sort_values(by="Overall", ascending=False)

    def _build_leaderboard(
        self,
        selected_super_group: str,
        selected_model_group: str,
        core_key: str,
        core_columns: List[str],
    ) -> Tuple[list, list]:
        """Return the headers (annotated with task counts) and the table rows.

        Task counts are taken from the summary of the first model in
        ``MODEL_DATA``.

        Args:
            selected_super_group: Dimension display name (key of ``SUPER_GROUPS``).
            selected_model_group: Key of ``MODEL_GROUPS``.
            core_key: Summary section whose ``num_eval_tasks`` is the core
                task count.
            core_columns: Score columns that are annotated with the core task
                count, in display order.
        """
        df = self.get_df(selected_super_group, selected_model_group)

        sample_summary = self.SUMMARY_DATA[next(iter(self.MODEL_DATA))]
        total_core_tasks = sample_summary[core_key]["num_eval_tasks"]
        total_open_tasks = sample_summary["open"]["num_eval_tasks"]

        # Insertion order here is the left-to-right column order of the table.
        column_headers = {
            "Models": "Models",
            "Overall": f"Overall({total_core_tasks + total_open_tasks})",
        }
        for column in core_columns:
            column_headers[column] = f"{column}({total_core_tasks})"
        column_headers["Open-ended"] = f"Open-ended({total_open_tasks})"

        headers = list(column_headers.values()) + self.SUPER_GROUPS[selected_super_group]
        data = df.rename(columns=column_headers)[headers].values.tolist()
        return headers, data

    def _load_model_data(self) -> Dict[str, Any]:
        """Return the per-model keyword statistics; implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement _load_model_data")

    def _load_summary_data(self) -> Dict[str, Any]:
        """Return the per-model summary scores; implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement _load_summary_data")

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Return the score table for the selection; implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement get_df")

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` for the selection; implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement get_leaderboard_data")


class DefaultDataLoader(BaseDataLoader):
    """Loader for the ``Default`` results, with separate CoT / non-CoT core scores."""

    def _load_model_data(self) -> Dict[str, Any]:
        """Load the keyword statistics of the ``Default`` results."""
        return self._load_json("./static/eval_results/Default/all_model_keywords_stats.json")

    def _load_summary_data(self) -> Dict[str, Any]:
        """Load the summary scores of the ``Default`` results."""
        return self._load_json("./static/eval_results/Default/all_summary.json")

    @staticmethod
    def _summary_columns(summary: Dict[str, Any]) -> Dict[str, Any]:
        """Extract the leading score columns (as percentages) from a summary."""
        return {
            "Overall": _to_percent(summary["overall_score"]),
            "Core w/o CoT": _to_percent(summary["core_noncot"]["macro_mean_score"]),
            "Core w/ CoT": _to_percent(summary["core_cot"]["macro_mean_score"]),
            "Open-ended": _to_percent(summary["open"]["macro_mean_score"]),
        }

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Return the score table for the selection, sorted by ``"Overall"``."""
        return self._build_df(selected_super_group, selected_model_group, self._summary_columns)

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` with task counts appended to the headers."""
        return self._build_leaderboard(
            selected_super_group,
            selected_model_group,
            core_key="core_noncot",
            core_columns=["Core w/o CoT", "Core w/ CoT"],
        )


class SingleImageDataLoader(BaseDataLoader):
    """Loader for the ``SI`` results, with a single core score."""

    def _load_model_data(self) -> Dict[str, Any]:
        """Load the keyword statistics of the ``SI`` results."""
        return self._load_json("./static/eval_results/SI/all_model_keywords_stats.json")

    def _load_summary_data(self) -> Dict[str, Any]:
        """Load the summary scores of the ``SI`` results."""
        return self._load_json("./static/eval_results/SI/all_summary.json")

    @staticmethod
    def _summary_columns(summary: Dict[str, Any]) -> Dict[str, Any]:
        """Extract the leading score columns (as percentages) from a summary."""
        return {
            "Overall": _to_percent(summary["overall_score"]),
            "Core": _to_percent(summary["core"]["macro_mean_score"]),
            "Open-ended": _to_percent(summary["open"]["macro_mean_score"]),
        }

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Return the score table for the selection, sorted by ``"Overall"``."""
        return self._build_df(selected_super_group, selected_model_group, self._summary_columns)

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` with task counts appended to the headers."""
        return self._build_leaderboard(
            selected_super_group,
            selected_model_group,
            core_key="core",
            core_columns=["Core"],
        )


def _to_percent(score: float) -> float:
    """Convert a fractional score to a percentage rounded to two decimals."""
    return round(score * 100, 2)


def get_original_dimension(mapped_dimension):
    """Return the ``DIMENSION_NAME_MAP`` key whose display name is ``mapped_dimension``.

    Raises:
        StopIteration: If no dimension has that display name.
    """
    return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)


def get_original_keyword(mapped_keyword):
    """Return the ``KEYWORD_NAME_MAP`` key for a label, or the label itself if unknown."""
    return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)


def get_display_model_name(model_name: str, as_link: bool = True) -> str:
    """Return the display name of a model, optionally wrapped in an HTML link.

    Args:
        model_name: Internal model identifier; identifiers missing from
            ``MODEL_NAME_MAP`` are displayed unchanged.
        as_link: When true and the model has an entry in ``MODEL_URLS``,
            return an ``<a>`` element pointing at that URL.

    Returns:
        The plain display name, or an HTML anchor whose text is the
        (HTML-escaped) display name.
    """
    display_name = MODEL_NAME_MAP.get(model_name, model_name)
    if as_link and model_name in MODEL_URLS:
        url = html.escape(MODEL_URLS[model_name], quote=True)
        return (
            f'<a href="{url}" target="_blank" rel="noopener noreferrer">'
            f"{html.escape(display_name)}</a>"
        )
    return display_name