open-agent-leaderboard

Running

App Files Files Community

liaojiajia committed on Jun 26

Commit

9ec00c3

1 Parent(s): 5740e03

add mm results

Browse files

Files changed (9) hide show

app.py +107 -0
gen_table.py +73 -5
meta_data.py +18 -1
preprocess.py +45 -2
src/detail_math_score.json +1 -1
src/multi-modal.csv +10 -0
src/multi_modal_results.csv +10 -0
src/multi_modal_results.json +86 -0
src/overall_math_score.json +1 -1

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import abc
 import gradio as gr
 import os
 from gen_table import *
 from meta_data import *
@@ -242,6 +243,112 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
                 outputs=data_component
             )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):

 import abc
 import gradio as gr
 import os
+import pandas as pd
 from gen_table import *
 from meta_data import *
                 outputs=data_component
             )
+        with gr.Tab(label='🏅 Open Agent Multi-Modal Leaderboard'):
+            gr.Markdown(LEADERBOARD_MD['MULTI_MODAL_MAIN'])
+            struct_multi_modal = load_results(MULTIMODAL_SCORE_FILE)
+            timestamp = struct_multi_modal['time']
+            EVAL_TIME_MM = format_timestamp(timestamp)
+            # Use BUILD_L3_DF to process multi-modal results (pass the list directly)
+            table_mm, check_box_mm = BUILD_L3_DF(
+                struct_multi_modal['multi_modal_results'], DEFAULT_MULTI_MODAL_BENCH
+            )
+            # Save the complete table as a CSV file
+            csv_path_multi_modal = os.path.join(os.getcwd(), 'src/multi_modal_results.csv')
+            table_mm.to_csv(csv_path_multi_modal, index=False)
+            print(f"Multi-modal results saved to {csv_path_multi_modal}")
+            type_map_mm = check_box_mm['type_map']
+            checkbox_group_mm = gr.CheckboxGroup(
+                choices=check_box_mm['all'],
+                value=check_box_mm['required'],
+                label='Evaluation Dimension',
+                interactive=True,
+            )
+            agent_name_mm = gr.CheckboxGroup(
+                choices=table_mm['Agent'].unique().tolist(),
+                value=table_mm['Agent'].unique().tolist(),
+                label='Agent',
+                interactive=True
+            )
+            vlm_name_mm = gr.CheckboxGroup(
+                choices=table_mm['VLMs'].unique().tolist(),
+                value=table_mm['VLMs'].unique().tolist(),
+                label='VLMs',
+                interactive=True
+            )
+            initial_headers_mm = ['Rank'] + check_box_mm['essential'] + checkbox_group_mm.value
+            available_headers_mm = [h for h in initial_headers_mm if h in table_mm.columns]
+            data_component_mm = gr.components.DataFrame(
+                value=table_mm[available_headers_mm],
+                type='pandas',
+                datatype=[type_map_mm[x] for x in available_headers_mm],
+                interactive=False,
+                wrap=True,
+                visible=True
+            )
+            def filter_df_mm(fields, agents, vlms, *args):
+                headers = ['Rank'] + check_box_mm['essential'] + fields
+                df = table_mm.copy()
+                # Validate inputs to avoid errors
+                if not agents:
+                    agents = df['Agent'].unique().tolist()
+                if not vlms:
+                    vlms = df['VLMs'].unique().tolist()
+                # Add filtering logic
+                df['flag'] = df.apply(lambda row: (
+                    row['Agent'] in agents and
+                    row['VLMs'] in vlms
+                ), axis=1)
+                df = df[df['flag']].copy()
+                df.pop('flag')
+                # Ensure all requested columns exist
+                available_headers = [h for h in headers if h in df.columns]
+                # If no columns are available, return an empty DataFrame with basic columns
+                if not available_headers:
+                    available_headers = ['Rank'] + check_box_mm['essential']
+                comp = gr.components.DataFrame(
+                    value=df[available_headers],
+                    type='pandas',
+                    datatype=[type_map_mm.get(col, 'str') for col in available_headers],
+                    interactive=False,
+                    wrap=True,
+                    visible=True
+                )
+                return comp
+            # Add change events for multi-modal leaderboard
+            checkbox_group_mm.change(
+                fn=filter_df_mm,
+                inputs=[checkbox_group_mm, agent_name_mm, vlm_name_mm],
+                outputs=data_component_mm
+            )
+            agent_name_mm.change(
+                fn=filter_df_mm,
+                inputs=[checkbox_group_mm, agent_name_mm, vlm_name_mm],
+                outputs=data_component_mm
+            )
+            vlm_name_mm.change(
+                fn=filter_df_mm,
+                inputs=[checkbox_group_mm, agent_name_mm, vlm_name_mm],
+                outputs=data_component_mm
+            )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):

gen_table.py CHANGED Viewed

@@ -97,14 +97,14 @@ def BUILD_L2_DF(results, fields):
     # Create DataFrame
     df = pd.DataFrame(res)
-    # 获取所有唯一的 Algorithm 和 LLM
     unique_algorithms = df['Algorithm'].unique().tolist()
     unique_llms = df['LLM'].unique().tolist()
     # Set checkbox configuration
     check_box = {}
-    check_box['Algorithm_options'] = unique_algorithms  # 添加 Algorithm 可选项
-    check_box['LLM_options'] = unique_llms  # 添加 LLM 可选项
     # Sort by Dataset and Score in descending order
     df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
@@ -183,7 +183,7 @@ def generate_table(results, fields):
     df = pd.concat([valid, missing])
     df = df.sort_values('Rank')
-    # 重新排列列顺序
     columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
     for d in fields:
         columns.extend([f"{d}-Score", f"{d}-Cost($)"])
@@ -238,4 +238,72 @@ def generate_table_detail(results, fields):
     remaining_columns = [col for col in df.columns if col not in columns]
     df = df[columns + remaining_columns]
-    return df

     # Create DataFrame
     df = pd.DataFrame(res)
+    # Get all unique Algorithms and LLM
     unique_algorithms = df['Algorithm'].unique().tolist()
     unique_llms = df['LLM'].unique().tolist()
     # Set checkbox configuration
     check_box = {}
+    check_box['Algorithm_options'] = unique_algorithms  # Add Algorithm Options
+    check_box['LLM_options'] = unique_llms  # Add LLM option
     # Sort by Dataset and Score in descending order
     df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
     df = pd.concat([valid, missing])
     df = df.sort_values('Rank')
+    # Rearrange column order
     columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
     for d in fields:
         columns.extend([f"{d}-Score", f"{d}-Cost($)"])
     remaining_columns = [col for col in df.columns if col not in columns]
     df = df[columns + remaining_columns]
+    return df
def generate_multi_modal_table(results, fields):
    """Build the ranked multi-modal results table from a mapping of entries.

    Args:
        results: Mapping whose values are per-run dicts. Each dict is read
            with ``.get`` for ``'Agent'``, ``'VLMs'`` and every name in
            ``fields``; a missing agent/VLM becomes ``'Unknown'`` and a
            missing metric becomes ``None``.
        fields: Names of the metric columns to include, in display order.

    Returns:
        A ``pandas.DataFrame`` with columns
        ``['Rank', 'Agent', 'VLMs'] + fields``. Rows are ordered by
        ``'Score'`` (descending) when that column is requested, otherwise
        they keep the input order. ``Rank`` counts from 1 in the final row
        order.
    """
    fields = list(fields)
    meta_columns = ['Agent', 'VLMs']

    rows = [
        [entry.get(name, 'Unknown') for name in meta_columns]
        + [entry.get(name) for name in fields]
        for entry in results.values()
    ]
    # Passing the column list explicitly keeps the schema intact even when
    # there are no rows, so the steps below never hit a missing column.
    df = pd.DataFrame(rows, columns=meta_columns + fields)

    if 'Score' in df.columns:
        # mergesort is a stable algorithm, which makes the ranking
        # reproducible between runs when several rows share a score.
        df = df.sort_values('Score', ascending=False, kind='mergesort')
    df = df.reset_index(drop=True)

    df.insert(0, 'Rank', list(range(1, len(df) + 1)))
    return df
def BUILD_L3_DF(results, fields):
    """Build the multi-modal leaderboard table and its checkbox configuration.

    Args:
        results: Iterable of per-run dicts. Each dict is read with ``.get``
            for ``'Agent'``, ``'VLMs'`` and every name in ``fields``; a
            missing agent/VLM becomes ``'Unknown'`` and a missing metric
            becomes ``None``.
        fields: Names of the metric columns to include, in display order.

    Returns:
        A tuple ``(df, check_box)``.

        ``df`` is a ``pandas.DataFrame`` with columns
        ``['Rank', 'Agent', 'VLMs'] + fields``, ordered by ``'Score'``
        (descending) when that column is requested, with ``Rank`` counting
        from 1 in the final row order.

        ``check_box`` is a dict with the keys:
          * ``'essential'`` - identity columns that are always shown.
          * ``'all'`` - the metric columns a user may toggle.
          * ``'required'`` - the metric columns selected by default; always
            a subset of ``'all'``.
          * ``'type_map'`` - column name -> display datatype (``'str'`` or
            ``'number'``); unknown columns default to ``'number'``.
    """
    fields = list(fields)
    essential = ['Agent', 'VLMs']

    rows = [
        [entry.get(name, 'Unknown') for name in essential]
        + [entry.get(name) for name in fields]
        for entry in results
    ]
    # Passing the column list explicitly keeps the schema intact even when
    # there are no rows, so the steps below never hit a missing column.
    df = pd.DataFrame(rows, columns=essential + fields)

    if 'Score' in df.columns:
        # mergesort is a stable algorithm, which makes the ranking
        # reproducible between runs when several rows share a score.
        df = df.sort_values('Score', ascending=False, kind='mergesort')
    df = df.reset_index(drop=True)
    df.insert(0, 'Rank', list(range(1, len(df) + 1)))

    type_map = defaultdict(lambda: 'number')
    type_map.update({'Agent': 'str', 'VLMs': 'str', 'Rank': 'number'})
    type_map.update((name, 'number') for name in fields)

    # The UI builds its header list as ['Rank'] + essential + <selection>,
    # so the selectable lists must hold metric columns only: including
    # 'Rank', 'Agent' or 'VLMs' here would produce duplicate columns and
    # default selections that are not among the offered choices.
    check_box = {
        'essential': essential,
        'required': list(fields),
        'all': list(fields),
        'type_map': type_map,
    }
    return df, check_box

meta_data.py CHANGED Viewed

@@ -1,12 +1,13 @@
 # CONSTANTS-URL
 OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
 DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
 ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, ToT, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
-We are excited to announce that the paper "Unifying Language Agent Algorithms with Graph-based Orchestration Engine for Reproducible Agent Research" has been accepted to ACL 2025 Systems Demonstration Track! 🎉
 This leaderboard was last updated: {}.
@@ -18,6 +19,9 @@ DEFAULT_MATH_BENCH = [
     'gsm8k', 'AQuA', 'MATH-500',
 ]
 # The README file for each benchmark
 LEADERBOARD_MD = {}
@@ -69,6 +73,19 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
 - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
 META_FIELDS = [
     'Algorithm', 'LLM', 'Eval Date'
 ]

 # CONSTANTS-URL
 OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
 DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
+MULTIMODAL_SCORE_FILE = "src/multi_modal_results.json"
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
 ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, ToT, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
+We are excited to announce that the paper "Unifying Language Agent Algorithms with Graph-based Orchestration Engine for Reproducible Agent Research" has been accepted to ACL 2025 Systems Demonstration Track! [*Paper*](https://arxiv.org/abs/2505.24354) 🎉
 This leaderboard was last updated: {}.
     'gsm8k', 'AQuA', 'MATH-500',
 ]
+DEFAULT_MULTI_MODAL_BENCH = ['Score', 'Pass Rate', 'Total Input Tokens', 'Total Output Tokens', 'All Tokens']
 # The README file for each benchmark
 LEADERBOARD_MD = {}
 - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
# Markdown shown for the multi-modal leaderboard. The heading names the
# multi-modal task; it previously said "Math task", carried over from the
# math leaderboard text.
LEADERBOARD_MD['MULTI_MODAL_MAIN'] = f"""
## Multi-modal task main Evaluation Results
- Metrics:
  - Score: The evaluation score on each Benchmarks (the higher the better).
  - Pass rate: The percentage of response that are valid, where a response is valid if it is neither empty nor null.
- By default, we present the overall evaluation results based on MME-RealWorld, sorted by the descending order of Score.
- IO (Input-Output): The baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps.
"""
 META_FIELDS = [
     'Algorithm', 'LLM', 'Eval Date'
 ]

preprocess.py CHANGED Viewed

@@ -174,7 +174,50 @@ def process_csv_to_overall_json():
     with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
         json.dump(result, f, indent=4, ensure_ascii=False)
 if __name__ == "__main__":
-    # Generate JSON files in two formats
     process_csv_to_json()
-    process_csv_to_overall_json()

     with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
         json.dump(result, f, indent=4, ensure_ascii=False)
def process_multi_modal_csv(csv_path='src/multi-modal.csv',
                            json_path='src/multi_modal_results.json'):
    """Convert the raw multi-modal results CSV into the leaderboard JSON file.

    Header names and text cells may be wrapped as ``="value"``; that wrapper
    and any stray double quotes are removed before the numeric columns are
    parsed.

    Args:
        csv_path: CSV file to read. It must provide the columns ``Score``,
            ``Pass Rate``, ``Total Input Tokens``, ``Total Output Tokens``
            and ``All Tokens``.
        json_path: Destination of the generated JSON document.

    Returns:
        None. The JSON document is written to ``json_path`` with two keys:
        ``"time"`` (generation timestamp, ``%Y-%m-%d %H:%M:%S``) and
        ``"multi_modal_results"`` (one record per CSV row).

    Raises:
        KeyError: If one of the numeric columns listed above is missing.
    """
    def clean_text(value):
        # Only strings carry the ="..." wrapper; leave NaN/numbers untouched.
        if isinstance(value, str):
            return value.replace('="', '').replace('"', '').strip()
        return value

    def is_missing(value):
        # A lone dash marks "not reported" in the source sheet.
        return pd.isna(value) or value == '-'

    def to_float(value):
        # Tolerate thousands separators such as "76,808,965".
        return float(str(value).replace(',', ''))

    def parse_number(value):
        return 0 if is_missing(value) else int(to_float(value))

    def parse_score(value):
        return 0.0 if is_missing(value) else round(to_float(value), 2)

    def parse_pass_rate(value):
        # The CSV stores a percentage; the JSON stores a 0-1 fraction.
        return 0.0 if is_missing(value) else round(to_float(value) / 100, 4)

    df = pd.read_csv(csv_path, skipinitialspace=True)
    df.columns = [clean_text(str(name)) for name in df.columns]

    # Series.map is used per column instead of the deprecated
    # DataFrame.applymap so this works across pandas versions.
    for name in df.columns:
        df[name] = df[name].map(clean_text)

    df['Score'] = df['Score'].apply(parse_score)
    df['Pass Rate'] = df['Pass Rate'].apply(parse_pass_rate)
    for name in ('Total Input Tokens', 'Total Output Tokens', 'All Tokens'):
        df[name] = df[name].apply(parse_number)

    # Unwrap any NumPy scalars so json.dump never meets a type it cannot
    # serialise, whatever boxing the installed pandas applies in to_dict.
    records = [
        {key: (value.item() if hasattr(value, 'item') else value)
         for key, value in row.items()}
        for row in df.to_dict(orient='records')
    ]

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "multi_modal_results": records
    }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
 if __name__ == "__main__":
+    # Generate JSON files in three formats
     process_csv_to_json()
+    process_csv_to_overall_json()
+    process_multi_modal_csv()

src/detail_math_score.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-    "time": "2025-03-05 13:15:02",
     "results": {
         "IO": {
             "gpt-3.5-turbo": {

 {
+    "time": "2025-06-25 18:17:55",
     "results": {
         "IO": {
             "gpt-3.5-turbo": {

src/multi-modal.csv ADDED Viewed

	@@ -0,0 +1,10 @@

+"=""Agent""","=""VLMs""","=""Score""","=""Pass Rate""","=""Total Input Tokens""","=""Total Output Tokens""","=""All Tokens""
+"=""ZoomEye""","=""Qwen2.5-VL-72B-Instruct""","=""51.56""","=""99.81""","=""76,808,965""","=""1,276,460""","=""78,085,425""
+"=""ZoomEye""","=""Qwen2.5-VL-7B-Instruct""","=""48.06""","=""96.50""","=""94,418,593""","=""1,472,836""","=""95,891,429""
+"=""IO""","=""Qwen2.5-VL-72B-Instruct""","=""44.47""","=""100.00""","=""6,174,490""","=""2,114""","=""6,176,604""
+"=""ZoomEye""","=""InternVL2.5-8B""","=""43.42""","=""99.34""","=""153,857,588""","=""2,017,170""","=""155,874,758""
+"=""IO""","=""InternVL2.5-8B""","=""42.95""","=""100.00""","=""2,779,778""","=""2,335""","=""2,782,113""
+"=""IO""","=""Qwen2.5-VL-7B-Instruct""","=""42.86""","=""100.00""","=""6,174,490""","=""2,114""","=""6,176,604""
+"=""ZoomEye""","=""Llava-v1.5-7B""","=""31.60""","=""98.86""","=""113,073,261""","=""1,368,724""","=""114,441,985""
+"=""IO""","=""Llava-v1.5-7B""","=""24.79""","=""100.00""","=""734,868""","=""17,036""","=""751,904""
+"=""V*""","=""seal_vqa & seal_vsm""","=""15.14""","=""72.37""","=""-""","=""-""","=""-"""

src/multi_modal_results.csv ADDED Viewed

	@@ -0,0 +1,10 @@

+Rank,Agent,VLMs,Score,Pass Rate,Total Input Tokens,Total Output Tokens,All Tokens
+1,ZoomEye,Qwen2.5-VL-72B-Instruct,51.56,0.9981,76808965,1276460,78085425
+2,ZoomEye,Qwen2.5-VL-7B-Instruct,48.06,0.965,94418593,1472836,95891429
+3,IO,Qwen2.5-VL-72B-Instruct,44.47,1.0,6174490,2114,6176604
+4,ZoomEye,InternVL2.5-8B,43.42,0.9934,153857588,2017170,155874758
+5,IO,InternVL2.5-8B,42.95,1.0,2779778,2335,2782113
+6,IO,Qwen2.5-VL-7B-Instruct,42.86,1.0,6174490,2114,6176604
+7,ZoomEye,Llava-v1.5-7B,31.6,0.9886,113073261,1368724,114441985
+8,IO,Llava-v1.5-7B,24.79,1.0,734868,17036,751904
+9,V*,seal_vqa & seal_vsm,15.14,0.7237,0,0,0

src/multi_modal_results.json ADDED Viewed

	@@ -0,0 +1,86 @@

+{
+    "time": "2025-06-25 18:17:55",
+    "multi_modal_results": [
+        {
+            "Agent": "ZoomEye",
+            "VLMs": "Qwen2.5-VL-72B-Instruct",
+            "Score": 51.56,
+            "Pass Rate": 0.9981,
+            "Total Input Tokens": 76808965,
+            "Total Output Tokens": 1276460,
+            "All Tokens": 78085425
+        },
+        {
+            "Agent": "ZoomEye",
+            "VLMs": "Qwen2.5-VL-7B-Instruct",
+            "Score": 48.06,
+            "Pass Rate": 0.965,
+            "Total Input Tokens": 94418593,
+            "Total Output Tokens": 1472836,
+            "All Tokens": 95891429
+        },
+        {
+            "Agent": "IO",
+            "VLMs": "Qwen2.5-VL-72B-Instruct",
+            "Score": 44.47,
+            "Pass Rate": 1.0,
+            "Total Input Tokens": 6174490,
+            "Total Output Tokens": 2114,
+            "All Tokens": 6176604
+        },
+        {
+            "Agent": "ZoomEye",
+            "VLMs": "InternVL2.5-8B",
+            "Score": 43.42,
+            "Pass Rate": 0.9934,
+            "Total Input Tokens": 153857588,
+            "Total Output Tokens": 2017170,
+            "All Tokens": 155874758
+        },
+        {
+            "Agent": "IO",
+            "VLMs": "InternVL2.5-8B",
+            "Score": 42.95,
+            "Pass Rate": 1.0,
+            "Total Input Tokens": 2779778,
+            "Total Output Tokens": 2335,
+            "All Tokens": 2782113
+        },
+        {
+            "Agent": "IO",
+            "VLMs": "Qwen2.5-VL-7B-Instruct",
+            "Score": 42.86,
+            "Pass Rate": 1.0,
+            "Total Input Tokens": 6174490,
+            "Total Output Tokens": 2114,
+            "All Tokens": 6176604
+        },
+        {
+            "Agent": "ZoomEye",
+            "VLMs": "Llava-v1.5-7B",
+            "Score": 31.6,
+            "Pass Rate": 0.9886,
+            "Total Input Tokens": 113073261,
+            "Total Output Tokens": 1368724,
+            "All Tokens": 114441985
+        },
+        {
+            "Agent": "IO",
+            "VLMs": "Llava-v1.5-7B",
+            "Score": 24.79,
+            "Pass Rate": 1.0,
+            "Total Input Tokens": 734868,
+            "Total Output Tokens": 17036,
+            "All Tokens": 751904
+        },
+        {
+            "Agent": "V*",
+            "VLMs": "seal_vqa & seal_vsm",
+            "Score": 15.14,
+            "Pass Rate": 0.7237,
+            "Total Input Tokens": 0,
+            "Total Output Tokens": 0,
+            "All Tokens": 0
+        }
+    ]
+}

src/overall_math_score.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-    "time": "2025-03-05 13:15:02",
     "results": {
         "IO": {
             "META": {

 {
+    "time": "2025-06-25 18:17:55",
     "results": {
         "IO": {
             "META": {