Spaces:

Enderchef
/

SuperBench-Eval

Running on Zero

App Files Files Community

Enderchef committed on Jun 25

Commit

3b51590

verified ·

1 Parent(s): eee15fb

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -193

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import json
 import pandas as pd
 import traceback
 import spaces
 # --- Environment and Caching ---
@@ -46,10 +47,10 @@ def get_all_benchmark_options():
         try:
             # Fetching dataset configurations requires authentication if the dataset is private
             subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
-            benchmark_subject_cache[key] = ["ALL"] + subjects
         except Exception as e:
             print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
-            benchmark_subject_cache[key] = []
     print("Benchmark configurations cached.")
     return benchmark_subject_cache
@@ -80,7 +81,8 @@ def load_model(model_id):
             model_id,
             token=HF_TOKEN,
             torch_dtype=dtype,
-            trust_remote_code=True
         ).to("cuda" if torch.cuda.is_available() else "cpu")
         # Create the pipeline for text generation
@@ -115,8 +117,17 @@ def extract_predicted_letter(output_text):
     Extracts the predicted letter from the model's output.
     It looks for a letter (A, B, C, D) immediately following 'Answer:'.
     """
-    match = re.search(r"Answer:\s*([ABCD])", output_text, re.IGNORECASE)
-    return match.group(1).upper() if match else None
 def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
     """
@@ -140,23 +151,28 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
         prompt, correct_answer_idx = format_prompt(item)
         expected_letter = get_choice_letter(correct_answer_idx)
         # Generate a short response, aiming for a single letter answer.
         # do_sample=False (greedy decoding) is crucial for reproducibility.
-        raw_output = generator(prompt, max_new_tokens=5, do_sample=False)[0]["generated_text"]
-        predicted_letter = extract_predicted_letter(raw_output)
         is_correct = (predicted_letter == expected_letter)
         if is_correct:
             correct_predictions += 1
         results_details.append({
-            "question": item['question'],
-            "choices": item['choices'],
-            "raw_output": raw_output.strip(),
-            "expected_letter": expected_letter,
-            "predicted_letter": predicted_letter,
-            "is_correct": is_correct,
         })
     accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
@@ -168,7 +184,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
     """
     Main function to orchestrate the entire evaluation process.
     Handles single subject or 'ALL' subjects evaluation.
-    Returns updates for Gradio UI components.
     """
     try:
         gr.Info("Starting evaluation...")
@@ -185,15 +201,19 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
         subjects_to_run = []
         if subject_name == "ALL":
-            subjects_to_run = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
-            if "ALL" in subjects_to_run:
-                subjects_to_run.remove("ALL") # Remove 'ALL' from the list of subjects to run
         else:
             subjects_to_run = [subject_name]
         if not subjects_to_run:
             gr.Warning(f"No subjects found for '{benchmark_category}'.")
-            return "", "", "", pd.DataFrame().to_dict('records')
         for i, subject in enumerate(subjects_to_run):
             gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
@@ -201,7 +221,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
                 accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
                 all_results_details.extend(subject_details)
-                num_correct = sum(d['is_correct'] for d in subject_details)
                 num_evaluated = len(subject_details)
                 total_correct += num_correct
@@ -209,58 +229,59 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
                 summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
             except Exception as e:
                 gr.Error(f"Skipping {subject} due to an error: {e}")
-                summary_lines.append(f"- **{subject}**: Evaluation failed.")
                 continue
         overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
         # --- Prepare Outputs ---
         if subject_name == "ALL":
-            result_summary = f"### Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}%\n"
-            result_summary += "across {:,} total samples.\n\n---\n\n**Breakdown by Subject:**\n".format(total_samples)
             result_summary += "\n".join(summary_lines)
         else:
             result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
-            result_summary += "({:,}/{:,} correct)".format(total_correct, total_samples)
-        # Create a detailed DataFrame for inspection
-        df_details = pd.DataFrame(all_results_details)
         # Save results for leaderboard
         record = {
             "model_id": model_id,
             "benchmark": benchmark_category,
             "accuracy": overall_accuracy,
-            "subject": subject_name,
             "sample_count": total_samples,
-            "timestamp": pd.Timestamp.now().isoformat()
         }
         with open(EVAL_FILE, "a") as f:
             f.write(json.dumps(record) + "\n")
         gr.Info("Evaluation completed successfully!")
-        # Return updates for the UI
-        return (
-            gr.update(value=result_summary, visible=True),
-            gr.update(value="", visible=False), # Hide error message
-            gr.update(value="", visible=False), # Hide error details
-            gr.update(value=df_details.to_dict('records'), visible=True) # Show detailed results table
-        )
     except Exception as e:
-        error_message = f"An unexpected error occurred: {e}"
         error_details = traceback.format_exc()
         gr.Error(error_message)
-        # Return error updates for the UI
-        return (
-            gr.update(value="", visible=False), # Hide results summary
-            gr.update(value=error_message, visible=True),
-            gr.update(value=error_details, visible=True),
-            gr.update(value=pd.DataFrame().to_dict('records'), visible=False) # Hide detailed results
-        )
 # --- UI Helper Functions ---
@@ -270,171 +291,141 @@ def update_subject_dropdown(benchmark_category):
     default_value = "ALL" if "ALL" in choices else (choices[0] if choices else None)
     return gr.update(choices=choices, value=default_value)
-def load_leaderboard(benchmark_filter):
     """
     Loads and processes evaluation data to display on the leaderboard.
     It now correctly averages scores for models that were evaluated on 'ALL' subjects.
     """
     try:
         if not os.path.exists(EVAL_FILE):
-            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
         df = pd.read_json(EVAL_FILE, lines=True)
         if df.empty:
-            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
         # Coerce accuracy to numeric and filter valid entries
         df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
         df.dropna(subset=['accuracy'], inplace=True)
         # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
-        df_filtered = df[df['benchmark'] == benchmark_filter].copy()
         if df_filtered.empty:
-            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
-        # We are interested in the 'ALL' subject evaluations for the main leaderboard
-        df_all = df_filtered[df_filtered['subject'] == 'ALL'].copy()
-        if df_all.empty:
-            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
         # Find the latest evaluation for each model
-        df_all['timestamp'] = pd.to_datetime(df_all['timestamp'])
-        latest_evals = df_all.loc[df_all.groupby('model_id')['timestamp'].idxmax()]
-        leaderboard_df = latest_evals[['model_id', 'accuracy', 'sample_count']].copy()
-        leaderboard_df.columns = ["Model ID", "Avg. Accuracy (%)", "Total Samples"]
-        # Format accuracy to 2 decimal places
-        leaderboard_df["Avg. Accuracy (%)"] = leaderboard_df["Avg. Accuracy (%)"].map('{:.2f}'.format)
-        # Sort by accuracy
-        leaderboard_df = leaderboard_df.sort_values(by="Avg. Accuracy (%)", ascending=False)
-        return leaderboard_df.to_dict('records')
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc()
-        return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
 # --- Gradio Interface Definition ---
-with gr.Blocks(theme=gr.themes.Soft(), css="""
     /* --- Global & Layout --- */
     body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
     .gradio-container { max-width: 1280px !important; margin: auto; }
-    .gr-box { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; }
     /* --- Typography --- */
-    h1 {
-        text-align: center;
-        font-size: 2.5rem !important;
-        font-weight: 700;
-        color: #212529;
-        margin-bottom: 0.5rem;
-        letter-spacing: -1px;
-    }
-    .subtitle {
-        text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem;
-    }
     /* --- Buttons & Inputs --- */
-    .gr-button {
-        border-radius: 8px !important;
-        font-weight: 600 !important;
-        padding: 10px 20px !important;
-        transition: all 0.2s ease;
-    }
-    .gr-button-primary { box-shadow: 0 4px 10px rgba(0, 123, 255, 0.2); }
-    .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(0, 123, 255, 0.3); }
     /* --- Custom Radio Buttons (Segmented Control) --- */
-    #leaderboard-toggle, #eval-benchmark-selection {
-        background-color: #e9ecef;
-        padding: 5px;
-        border-radius: 10px;
-        display: inline-flex;
-        margin: auto;
-    }
-    #leaderboard-toggle div.gr-form, #eval-benchmark-selection div.gr-form {
-        display: flex;
-        gap: 5px;
-    }
-    #leaderboard-toggle input[type='radio'], #eval-benchmark-selection input[type='radio'] { display: none; }
-    #leaderboard-toggle label, #eval-benchmark-selection label {
-        padding: 8px 16px;
-        border-radius: 8px;
-        cursor: pointer;
-        transition: background-color 0.3s, color 0.3s, box-shadow 0.3s;
-        font-weight: 500;
-        color: #495057;
-        background: transparent;
-        border: none;
-        box-shadow: none;
-    }
-    #leaderboard-toggle input[type='radio']:checked + label, #eval-benchmark-selection input[type='radio']:checked + label {
-        background-color: white;
-        color: #007bff;
-        font-weight: 600;
-        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-    }
     /* --- Dataframe / Table Styling --- */
-    .leaderboard-table .gr-dataframe table {
-        border-collapse: collapse;
-        width: 100%;
-    }
-    .leaderboard-table .gr-dataframe thead th {
-        background-color: #f8f9fa !important;
-        color: #495057 !important;
-        font-weight: 600 !important;
-        text-align: left;
-        padding: 12px 15px;
-        border-bottom: 2px solid #dee2e6;
-    }
-    .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) {
-        background-color: #f8f9fa;
-    }
-    .leaderboard-table .gr-dataframe tbody tr:hover {
-        background-color: #e9ecef;
-    }
-    .leaderboard-table .gr-dataframe tbody td {
-        padding: 12px 15px;
-        border-bottom: 1px solid #dee2e6;
-    }
     /* --- Error & Result Panes --- */
-    #error-display-box { background-color: #fff3f3; border-color: #ffc9c9; }
-    #error-display-box .gr-label { color: #d9480f !important; font-weight: 600; }
-    #result-summary-box { background-color: #f3f9ff; border-color: #cde4ff; }
 """) as demo:
-    gr.Markdown("<h1>🤖 Open LLM Evaluator</h1>")
-    gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard.</p>")
-    with gr.Tabs():
         # --- Evaluation Tab ---
-        with gr.TabItem("🚀 Run Evaluation"):
-            with gr.Row():
                 with gr.Column(scale=2):
-                    with gr.Box():
                         gr.Markdown("### 1. Configure Evaluation")
                         model_id_input = gr.Textbox(
                             label="Hugging Face Model ID",
                             placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
                             interactive=True
                         )
-                        with gr.Row():
-                             benchmark_selection_radio = gr.Radio(
-                                ["MMLU", "MMLU-Pro"],
-                                label="Benchmark",
-                                value="MMLU",
-                                interactive=True,
-                                elem_id="eval-benchmark-selection",
-                                container=False
-                            )
                         with gr.Row():
                             benchmark_subject_dropdown = gr.Dropdown(
                                 label="Subject",
@@ -453,44 +444,25 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
                     gr.Markdown("### 2. View Results")
                     # Panel for displaying the summary of results
-                    with gr.Box(visible=False, elem_id="result-summary-box") as result_summary_box:
-                        result_summary_output = gr.Markdown()
                     # Panel for displaying errors
-                    with gr.Box(visible=False, elem_id="error-display-box") as error_box:
-                        error_output = gr.Textbox(label="Error Message", interactive=False)
                         error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
                     # Panel for detailed, row-by-row results
-                    with gr.Box(visible=False) as details_box:
                         gr.Markdown("#### Detailed Evaluation Log")
-                        detailed_results_df = gr.Dataframe(
-                            headers=["Question", "Correct", "Expected", "Predicted", "Raw Output"],
-                            datatype=["str", "bool", "str", "str", "str"],
                             interactive=False,
                             row_count=10,
-                            col_count=5
                         )
-        # --- Leaderboard Tab ---
-        with gr.TabItem("📊 Leaderboard"):
-            with gr.Column():
-                gr.Markdown("<div style='display: flex; justify-content: center; width: 100%; margin-bottom: 20px;'></div>", elem_id="leaderboard-toggle-container")
-                leaderboard_type_toggle = gr.Radio(
-                    ["MMLU", "MMLU-Pro"],
-                    label="Select Benchmark",
-                    value="MMLU",
-                    interactive=True,
-                    elem_id="leaderboard-toggle",
-                    container=False
-                )
-                leaderboard_table_output = gr.Dataframe(
-                    headers=["Model ID", "Avg. Accuracy (%)", "Total Samples"],
-                    interactive=False,
-                    datatype=["str", "str", "number"],
-                    row_count=15,
-                    elem_classes="leaderboard-table"
-                )
     # --- Event Handlers & Logic ---
@@ -505,12 +477,12 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
     run_button.click(
         fn=run_evaluation,
         inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
-        outputs=[result_summary_box, error_box, error_details_output, details_box]
     ).then(
-        # This chained function updates the component values *after* the visibility is set
-        lambda r, e, d, df: (r, e, d, df.to_dict('records')),
-        inputs=[result_summary_box, error_box, error_details_output, details_box],
-        outputs=[result_summary_output, error_output, error_details_output, detailed_results_df]
     )
     # Leaderboard loading logic
@@ -522,14 +494,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
     leaderboard_type_toggle.change(
         fn=load_leaderboard,
         inputs=[leaderboard_type_toggle],
-        outputs=[leaderboard_table_output]
     )
-    # When the run button is clicked again, refresh the leaderboard
-    run_button.click(
         fn=load_leaderboard,
         inputs=[leaderboard_type_toggle],
-        outputs=[leaderboard_table_output]
     )
 # Launch the Gradio app

 import pandas as pd
 import traceback
 import spaces
+from datetime import datetime
 # --- Environment and Caching ---
         try:
             # Fetching dataset configurations requires authentication if the dataset is private
             subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
+            benchmark_subject_cache[key] = ["ALL"] + sorted([s for s in subjects if s != 'all']) # Sort subjects
         except Exception as e:
             print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
+            benchmark_subject_cache[key] = ["ALL"] # Provide a default
     print("Benchmark configurations cached.")
     return benchmark_subject_cache
             model_id,
             token=HF_TOKEN,
             torch_dtype=dtype,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True, # Optimization for large models
         ).to("cuda" if torch.cuda.is_available() else "cpu")
         # Create the pipeline for text generation
     Extracts the predicted letter from the model's output.
     It looks for a letter (A, B, C, D) immediately following 'Answer:'.
     """
+    # Look for "Answer: X" and capture X
+    match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
+    if match:
+        return match.group(1).upper()
+    # Fallback: if the model just outputs a letter
+    match = re.search(r"^\s*([ABCD])\b", output_text.strip())
+    if match:
+        return match.group(1).upper()
+    return None
 def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
     """
         prompt, correct_answer_idx = format_prompt(item)
         expected_letter = get_choice_letter(correct_answer_idx)
+        # The generated text is often just after the prompt. We need to slice it.
+        full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
         # Generate a short response, aiming for a single letter answer.
         # do_sample=False (greedy decoding) is crucial for reproducibility.
+        raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
+        # Isolate the newly generated part
+        generated_text_only = raw_output[len(full_prompt_text):].strip()
+        predicted_letter = extract_predicted_letter(generated_text_only)
         is_correct = (predicted_letter == expected_letter)
         if is_correct:
             correct_predictions += 1
         results_details.append({
+            "Question": item['question'],
+            "Correct": "✅" if is_correct else "❌",
+            "Expected": expected_letter,
+            "Predicted": predicted_letter or "N/A",
+            "Model Output": generated_text_only
         })
     accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
     """
     Main function to orchestrate the entire evaluation process.
     Handles single subject or 'ALL' subjects evaluation.
+    Returns a dictionary of Gradio updates.
     """
     try:
         gr.Info("Starting evaluation...")
         subjects_to_run = []
         if subject_name == "ALL":
+            # Exclude the "ALL" placeholder from the list of subjects to run
+            subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
         else:
             subjects_to_run = [subject_name]
         if not subjects_to_run:
             gr.Warning(f"No subjects found for '{benchmark_category}'.")
+            # Return an empty but valid structure
+            return {
+                result_summary_output: gr.update(value="No subjects found to evaluate.", visible=True),
+                error_box: gr.update(visible=False),
+                details_box: gr.update(visible=False),
+            }
         for i, subject in enumerate(subjects_to_run):
             gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
                 accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
                 all_results_details.extend(subject_details)
+                num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
                 num_evaluated = len(subject_details)
                 total_correct += num_correct
                 summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
             except Exception as e:
+                error_trace = traceback.format_exc()
                 gr.Error(f"Skipping {subject} due to an error: {e}")
+                summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
                 continue
         overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
         # --- Prepare Outputs ---
         if subject_name == "ALL":
+            result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
+            result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\n**Breakdown by Subject:**\n"
             result_summary += "\n".join(summary_lines)
         else:
             result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
+            result_summary += f"({total_correct:,}/{total_samples:,} correct)"
         # Save results for leaderboard
         record = {
             "model_id": model_id,
             "benchmark": benchmark_category,
             "accuracy": overall_accuracy,
+            "subject": subject_name, # Record if it was an 'ALL' run
             "sample_count": total_samples,
+            "timestamp": datetime.now().isoformat()
         }
         with open(EVAL_FILE, "a") as f:
             f.write(json.dumps(record) + "\n")
         gr.Info("Evaluation completed successfully!")
+        df_details = pd.DataFrame(all_results_details)
+        # Return a dictionary of component updates
+        return {
+            result_summary_output: gr.update(value=result_summary, visible=True),
+            error_box: gr.update(visible=False),
+            details_box: gr.update(visible=True),
+            detailed_results_df: gr.update(value=df_details)
+        }
     except Exception as e:
+        error_message = f"An unexpected error occurred during setup: {e}"
         error_details = traceback.format_exc()
         gr.Error(error_message)
+        return {
+            result_summary_output: gr.update(visible=False),
+            error_box: gr.update(visible=True),
+            error_output: gr.update(value=error_message),
+            error_details_output: gr.update(value=error_details),
+            details_box: gr.update(visible=False)
+        }
 # --- UI Helper Functions ---
     default_value = "ALL" if "ALL" in choices else (choices[0] if choices else None)
     return gr.update(choices=choices, value=default_value)
+def load_leaderboard(benchmark_filter, progress=gr.Progress()):
     """
     Loads and processes evaluation data to display on the leaderboard.
     It now correctly averages scores for models that were evaluated on 'ALL' subjects.
     """
+    progress(0, desc="Loading Leaderboard...")
     try:
         if not os.path.exists(EVAL_FILE):
+            return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         df = pd.read_json(EVAL_FILE, lines=True)
         if df.empty:
+            return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         # Coerce accuracy to numeric and filter valid entries
         df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
         df.dropna(subset=['accuracy'], inplace=True)
         # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
+        df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
         if df_filtered.empty:
+            return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
         # Find the latest evaluation for each model
+        df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
+        latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
+        leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
+        # Add Rank
+        leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
+        # Rename and format columns
+        leaderboard_df.rename(columns={
+            'model_id': 'Model ID',
+            'accuracy': 'Avg. Accuracy (%)',
+            'sample_count': 'Total Samples',
+            'timestamp': 'Date'
+        }, inplace=True)
+        leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
+        leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
+        progress(1, desc="Done.")
+        return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc()
+        return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
 # --- Gradio Interface Definition ---
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css="""
     /* --- Global & Layout --- */
     body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
     .gradio-container { max-width: 1280px !important; margin: auto; }
+    .gr-group { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; background-color: white; }
     /* --- Typography --- */
+    h1 { text-align: center; font-size: 2.5rem !important; font-weight: 800; color: #212529; margin-bottom: 0.5rem; letter-spacing: -1.5px; }
+    .subtitle { text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem; max-width: 800px; margin-left: auto; margin-right: auto;}
     /* --- Buttons & Inputs --- */
+    .gr-button { font-weight: 600 !important; transition: all 0.2s ease; }
+    .gr-button-primary { box-shadow: 0 4px 10px rgba(59, 130, 246, 0.2); }
+    .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(59, 130, 246, 0.3); }
     /* --- Custom Radio Buttons (Segmented Control) --- */
+    #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
+    #leaderboard-toggle { background-color: #e9ecef; padding: 5px; border-radius: 10px; display: inline-flex; }
+    #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
+    #leaderboard-toggle input[type='radio'] { display: none; }
+    #leaderboard-toggle label { padding: 8px 16px; border-radius: 8px; cursor: pointer; transition: all 0.3s ease; font-weight: 500; color: #495057; background: transparent; border: none; box-shadow: none; }
+    #leaderboard-toggle input[type='radio']:checked + label { background-color: white; color: #0d6efd; font-weight: 600; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
     /* --- Dataframe / Table Styling --- */
+    .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
+    .leaderboard-table .gr-dataframe thead th { background-color: #f8f9fa !important; color: #495057 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #dee2e6; }
+    .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #fdfdff; }
+    .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #f0f6ff; }
+    .leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #e9ecef; }
+    .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #495057; }
     /* --- Error & Result Panes --- */
+    #error-display-box { background-color: #fff3f3 !important; border-color: #ffc9c9 !important; }
+    #result-summary-box { background-color: #f3f9ff !important; border-color: #cde4ff !important; }
 """) as demo:
+    gr.Markdown("<h1>🏆 Open LLM Evaluator</h1>")
+    gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
+    with gr.Tabs() as tabs:
+        # --- Leaderboard Tab ---
+        with gr.TabItem("📊 Leaderboard", id=0):
+            with gr.Column():
+                with gr.Row(elem_id="leaderboard-toggle-group"):
+                    leaderboard_type_toggle = gr.Radio(
+                        ["MMLU", "MMLU-Pro"],
+                        label="Select Benchmark",
+                        value="MMLU",
+                        interactive=True,
+                        elem_id="leaderboard-toggle",
+                        container=False,
+                        show_label=False,
+                    )
+                    refresh_button = gr.Button("🔄 Refresh", size="sm")
+                leaderboard_table_output = gr.DataFrame(
+                    headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
+                    interactive=False,
+                    datatype=["number", "str", "str", "number", "str"],
+                    row_count=15,
+                    elem_classes="leaderboard-table"
+                )
         # --- Evaluation Tab ---
+        with gr.TabItem("🚀 Run Evaluation", id=1):
+            with gr.Row(variant='panel'):
                 with gr.Column(scale=2):
+                    with gr.Group():
                         gr.Markdown("### 1. Configure Evaluation")
                         model_id_input = gr.Textbox(
                             label="Hugging Face Model ID",
                             placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
                             interactive=True
                         )
+                        benchmark_selection_radio = gr.Radio(
+                            ["MMLU", "MMLU-Pro"],
+                            label="Benchmark",
+                            value="MMLU",
+                            interactive=True,
+                        )
                         with gr.Row():
                             benchmark_subject_dropdown = gr.Dropdown(
                                 label="Subject",
                     gr.Markdown("### 2. View Results")
                     # Panel for displaying the summary of results
+                    with gr.Group(visible=False) as result_summary_box:
+                        result_summary_output = gr.Markdown(elem_id="result-summary-box")
                     # Panel for displaying errors
+                    with gr.Group(visible=False) as error_box:
+                        error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
                         error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
                     # Panel for detailed, row-by-row results
+                    with gr.Group(visible=False) as details_box:
                         gr.Markdown("#### Detailed Evaluation Log")
+                        detailed_results_df = gr.DataFrame(
+                            headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
+                            datatype=["str", "str", "str", "str", "str"],
                             interactive=False,
                             row_count=10,
+                            col_count=5,
+                            wrap=True,
                         )
     # --- Event Handlers & Logic ---
     run_button.click(
         fn=run_evaluation,
         inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
+        outputs=[result_summary_output, error_box, error_output, error_details_output, details_box, detailed_results_df]
+    ).then(
+        # After evaluation, switch to the leaderboard tab and refresh it
+        lambda: gr.update(selected=0), outputs=[tabs]
     ).then(
+        load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
     )
     # Leaderboard loading logic
     leaderboard_type_toggle.change(
         fn=load_leaderboard,
         inputs=[leaderboard_type_toggle],
+        outputs=[leaderboard_table_output],
+        show_progress='minimal'
     )
+    refresh_button.click(
         fn=load_leaderboard,
         inputs=[leaderboard_type_toggle],
+        outputs=[leaderboard_table_output],
+        show_progress='full'
     )
 # Launch the Gradio app