Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -6,744 +6,532 @@ import torch
--- a/app.py (old)

import re
import json
import pandas as pd
-import
-import
-import spaces  # Import the spaces library

-#

model_cache = {}

HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"

-all_options = {}
-gr_dropdown_options = []  # This is for initial display only, not used for dynamic updates directly
-
-# Get subjects for MMLU
-try:
-    mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
-    all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
-except Exception as e:
-    print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
-    all_options[MMLU_DATASET] = []
-
-# Get subjects for MMLU-Pro
-try:
-    mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
-    all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
-except Exception as e:
-    print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
-    all_options[MMLU_PRO_DATASET] = []
-
-
def load_model(model_id):
    """
-    Loads a Hugging Face model and
-    Uses a cache to avoid
-    Provides Gradio Info/Error messages for user feedback.
-    Raises an exception if model loading fails.
    """
    gr.Info(f"Attempting to load model: {model_id}...")
    if model_id in model_cache:
-        gr.Info(f"Model '{model_id}'
        return model_cache[model_id]
    try:
-        #
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
-            torch_dtype=
            trust_remote_code=True
        ).to("cuda" if torch.cuda.is_available() else "cpu")

-        # Create a text-generation pipeline
-        generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-
-        # Cache the loaded generator
        model_cache[model_id] = generator
        gr.Info(f"Model '{model_id}' loaded successfully.")
        return generator
    except Exception as e:
-        #
-        raise

def format_prompt(item):
-
-    A.
-
-    C. {item['choices'][2]}
-    D. {item['choices'][3]}
-    Answer:"""
-    return prompt, item['answer']  # Returns the prompt string and the correct choice index (0-3)

-def
    """
-    Extracts the
-    It
    """
-
-    match
-    if match:
-        return match.group(1).upper()  # Ensure it's uppercase
-
-    # Fallback: look for a single capital letter A-D anywhere in the output
-    match = re.search(r"\b([ABCD])\b", output.strip())
-    if match:
-        return match.group(1)
-
-    return None  # Return None if no valid choice letter is found
-
-def get_choice_letter(index):
-    """Converts a numerical choice index (0-3) to a capital letter (A-D)."""
-    if 0 <= index <= 3:
-        return chr(ord('A') + index)
-    return None  # Return None for invalid indices

def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """
-    Evaluates a
-
-    Args:
-        generator: The Hugging Face pipeline for text generation.
-        dataset_id (str): The ID of the dataset (e.g., "cais/mmlu", "cais/mmlu_pro").
-        subject (str): The specific subject/config name within the dataset.
-        sample_count (int): The maximum number of samples to evaluate.
-        progress (gr.Progress): Gradio progress tracker.
-
-    Returns:
-        tuple: (accuracy, list_of_detailed_results)
-    Raises:
-        Exception: If dataset loading fails.
    """
-    gr.Info(f"Loading dataset: {dataset_id}
    try:
-        # Load the
-        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN
    except Exception as e:
-        # Re-raise the exception to be caught by the outer run_evaluation try-except
        raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")

-    #
-
-    dataset = dataset.shuffle(seed=42).select(range(

-
-
-
-        expected_letter = get_choice_letter(answer_idx)
-
-        # Generate only 1 new token for the answer (A, B, C, D)
-        # do_sample=False ensures deterministic output for a given prompt (greedy decoding)
-        output_raw = generator(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]

-        #
-
-        predicted_letter = extract_choice_letter(output_raw)
-
        is_correct = (predicted_letter == expected_letter)
-        correct_count += is_correct

-
            "question": item['question'],
            "choices": item['choices'],
-            "
-            "
-            "
            "is_correct": is_correct,
-            "is_reasoning_model_output": is_reasoning_model_output  # Store the flag
        })
-
-    # Calculate accuracy for the current subject
-    accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0
-    return accuracy, subject_results

-
-    """
-    Main function to orchestrate the evaluation process.
-    Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
-    Returns Gradio.update objects to control UI component visibility and content.
-    """
-    gr.Info("Starting evaluation...")
-    if not model_id:
-        gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.")
-        # Return updates to hide logs/debug and show empty results
-        return "", gr.update(value="", visible=False), gr.update(visible=False), \
-               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
-
-    dataset_id_map = {
-        "MMLU": MMLU_DATASET,
-        "MMLU-Pro": MMLU_PRO_DATASET
-    }
-    current_dataset_id = dataset_id_map.get(benchmark_category)

-    if not current_dataset_id:
-        gr.Error(f"Unknown benchmark category selected: {benchmark_category}. This should not happen.")
-        return "", gr.update(value="", visible=False), gr.update(visible=False), \
-               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    try:
-

-
-
        if subject_name == "ALL":
-
-            if "ALL" in
-
-
-                gr.Warning(f"No subjects found to evaluate for '{benchmark_category}'.")
-                return "", gr.update(value="", visible=False), gr.update(visible=False), \
-                       gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
-
-            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_category} subjects")):
-                gr.Info(f"Evaluating {benchmark_category} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
-                try:
-                    accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
-                    all_evaluation_results.extend(subject_details)
-
-                    num_evaluated_samples = len(subject_details)
-                    num_correct_in_subject = sum(d['is_correct'] for d in subject_details)
-
-                    total_correct_overall += num_correct_in_subject
-                    total_samples_overall += num_evaluated_samples
-                    eval_summary_lines.append(f"- {benchmark_category} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
-                except Exception as e:
-                    gr.Error(f"Skipping {benchmark_category} - {sub} due to an error: {e}")
-                    eval_summary_lines.append(f"- {benchmark_category} - {sub}: Error during evaluation.")
-                    continue
-
-            overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
-            score_string = f"Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
-            score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)

        else:
-
-
-            #
-            formatted_details = "\n\n".join([
-                (
-                    f"### Question:\n{item['question']}\n\n"
-                    + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
-                    + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
-                    + f"**Model Raw Output:** {item['model_raw_output']}\n"
-                    + f"**Expected Answer:** {item['expected_answer_letter']}\n"
-                    + f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
-                    + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
-                )
-                for item in all_evaluation_results
-            ])
-
-        # Record the evaluation result to a JSONL file for the leaderboard
        record = {
            "model_id": model_id,
            "benchmark": benchmark_category,
-            "subject": subject_name,
            "accuracy": overall_accuracy,
-            "
            "timestamp": pd.Timestamp.now().isoformat()
        }
-        with open(
            f.write(json.dumps(record) + "\n")
-
        gr.Info("Evaluation completed successfully!")
-
-

    except Exception as e:
-        error_message =
-
-        gr.Error(

-        # Return updates for
-        return
-
-
-    except Exception as e:
-        gr.Error(f"Error saving file: {e}")
-        return None

def load_leaderboard(benchmark_filter):
    """
-    Loads evaluation data
-
    """
    try:
-
-
-        df
-        df = df.dropna(subset=['accuracy'])
-
        if df.empty:
-
-            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-
-        # Filter data based on the selected benchmark
-        df_filtered = df[df['benchmark'] == benchmark_filter]

        if df_filtered.empty:
-
-
-

-

-    except FileNotFoundError:
-        gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
-        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
    except Exception as e:
        gr.Error(f"Error loading leaderboard: {e}")
-        traceback.print_exc()
-        return pd.DataFrame(columns=["Model ID", "
-
-def update_subject_dropdown_choices(benchmark_category):
-    """
-    Updates the choices for the subject dropdown based on the selected benchmark category.
-    """
-    dataset_id_map = {
-        "MMLU": MMLU_DATASET,
-        "MMLU-Pro": MMLU_PRO_DATASET
-    }
-    selected_dataset_id = dataset_id_map.get(benchmark_category)
-
-    if selected_dataset_id and selected_dataset_id in ALL_BENCHMARK_SUBJECTS:
-        new_choices = ALL_BENCHMARK_SUBJECTS[selected_dataset_id]
-        # Set default value to "ALL" if available, otherwise the first subject
-        default_value = "ALL" if "ALL" in new_choices else (new_choices[0] if new_choices else None)
-        return gr.update(choices=new_choices, value=default_value)
-    else:
-        return gr.update(choices=[], value=None)

# --- Gradio Interface Definition ---
-with gr.Blocks(css="""
-    /* Import Google Font - Inter */
-    @import url('https://fonts.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
-
-    /* General body and container styling */
-    body {
-        font-family: 'Inter', sans-serif;
-        background-color: #eef2f6; /* Lighter background */
-        margin: 0;
-        padding: 20px;
-    }
-    .gradio-container {
-        max-width: 1200px;
-        margin: 20px auto;
-        padding: 40px; /* Increased padding */
-        box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */
-        border-radius: 15px; /* More rounded corners */
-        background-color: #ffffff;
-        border: 1px solid #e0e6ed; /* Subtle border */
-    }
-
-    /* Headings */
-    h1 {
-        color: #1a202c; /* Darker, more professional heading color */
-        text-align: center;
-        margin-bottom: 30px;
-        font-size: 2.8em; /* Slightly larger H1 */
-        font-weight: 700;
-        letter-spacing: -0.03em;
-        text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */
-    }
-    h3 {
-        color: #2d3748;
-        font-size: 1.3em; /* Slightly larger H3 */
-        margin-bottom: 15px;
-        font-weight: 600;
-    }

-
-        font-size:
-        max-width: 800px; /* Constrain width for readability */
-        margin: 0 auto;
-    }
-
-    /* Buttons */
-    .gr-button {
-        background-color: #2f80ed; /* A vibrant, professional blue */
-        color: white;
-        border: none;
-        padding: 14px 30px; /* More padding */
-        border-radius: 10px; /* More rounded */
-        cursor: pointer;
-        transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease;
-        font-size: 1.15em; /* Slightly larger font */
-        font-weight: 600;
-        box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */
-        margin: 5px; /* Add some margin for spacing between buttons */
-    }
-    .gr-button:hover {
-        background-color: #1a6dcd; /* Darker blue on hover */
-        transform: translateY(-3px); /* More pronounced lift effect */
-        box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3);
-    }
-    .gr-button:active {
-        transform: translateY(0);
-        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-    }
-    /* Specific button styling for debug/show details */
-    #debug-button, #show-details-button {
-        background-color: #718096; /* Professional grey */
-        box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2);
-    }
-    #debug-button:hover, #show-details-button:hover {
-        background-color: #5d6d81;
-        box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3);
-    }
-    #download-button {
-        background-color: #38a169; /* Muted green for download */
-        box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2);
-    }
-    #download-button:hover {
-        background-color: #277e50;
-        box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3);
-    }
-
-    /* Input/Output Boxes (Containers) */
-    .gr-box {
-        border: 1px solid #cbd5e0; /* Lighter, subtle border */
-        border-radius: 12px;
-        padding: 25px; /* Increased padding */
-        margin-bottom: 25px;
-        background-color: #f8fafc; /* Very light background */
-        box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */
-    }
-    /* Specific text output boxes (the content inside the containers) */
-    .gr-output-text {
-        white-space: pre-wrap;
-        word-wrap: break-word;
-        background-color: #ffffff; /* White background for readability */
-        border: 1px solid #e2e8f0;
-        border-radius: 8px;
-        padding: 18px; /* More padding */
-        min-height: 120px; /* Ensure a minimum height */
-        box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */
-        color: #2d3748; /* Darker text for readability */
-        font-size: 0.95em;
-        line-height: 1.6;
-    }
-    /* Specific error output style */
-    #error-message-output {
-        background-color: #ffe0e6; /* Light red */
-        border-color: #ff99aa; /* Slightly darker red border */
-        color: #c53030; /* Stronger red text */
-        font-weight: 500;
-        padding: 20px;
-    }
-
-    /* Labels for inputs */
-    .gr-textbox label, .gr-dropdown label, .gr-slider label {
-        font-weight: 600;
-        color: #2d3748; /* Darker label text */
-        margin-bottom: 10px;
-        display: block;
-        font-size: 1.05em; /* Slightly larger label font */
-    }
-
-    /* Tabs styling */
-    .gr-tabs-nav button {
-        font-weight: 600;
-        font-size: 1.1em;
-        padding: 12px 25px; /* More padding for tabs */
-        border-top-left-radius: 10px;
-        border-top-right-radius: 10px;
-        background-color: #ebf4f8; /* Light blueish tab background */
-        color: #4a5568;
-        border: 1px solid #cce0eb; /* Subtle border for tabs */
-        border-bottom: none;
-        transition: background-color 0.3s ease, color 0.3s ease;
-    }
-    .gr-tabs-nav button.selected {
-        background-color: #ffffff; /* White for selected tab */
-        color: #2f80ed; /* Blue for selected text */
-        border-color: #2f80ed;
-        border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */
-    }
-
-    /* Leaderboard specific table styling (general for all leaderboard tables) */
-    .leaderboard-table {
-        border-radius: 12px;
-        box-shadow: 0 4px 15px rgba(0,0,0,0.08);
-        overflow: hidden;
-        margin-bottom: 25px; /* Space between tables */
-    }
-    .leaderboard-table table {
-        border-collapse: separate;
-        border-spacing: 0;
-        width: 100%;
-        background-color: #ffffff;
-    }
-    .leaderboard-table thead th {
-        background-color: #edf2f7; /* Light grey header */
-        color: #2d3748;
        font-weight: 700;
-
-    }
-    .leaderboard-table tbody tr {
-        transition: background-color 0.2s ease;
-    }
-    .leaderboard-table tbody tr:nth-child(odd) {
-        background-color: #f7fafc; /* Zebra striping */
-    }
-    .leaderboard-table tbody tr:hover {
-        background-color: #e6fffa; /* Light teal on hover for rows */
-    }
-    .leaderboard-table tbody td {
-        padding: 12px 20px;
-        border-bottom: 1px solid #ebf4f8;
-        color: #4a5568;
    }
-    .
-
    }
-
-
-    }
-
        display: flex;
-
-        padding: 0px 0px 20px 0px; /* Reduced padding for more compact look */
    }
-    #leaderboard-toggle
-
-        color: #2d3748;
-        padding: 10px 20px;
        border-radius: 8px;
-        background-color: #edf2f7; /* Light background for unselected */
-        border: 1px solid #e2e8f0;
        cursor: pointer;
-        transition:
-
-
-    #leaderboard-toggle label.gr-radio-label:hover {
-        background-color: #e2e8f0; /* Lighter grey on hover */
    }
-
-    /*
-
-        margin-bottom: 20px; /* Space above dropdown */
    }
-
-
-        border: 1px solid #d9e3ed;
-        cursor: pointer;
-        transition: all 0.3s ease;
-        margin: 0 5px;
    }
-
-        background-color: #
-        color: white;
-        border-color: #48bb78;
-        box-shadow: 0 2px 8px rgba(72, 187, 120, 0.2);
    }
-
-        background-color: #
    }
-
-
    }

-
""") as demo:
-    gr.Markdown(""
-
-    """)

    with gr.Tabs():
        with gr.TabItem("🚀 Run Evaluation"):
-            gr.
-
-
            interactive=True,
-
-
            )
-
-
-                interactive=True,
-                min_width=400,
-                visible=False
-            )
-            sample_count_slider = gr.Slider(
-                label="Number of Samples per Subject (1-100)",
-                minimum=1,
-                maximum=100,
-                value=100,
-                step=1,
-                interactive=True,
-                min_width=200,
-                visible=False
-            )
-            run_button = gr.Button("Run Evaluation", elem_classes="gr-button")
-
-            gr.Markdown("<hr>")  # Visual separator
-
-            with gr.Column(elem_classes="gr-box"):
-                acc_output = gr.Textbox(
-                    label="Benchmark Accuracy Results",
-                    interactive=False,
-                    elem_classes="gr-output-text",
-                    lines=5,
-                    placeholder="Evaluation results will appear here."
            )
-
-        # Define button click actions
-        run_button.click(
-            run_evaluation,
-            inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],  # Updated inputs
-            outputs=[
-                acc_output
-            ]
-        )
-
-        # Link benchmark selection radio to subject dropdown
-        benchmark_selection_radio.change(
-            update_subject_dropdown_choices,
-            inputs=[benchmark_selection_radio],
-            outputs=[benchmark_subject_dropdown]
-        )

-

# Launch the Gradio app
-
+++ b/app.py (new)

import re
import json
import pandas as pd
+import traceback
+import spaces

+# --- Environment and Caching ---
+
+# It's good practice to ensure the cache directory exists.
+CACHE_DIR = "evaluation_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+EVAL_FILE = os.path.join(CACHE_DIR, "eval.jsonl")
+
+# Cache to avoid reloading models and dataset configs
model_cache = {}
+benchmark_subject_cache = {}

+# Use environment variable for the Hugging Face token
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
+BENCHMARK_MAP = {
+    "MMLU": MMLU_DATASET,
+    "MMLU-Pro": MMLU_PRO_DATASET
+}

+# --- Data Loading and Preparation ---

+def get_all_benchmark_options():
+    """
+    Fetches and caches the available subjects (configs) for each benchmark dataset.
+    This function now populates a global cache to avoid repeated API calls.
+    """
+    if benchmark_subject_cache:
+        return benchmark_subject_cache
+
+    print("Fetching benchmark configurations for the first time...")
+    for key, dataset_id in BENCHMARK_MAP.items():
+        try:
+            # Fetching dataset configurations requires authentication if the dataset is private
+            subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
+            benchmark_subject_cache[key] = ["ALL"] + subjects
+        except Exception as e:
+            print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
+            benchmark_subject_cache[key] = []
+    print("Benchmark configurations cached.")
+    return benchmark_subject_cache
+
+# Initialize the cache on startup
+ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()
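# After startup the cache maps benchmark names to their subject lists, e.g.
# (subject names illustrative): {"MMLU": ["ALL", "abstract_algebra", ...],
#  "MMLU-Pro": ["ALL", ...]}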

+@spaces.GPU()
def load_model(model_id):
    """
+    Loads a Hugging Face model and tokenizer, creating a text-generation pipeline.
+    Uses a cache to avoid reloading models.
    """
+    if not model_id:
+        raise ValueError("Model ID cannot be empty.")
+
    gr.Info(f"Attempting to load model: {model_id}...")
    if model_id in model_cache:
+        gr.Info(f"Model '{model_id}' found in cache.")
        return model_cache[model_id]
+
    try:
+        # Use bfloat16 for better performance on modern GPUs
+        dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
+
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
+            torch_dtype=dtype,
            trust_remote_code=True
        ).to("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Create the pipeline for text generation
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device=0 if torch.cuda.is_available() else -1
+        )

        model_cache[model_id] = generator
        gr.Info(f"Model '{model_id}' loaded successfully.")
        return generator
    except Exception as e:
+        # Raise a more specific error to be caught by the main evaluation function
+        raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")


+# --- Evaluation Logic ---

def format_prompt(item):
+    """Formats the MMLU question and choices into a standardized prompt."""
+    prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
+    return prompt, item['answer']
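# A rendered prompt looks like this (item contents illustrative):
#   Question: What is the capital of France?
#
#   Choices:
#   A. Berlin
#   B. Madrid
#   C. Paris
#   D. Rome
#
#   Answer: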

+def get_choice_letter(index):
+    """Converts a numerical choice index (0-3) to a letter (A-D)."""
+    return chr(ord('A') + index) if 0 <= index <= 3 else None

+def extract_predicted_letter(output_text):
    """
+    Extracts the predicted letter from the model's output.
+    It looks for a letter (A, B, C, D) immediately following 'Answer:'.
    """
+    match = re.search(r"Answer:\s*([ABCD])", output_text, re.IGNORECASE)
+    return match.group(1).upper() if match else None
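# Behaviour of the regex above (input strings illustrative):
#   extract_predicted_letter("... Answer: b")    -> "B"   (case-insensitive, upper-cased)
#   extract_predicted_letter("The answer is B")  -> None  (no "Answer:" anchor)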

def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """
+    Evaluates a model on a specific subject from a dataset.
    """
+    gr.Info(f"Loading dataset: {dataset_id} ({subject})...")
    try:
+        # Load the 'test' split as it's standard for MMLU evaluation
+        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
    except Exception as e:
        raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")

+    # Shuffle and select a subset of samples for evaluation
+    num_samples = min(sample_count, len(dataset))
+    dataset = dataset.shuffle(seed=42).select(range(num_samples))
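    # Note: the fixed seed means repeated runs draw the same question subset,
    # which keeps scores comparable across models evaluated with equal sample counts.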

+    correct_predictions = 0
+    results_details = []

+    for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
+        prompt, correct_answer_idx = format_prompt(item)
+        expected_letter = get_choice_letter(correct_answer_idx)

+        # Generate a short response, aiming for a single letter answer.
+        # do_sample=False (greedy decoding) is crucial for reproducibility.
+        raw_output = generator(prompt, max_new_tokens=5, do_sample=False)[0]["generated_text"]
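        # Note: by default the pipeline's "generated_text" includes the prompt
        # itself, which already ends in "Answer:", so the extractor below matches
        # the first letter the model appends after that anchor.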

+        predicted_letter = extract_predicted_letter(raw_output)
        is_correct = (predicted_letter == expected_letter)

+        if is_correct:
+            correct_predictions += 1
+
+        results_details.append({
            "question": item['question'],
            "choices": item['choices'],
+            "raw_output": raw_output.strip(),
+            "expected_letter": expected_letter,
+            "predicted_letter": predicted_letter,
            "is_correct": is_correct,
        })

+    accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
+    return accuracy, results_details
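# Each results_details entry then looks like (values illustrative):
#   {"question": "...", "choices": [...], "raw_output": "... Answer: C",
#    "expected_letter": "C", "predicted_letter": "C", "is_correct": True}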


+@spaces.GPU()
+def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
+    """
+    Main function to orchestrate the entire evaluation process.
+    Handles single subject or 'ALL' subjects evaluation.
+    Returns updates for Gradio UI components.
+    """
    try:
+        gr.Info("Starting evaluation...")
+        generator = load_model(model_id)

+        dataset_id = BENCHMARK_MAP.get(benchmark_category)
+        if not dataset_id:
+            raise ValueError(f"Invalid benchmark category: {benchmark_category}")
+
+        all_results_details = []
+        summary_lines = []
+        total_correct = 0
+        total_samples = 0
+
+        subjects_to_run = []
        if subject_name == "ALL":
+            # Copy the cached list so removing "ALL" does not mutate the shared cache
+            subjects_to_run = list(ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []))
+            if "ALL" in subjects_to_run:
+                subjects_to_run.remove("ALL")  # Remove 'ALL' from the list of subjects to run
+        else:
+            subjects_to_run = [subject_name]

+        if not subjects_to_run:
+            gr.Warning(f"No subjects found for '{benchmark_category}'.")
+            return "", "", "", pd.DataFrame().to_dict('records')
+
+        for i, subject in enumerate(subjects_to_run):
+            gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
+            try:
+                accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
+
+                all_results_details.extend(subject_details)
+                num_correct = sum(d['is_correct'] for d in subject_details)
+                num_evaluated = len(subject_details)
+
+                total_correct += num_correct
+                total_samples += num_evaluated
+                summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
+
+            except Exception as e:
+                gr.Error(f"Skipping {subject} due to an error: {e}")
+                summary_lines.append(f"- **{subject}**: Evaluation failed.")
+                continue
+
+        overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
+
+        # --- Prepare Outputs ---
+        if subject_name == "ALL":
+            result_summary = f"### Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}%\n"
+            result_summary += "across {:,} total samples.\n\n---\n\n**Breakdown by Subject:**\n".format(total_samples)
+            result_summary += "\n".join(summary_lines)
        else:
+            result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
+            result_summary += "({:,}/{:,} correct)".format(total_correct, total_samples)
+
+        # Create a detailed DataFrame for inspection
+        df_details = pd.DataFrame(all_results_details)
+
+        # Save results for leaderboard
        record = {
            "model_id": model_id,
            "benchmark": benchmark_category,
            "accuracy": overall_accuracy,
+            "subject": subject_name,
+            "sample_count": total_samples,
            "timestamp": pd.Timestamp.now().isoformat()
        }
+        with open(EVAL_FILE, "a") as f:
            f.write(json.dumps(record) + "\n")
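        # A line appended to eval.jsonl looks like (values illustrative):
        #   {"model_id": "org/model", "benchmark": "MMLU", "accuracy": 62.5,
        #    "subject": "ALL", "sample_count": 1425, "timestamp": "2025-01-01T12:00:00"}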

        gr.Info("Evaluation completed successfully!")
+
+        # Return updates for the UI
+        return (
+            gr.update(value=result_summary, visible=True),
+            gr.update(value="", visible=False),  # Hide error message
+            gr.update(value="", visible=False),  # Hide error details
+            gr.update(value=df_details.to_dict('records'), visible=True)  # Show detailed results table
+        )

    except Exception as e:
+        error_message = f"An unexpected error occurred: {e}"
+        error_details = traceback.format_exc()
+        gr.Error(error_message)

+        # Return error updates for the UI
+        return (
+            gr.update(value="", visible=False),  # Hide results summary
+            gr.update(value=error_message, visible=True),
+            gr.update(value=error_details, visible=True),
+            gr.update(value=pd.DataFrame().to_dict('records'), visible=False)  # Hide detailed results
+        )
+
+# --- UI Helper Functions ---
+
+def update_subject_dropdown(benchmark_category):
+    """Updates the subject dropdown choices based on the selected benchmark."""
+    choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
+    default_value = "ALL" if "ALL" in choices else (choices[0] if choices else None)
+    return gr.update(choices=choices, value=default_value)

def load_leaderboard(benchmark_filter):
    """
+    Loads and processes evaluation data to display on the leaderboard.
+    It now correctly averages scores for models that were evaluated on 'ALL' subjects.
    """
    try:
+        if not os.path.exists(EVAL_FILE):
+            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
+
+        df = pd.read_json(EVAL_FILE, lines=True)
        if df.empty:
+            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')

+        # Coerce accuracy to numeric and filter valid entries
+        df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
+        df.dropna(subset=['accuracy'], inplace=True)
+
+        # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
+        df_filtered = df[df['benchmark'] == benchmark_filter].copy()
+
        if df_filtered.empty:
+            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
+
+        # We are interested in the 'ALL' subject evaluations for the main leaderboard
+        df_all = df_filtered[df_filtered['subject'] == 'ALL'].copy()
+
+        if df_all.empty:
+            return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')
+
+        # Find the latest evaluation for each model
+        df_all['timestamp'] = pd.to_datetime(df_all['timestamp'])
+        latest_evals = df_all.loc[df_all.groupby('model_id')['timestamp'].idxmax()]
+
+        leaderboard_df = latest_evals[['model_id', 'accuracy', 'sample_count']].copy()
+        leaderboard_df.columns = ["Model ID", "Avg. Accuracy (%)", "Total Samples"]

+        # Sort by accuracy while the column is still numeric (sorting the
+        # formatted strings would order "9.50" above "85.00")
+        leaderboard_df = leaderboard_df.sort_values(by="Avg. Accuracy (%)", ascending=False)
+
+        # Format accuracy to 2 decimal places
+        leaderboard_df["Avg. Accuracy (%)"] = leaderboard_df["Avg. Accuracy (%)"].map('{:.2f}'.format)
+
+        return leaderboard_df.to_dict('records')
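        # A leaderboard row then looks like (values illustrative):
        #   {"Model ID": "org/model", "Avg. Accuracy (%)": "62.50", "Total Samples": 1425}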

    except Exception as e:
        gr.Error(f"Error loading leaderboard: {e}")
+        traceback.print_exc()
+        return pd.DataFrame(columns=["Model ID", "Avg. Accuracy (%)", "Total Samples"]).to_dict('records')


# --- Gradio Interface Definition ---

+with gr.Blocks(theme=gr.themes.Soft(), css="""
+    /* --- Global & Layout --- */
+    body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
+    .gradio-container { max-width: 1280px !important; margin: auto; }
+    .gr-box { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; }
+
+    /* --- Typography --- */
+    h1 {
+        text-align: center;
+        font-size: 2.5rem !important;
        font-weight: 700;
+        color: #212529;
+        margin-bottom: 0.5rem;
+        letter-spacing: -1px;
    }
+    .subtitle {
+        text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem;
    }
+
+    /* --- Buttons & Inputs --- */
+    .gr-button {
+        border-radius: 8px !important;
+        font-weight: 600 !important;
+        padding: 10px 20px !important;
+        transition: all 0.2s ease;
+    }
+    .gr-button-primary { box-shadow: 0 4px 10px rgba(0, 123, 255, 0.2); }
+    .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(0, 123, 255, 0.3); }
+
+    /* --- Custom Radio Buttons (Segmented Control) --- */
+    #leaderboard-toggle, #eval-benchmark-selection {
+        background-color: #e9ecef;
+        padding: 5px;
+        border-radius: 10px;
+        display: inline-flex;
+        margin: auto;
+    }
+    #leaderboard-toggle div.gr-form, #eval-benchmark-selection div.gr-form {
        display: flex;
+        gap: 5px;
    }
+    #leaderboard-toggle input[type='radio'], #eval-benchmark-selection input[type='radio'] { display: none; }
+    #leaderboard-toggle label, #eval-benchmark-selection label {
+        padding: 8px 16px;
        border-radius: 8px;
        cursor: pointer;
+        transition: background-color 0.3s, color 0.3s, box-shadow 0.3s;
+        font-weight: 500;
+        color: #495057;
+        background: transparent;
+        border: none;
+        box-shadow: none;
+    }
+    #leaderboard-toggle input[type='radio']:checked + label, #eval-benchmark-selection input[type='radio']:checked + label {
+        background-color: white;
+        color: #007bff;
+        font-weight: 600;
+        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
+
+    /* --- Dataframe / Table Styling --- */
+    .leaderboard-table .gr-dataframe table {
+        border-collapse: collapse;
+        width: 100%;
    }
+    .leaderboard-table .gr-dataframe thead th {
+        background-color: #f8f9fa !important;
+        color: #495057 !important;
+        font-weight: 600 !important;
+        text-align: left;
+        padding: 12px 15px;
+        border-bottom: 2px solid #dee2e6;
    }
+    .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) {
+        background-color: #f8f9fa;
    }
+    .leaderboard-table .gr-dataframe tbody tr:hover {
+        background-color: #e9ecef;
    }
+    .leaderboard-table .gr-dataframe tbody td {
+        padding: 12px 15px;
+        border-bottom: 1px solid #dee2e6;
    }

+    /* --- Error & Result Panes --- */
+    #error-display-box { background-color: #fff3f3; border-color: #ffc9c9; }
+    #error-display-box .gr-label { color: #d9480f !important; font-weight: 600; }
+    #result-summary-box { background-color: #f3f9ff; border-color: #cde4ff; }
""") as demo:
+    gr.Markdown("<h1>🤖 Open LLM Evaluator</h1>")
+    gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard.</p>")

    with gr.Tabs():
+        # --- Evaluation Tab ---
        with gr.TabItem("🚀 Run Evaluation"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    with gr.Box():
+                        gr.Markdown("### 1. Configure Evaluation")
+                        model_id_input = gr.Textbox(
+                            label="Hugging Face Model ID",
+                            placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
+                            interactive=True
+                        )
+                        with gr.Row():
+                            benchmark_selection_radio = gr.Radio(
+                                ["MMLU", "MMLU-Pro"],
+                                label="Benchmark",
+                                value="MMLU",
+                                interactive=True,
+                                elem_id="eval-benchmark-selection",
+                                container=False
+                            )
+                        with gr.Row():
+                            benchmark_subject_dropdown = gr.Dropdown(
+                                label="Subject",
+                                choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
+                                value="ALL",
+                                interactive=True
+                            )
+                        sample_count_slider = gr.Slider(
+                            label="Samples per Subject",
+                            minimum=5, maximum=100, value=25, step=5, interactive=True
+                        )
+
+                        run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
+
+                with gr.Column(scale=3):
+                    gr.Markdown("### 2. View Results")
+
+                    # Panel for displaying the summary of results
+                    with gr.Box(visible=False, elem_id="result-summary-box") as result_summary_box:
+                        result_summary_output = gr.Markdown()
+
+                    # Panel for displaying errors
+                    with gr.Box(visible=False, elem_id="error-display-box") as error_box:
+                        error_output = gr.Textbox(label="Error Message", interactive=False)
+                        error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
+
+                    # Panel for detailed, row-by-row results
+                    with gr.Box(visible=False) as details_box:
+                        gr.Markdown("#### Detailed Evaluation Log")
+                        detailed_results_df = gr.Dataframe(
+                            headers=["Question", "Correct", "Expected", "Predicted", "Raw Output"],
+                            datatype=["str", "bool", "str", "str", "str"],
+                            interactive=False,
+                            row_count=10,
+                            col_count=5
+                        )
+
+        # --- Leaderboard Tab ---
+        with gr.TabItem("🏆 Leaderboard"):
+            with gr.Column():
+                gr.Markdown("<div style='display: flex; justify-content: center; width: 100%; margin-bottom: 20px;'></div>", elem_id="leaderboard-toggle-container")
+                leaderboard_type_toggle = gr.Radio(
+                    ["MMLU", "MMLU-Pro"],
+                    label="Select Benchmark",
+                    value="MMLU",
                    interactive=True,
+                    elem_id="leaderboard-toggle",
+                    container=False
                )
+                leaderboard_table_output = gr.Dataframe(
+                    headers=["Model ID", "Avg. Accuracy (%)", "Total Samples"],
+                    interactive=False,
+                    datatype=["str", "str", "number"],
+                    row_count=15,
+                    elem_classes="leaderboard-table"
                )

+    # --- Event Handlers & Logic ---
+
+    # Update subject dropdown when benchmark type changes
+    benchmark_selection_radio.change(
+        fn=update_subject_dropdown,
+        inputs=[benchmark_selection_radio],
+        outputs=[benchmark_subject_dropdown]
+    )
+
+    # Main evaluation trigger
+    run_button.click(
+        fn=run_evaluation,
+        inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
+        outputs=[result_summary_box, error_box, error_details_output, details_box]
+    ).then(
+        # This chained function updates the component values *after* the visibility is set
+        lambda r, e, d, df: (r, e, d, df.to_dict('records')),
+        inputs=[result_summary_box, error_box, error_details_output, details_box],
+        outputs=[result_summary_output, error_output, error_details_output, detailed_results_df]
+    )
+
+    # Leaderboard loading logic
+    demo.load(
+        fn=load_leaderboard,
+        inputs=[leaderboard_type_toggle],
+        outputs=[leaderboard_table_output]
+    )
+    leaderboard_type_toggle.change(
+        fn=load_leaderboard,
+        inputs=[leaderboard_type_toggle],
+        outputs=[leaderboard_table_output]
+    )
+
+    # When the run button is clicked again, refresh the leaderboard
+    run_button.click(
+        fn=load_leaderboard,
+        inputs=[leaderboard_type_toggle],
+        outputs=[leaderboard_table_output]
+    )
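    # Note: both run_button.click handlers fire on the same click, and this
    # refresh may complete before a long evaluation writes its record, so the
    # newest score can require toggling the benchmark radio to appear.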

# Launch the Gradio app
+if __name__ == "__main__":
+    demo.launch(debug=True)