Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

App Files Community

HeshamHaroon committed 13 days ago

Commit

a5f8ac7

verified ·

1 Parent(s): 6add5d0

Update: Auto-evaluation on Space startup

Browse files

Files changed (4) hide show

README.md +10 -9
afcl/app.py +274 -290
app.py +3 -3
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -29,22 +29,23 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
 4. Handle parallel and complex function calls
 5. Detect when no function should be called
 ## Dataset
-The benchmark includes **1,470+ samples** across 10 categories:
 - Simple, Multiple, Parallel, Parallel Multiple
 - Irrelevance Detection
 - Dialect Handling (Egyptian, Gulf, Levantine)
-- Programming APIs (Java, JavaScript, REST, SQL)
-📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
-## Submit Your Model
-To submit your model for evaluation:
-1. Go to the "Submit" tab
-2. Fill in your model details
-3. Your model will be added to the evaluation queue
 ## Citation

 4. Handle parallel and complex function calls
 5. Detect when no function should be called
+## Models Evaluated
+- **Arabic-Native**: Jais, ALLaM, SILMA, AceGPT
+- **Multilingual**: Qwen, Llama, Gemma, Mistral, Phi, BLOOMZ, Aya
 ## Dataset
+📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
+- **147 test samples** across 10 categories
 - Simple, Multiple, Parallel, Parallel Multiple
 - Irrelevance Detection
 - Dialect Handling (Egyptian, Gulf, Levantine)
+## Evaluation
+The leaderboard automatically evaluates models using the HuggingFace Inference API when the Space starts.
 ## Citation

afcl/app.py CHANGED Viewed

@@ -2,21 +2,22 @@
 Arabic Function Calling Leaderboard (AFCL)
 ==========================================
-A Gradio-based leaderboard for evaluating LLMs on Arabic function calling.
 """
 import gradio as gr
 import pandas as pd
 import json
 import os
 from pathlib import Path
 from typing import Dict, List, Optional
-# Local imports
-from .data.loader import (
-    load_leaderboard, save_leaderboard, load_benchmark,
-    calculate_overall_score, CATEGORY_WEIGHTS
-)
 # Constants
 TITLE = "🏆 Arabic Function Calling Leaderboard"
@@ -28,330 +29,313 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
 **لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
 """
-# Column definitions
-LEADERBOARD_COLUMNS = {
-    "rank": {"label": "#", "label_en": "#", "type": "number"},
-    "model": {"label": "النموذج", "label_en": "Model", "type": "str"},
-    "organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
-    "overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
-    "simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
-    "multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
-    "parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
-    "parallel_multiple": {"label": "متوازي متعدد", "label_en": "P. Multiple", "type": "number"},
-    "irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
-    "dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
-    "status": {"label": "الحالة", "label_en": "Status", "type": "str"},
-}
-# Empty sample - will load from file
-SAMPLE_LEADERBOARD = []
-def get_leaderboard_data() -> List[Dict]:
-    """Load leaderboard data from file or return sample data."""
     try:
-        data = load_leaderboard("data/leaderboard.json")
-        if data:
-            return data
-    except Exception:
-        pass
-    return SAMPLE_LEADERBOARD
-def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
-    """Convert leaderboard data to pandas DataFrame."""
-    if not data:
-        return pd.DataFrame()
-    df = pd.DataFrame(data)
-    # Select columns to display (fewer columns for cleaner view)
-    display_cols = ["rank", "model", "organization", "overall", "status"]
-    df = df[[c for c in display_cols if c in df.columns]]
-    # Rename columns based on language preference
-    column_mapping = {}
-    for col, info in LEADERBOARD_COLUMNS.items():
-        if col in df.columns:
-            label = info["label"] if use_arabic else info["label_en"]
-            column_mapping[col] = label
-    df = df.rename(columns=column_mapping)
-    # Format numeric columns (show as percentage, but mark 0.0 as "-")
     for col in df.columns:
-        if df[col].dtype in ['float64', 'float32']:
-            df[col] = df[col].apply(lambda x: "-" if x == 0.0 else f"{x:.1f}%")
-    # Format status column
-    status_col = "الحالة" if use_arabic else "Status"
-    if status_col in df.columns:
-        df[status_col] = df[status_col].apply(
-            lambda x: "⏳ قيد الانتظار" if x == "pending" else "✅ مكتمل"
-            if use_arabic else "⏳ Pending" if x == "pending" else "✅ Done"
-        )
     return df
-def create_models_list_tab():
-    """Create the models list tab showing all models to be evaluated."""
-    data = get_leaderboard_data()
-    # Group by organization
-    orgs = {}
-    for entry in data:
-        org = entry.get("organization", "Other")
-        if org not in orgs:
-            orgs[org] = []
-        orgs[org].append(entry)
-    # Create markdown content
-    md_content = """
-## 📋 Models Queue | قائمة النماذج للتقييم
-The following **{total}** models are queued for evaluation on the Arabic Function Calling benchmark:
-النماذج التالية (**{total}** نموذج) في قائمة الانتظار للتقييم:
----
-""".format(total=len(data))
-    for org, models in sorted(orgs.items()):
-        md_content += f"### {org}\n"
-        for m in models:
-            model_url = m.get("model_url", "#")
-            md_content += f"- [{m['model']}]({model_url}) - ⏳ Pending\n"
-        md_content += "\n"
-    return gr.Markdown(md_content)
-def create_submit_tab():
-    """Create the model submission tab."""
-    with gr.Column():
-        gr.Markdown("""
-        ## 📤 Submit Your Model | أرسل نموذجك
-        To submit a model for evaluation, provide the following information:
-        لإرسال نموذج للتقييم، قدم المعلومات التالية:
-        """)
-        with gr.Row():
-            model_name = gr.Textbox(
-                label="Model Name | اسم النموذج",
-                placeholder="e.g., my-arabic-llm-7b"
-            )
-            model_type = gr.Dropdown(
-                label="Model Type | نوع النموذج",
-                choices=["HuggingFace Hub", "API Endpoint", "Local Model"],
-                value="HuggingFace Hub"
-            )
-        model_path = gr.Textbox(
-            label="Model Path/Endpoint | مسار النموذج",
-            placeholder="e.g., organization/model-name or https://api.example.com/v1"
-        )
-        precision = gr.Dropdown(
-            label="Precision | الدقة",
-            choices=["float16", "bfloat16", "float32", "int8", "int4"],
-            value="float16"
-        )
-        with gr.Row():
-            base_model = gr.Textbox(
-                label="Base Model (if fine-tuned) | النموذج الأساسي",
-                placeholder="e.g., meta-llama/Llama-2-7b"
-            )
-            license_type = gr.Dropdown(
-                label="License | الرخصة",
-                choices=["Apache-2.0", "MIT", "CC-BY-4.0", "Llama 2", "Other"],
-                value="Apache-2.0"
-            )
-        submit_btn = gr.Button("Submit for Evaluation | أرسل للتقييم", variant="primary")
-        result_text = gr.Markdown("")
-        def handle_submission(name, mtype, path, prec, base, lic):
-            if not name or not path:
-                return "❌ Please fill in the required fields | يرجى ملء الحقول المطلوبة"
-            return f"""
-            ✅ **Submission Received | تم استلام الطلب**
-            - Model: {name}
-            - Type: {mtype}
-            - Path: {path}
-            Your model will be evaluated and added to the leaderboard soon.
-            سيتم تقييم نموذجك وإضافته إلى لوحة التقييم قريباً.
-            """
-        submit_btn.click(
-            fn=handle_submission,
-            inputs=[model_name, model_type, model_path, precision, base_model, license_type],
-            outputs=result_text
-        )
-def create_about_tab():
-    """Create the about/methodology tab."""
-    return gr.Markdown("""
-    # About AFCL | عن لوحة التقييم
-    ## Evaluation Categories | فئات التقييم
-    | Category | الفئة | Samples | Description |
-    |----------|-------|---------|-------------|
-    | Simple | بسيط | 200 | Single function, single call |
-    | Multiple | متعدد | 200 | Select correct function from options |
-    | Parallel | متوازي | 200 | Multiple calls of same function |
-    | Parallel Multiple | متوازي متعدد | 200 | Multiple functions, multiple calls |
-    | Irrelevance | اللا صلة | 200 | No function should be called |
-    | Dialect Handling | اللهجات | 150 | Egyptian/Gulf/Levantine queries |
-    | Java | جافا | 100 | Java API function calls |
-    | JavaScript | جافاسكريبت | 50 | JS function calls |
-    | REST | REST | 70 | REST API calls |
-    | SQL | SQL | 100 | SQL query generation |
-    **Total: 1,470 samples**
-    ## Scoring Formula | معادلة التقييم
-    ```
-    Overall Score = Σ (category_score × weight)
-    ```
-    **Weights | الأوزان:**
-    - Simple: 15%
-    - Multiple: 10%
-    - Parallel: 10%
-    - Parallel Multiple: 10%
-    - Irrelevance: 15%
-    - Dialect Handling: 15%
-    - Multi-Turn: 15%
-    - Native Arabic: 10%
-    ## Dataset | مجموعة البيانات
-    📊 **[HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)**
-    - **Total Samples**: 1,470
-    - **Languages**: Arabic (MSA + Dialects) & English
-    - **Categories**: 10 evaluation categories
-    - **Source**: Translated from BFCL with dialect variants
-    ## Citation | الاقتباس
-    ```bibtex
-    @misc{afcl2024,
-        title={Arabic Function Calling Leaderboard},
-        author={Hesham Haroon},
-        year={2024},
-        url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
-    }
-    ```
-    """)
 def create_app():
-    """Create the main Gradio application."""
-    # Load CSS
-    css_path = Path(__file__).parent / "static" / "styles.css"
-    custom_css = ""
-    if css_path.exists():
-        with open(css_path, "r") as f:
-            custom_css = f.read()
-    with gr.Blocks(
-        title="Arabic Function Calling Leaderboard",
-        css=custom_css,
-        theme=gr.themes.Soft()
-    ) as app:
-        # Header
         gr.Markdown(f"""
         <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
-            <h1 style="font-size: 2rem; margin-bottom: 10px;">{TITLE_AR}</h1>
-            <h2 style="font-size: 1.5rem; margin-bottom: 10px;">{TITLE}</h2>
-            <p style="opacity: 0.9;">Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
         </div>
         """)
         gr.Markdown(DESCRIPTION)
-        # Stats row
-        data = get_leaderboard_data()
-        evaluated = len([d for d in data if d.get("status") != "pending"])
-        pending = len([d for d in data if d.get("status") == "pending"])
         with gr.Row():
             gr.Markdown(f"""
             <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
-                <div style="color: #666;">Total Models | إجمالي النماذج</div>
             </div>
             """)
-            gr.Markdown(f"""
-            <div style="text-align: center; padding: 15px; background: #fff3cd; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #856404;">{pending}</div>
-                <div style="color: #856404;">⏳ Pending | قيد الانتظار</div>
             </div>
             """)
             gr.Markdown("""
             <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">1,470</div>
-                <div style="color: #666;">Test Samples | عينات الاختبار</div>
             </div>
             """)
-        # Notice about pending evaluation
-        if pending > 0:
-            gr.Markdown(f"""
-            <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 8px; margin: 15px 0;">
-                ⏳ <strong>Evaluation in Progress | التقييم قيد التنفيذ</strong><br>
-                {pending} models are waiting to be evaluated. Results will be updated as evaluations complete.<br>
-                {pending} نموذج في انتظار التقييم. سيتم تحديث النتائج فور اكتمال التقييم.
-            </div>
-            """)
-        # Tabs
         with gr.Tabs():
-            with gr.TabItem("🏆 Leaderboard | لوحة التقييم"):
-                df = format_leaderboard_dataframe(data, use_arabic=True)
-                gr.DataFrame(
-                    value=df,
-                    interactive=False,
-                    wrap=True,
                 )
-            with gr.TabItem("📋 Models | النماذج"):
-                create_models_list_tab()
-            with gr.TabItem("📤 Submit | إرسال"):
-                create_submit_tab()
-            with gr.TabItem("ℹ️ About | عن المشروع"):
-                create_about_tab()
-        # Footer
         gr.Markdown("""
         ---
-        <div style="text-align: center; color: #666; padding: 20px;">
-            Built with ❤️ for the Arabic NLP community | بُني بحب لمجتمع معالجة اللغة العربية
-            <br>
-            <a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling">Dataset</a> |
-            <a href="https://github.com/HeshamHaroon">GitHub</a>
         </div>
         """)
     return app
-# Main entry point
 if __name__ == "__main__":
-    app = create_app()
     app.launch()

 Arabic Function Calling Leaderboard (AFCL)
 ==========================================
+A Gradio-based leaderboard that evaluates LLMs on Arabic function calling.
+Evaluation runs on HuggingFace Space infrastructure.
 """
 import gradio as gr
 import pandas as pd
 import json
 import os
+import re
+import time
+import requests
 from pathlib import Path
 from typing import Dict, List, Optional
+from threading import Thread
+from datasets import load_dataset
+import huggingface_hub
 # Constants
 TITLE = "🏆 Arabic Function Calling Leaderboard"
 **لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
 """
+# Models to evaluate
+MODELS_TO_EVALUATE = [
+    {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
+    {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
+    {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
+    {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
+    {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
+    {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
+    {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
+    {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
+    {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
+    {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
+    {"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
+]
+# Global state
+LEADERBOARD_DATA = []
+EVALUATION_STATUS = "Not started"
+def load_evaluation_dataset():
+    """Load the Arabic FC dataset from HuggingFace."""
     try:
+        dataset = load_dataset("HeshamHaroon/Arabic_Function_Calling", split="test")
+        samples = []
+        for item in dataset:
+            sample = {
+                'id': item['id'],
+                'query_ar': item['query_ar'],
+                'functions': json.loads(item['functions']) if item['functions'] else [],
+                'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
+                'category': item['category'],
+            }
+            samples.append(sample)
+        return samples
+    except Exception as e:
+        print(f"Error loading dataset: {e}")
+        return []
+def create_prompt(query: str, functions: List[Dict]) -> str:
+    """Create evaluation prompt."""
+    func_desc = "You are a function calling AI. Given the user query and available functions, respond with a JSON function call.\n\nAvailable functions:\n"
+    for f in functions:
+        func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
+    return f"""{func_desc}
+User Query (Arabic): {query}
+Respond ONLY with a JSON object:
+{{"name": "function_name", "arguments": {{"param1": "value1"}}}}
+If no function should be called:
+{{"name": null, "arguments": {{}}}}
+JSON Response:"""
+def call_model(model_id: str, prompt: str) -> str:
+    """Call model via HuggingFace Inference API."""
+    token = os.getenv("HF_TOKEN", "")
+    headers = {"Authorization": f"Bearer {token}"}
+    url = f"https://api-inference.huggingface.co/models/{model_id}"
+    payload = {
+        "inputs": prompt,
+        "parameters": {"max_new_tokens": 200, "temperature": 0.1}
+    }
+    try:
+        response = requests.post(url, headers=headers, json=payload, timeout=60)
+        if response.status_code == 503:
+            time.sleep(20)
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+        result = response.json()
+        if isinstance(result, list) and result:
+            return result[0].get("generated_text", "")
+        return str(result)
+    except:
+        return ""
+def parse_response(response: str) -> Optional[Dict]:
+    """Parse function call from response."""
+    if not response:
+        return None
+    try:
+        return json.loads(response.strip())
+    except:
+        pass
+    match = re.search(r'\{[^{}]*"name"[^{}]*\}', response)
+    if match:
+        try:
+            return json.loads(match.group())
+        except:
+            pass
+    if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
+        return {"name": None}
+    return None
+def evaluate_sample(model_id: str, sample: Dict) -> float:
+    """Evaluate single sample."""
+    query = sample.get('query_ar', '')
+    functions = sample.get('functions', [])
+    category = sample.get('category', '')
+    ground_truth = sample.get('ground_truth')
+    prompt = create_prompt(query, functions)
+    response = call_model(model_id, prompt)
+    parsed = parse_response(response)
+    if category == 'irrelevance':
+        return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0
+    if not ground_truth or not parsed:
+        return 0.0
+    expected = ground_truth.get('calls', [ground_truth])[0] if isinstance(ground_truth, dict) else ground_truth
+    if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
+        return 0.0
+    pred_args = parsed.get('arguments', {})
+    exp_args = expected.get('arguments', {})
+    if not exp_args:
+        return 1.0
+    matched = sum(1 for k, v in exp_args.items() if str(pred_args.get(k, '')).lower() == str(v).lower())
+    return matched / len(exp_args)
+def run_evaluation():
+    """Run full evaluation on all models."""
+    global LEADERBOARD_DATA, EVALUATION_STATUS
+    EVALUATION_STATUS = "Loading dataset..."
+    samples = load_evaluation_dataset()
+    if not samples:
+        EVALUATION_STATUS = "Failed to load dataset"
+        return
+    results = []
+    total_models = len(MODELS_TO_EVALUATE)
+    for idx, model_config in enumerate(MODELS_TO_EVALUATE):
+        model_name = model_config['model']
+        model_id = model_config['model_id']
+        EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."
+        category_scores = {}
+        category_counts = {}
+        for sample in samples:
+            cat = sample.get('category', 'simple')
+            if cat not in category_scores:
+                category_scores[cat] = 0.0
+                category_counts[cat] = 0
+            try:
+                score = evaluate_sample(model_id, sample)
+                category_scores[cat] += score
+            except:
+                pass
+            category_counts[cat] += 1
+            time.sleep(0.5)  # Rate limiting
+        # Calculate scores
+        scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
+                  for cat in category_scores if category_counts[cat] > 0}
+        # Weighted overall
+        weights = {"simple": 0.15, "multiple": 0.10, "parallel": 0.10,
+                   "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
+        overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
+        results.append({
+            "model": model_name,
+            "model_id": model_id,
+            "organization": model_config['organization'],
+            "overall": round(overall, 1),
+            "simple": scores.get('simple', 0),
+            "multiple": scores.get('multiple', 0),
+            "parallel": scores.get('parallel', 0),
+            "parallel_multiple": scores.get('parallel_multiple', 0),
+            "irrelevance": scores.get('irrelevance', 0),
+            "dialect_handling": scores.get('dialect_handling', 0),
+            "status": "completed"
+        })
+    # Sort and rank
+    results = sorted(results, key=lambda x: x['overall'], reverse=True)
+    for i, r in enumerate(results, 1):
+        r['rank'] = i
+    LEADERBOARD_DATA = results
+    EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
+def get_leaderboard_df():
+    """Get leaderboard as DataFrame."""
+    if not LEADERBOARD_DATA:
+        # Return empty with pending status
+        data = [{"rank": i+1, "model": m["model"], "organization": m["organization"],
+                 "overall": "-", "status": "⏳ Pending"}
+                for i, m in enumerate(MODELS_TO_EVALUATE)]
+        return pd.DataFrame(data)
+    df = pd.DataFrame(LEADERBOARD_DATA)
+    cols = ["rank", "model", "organization", "overall", "simple", "multiple",
+            "parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
+    df = df[[c for c in cols if c in df.columns]]
+    # Format percentages
     for col in df.columns:
+        if df[col].dtype in ['float64', 'float32', 'int64']:
+            if col != 'rank':
+                df[col] = df[col].apply(lambda x: f"{x:.1f}%")
     return df
 def create_app():
+    """Create the Gradio app."""
+    with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:
         gr.Markdown(f"""
         <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
+            <h1>{TITLE_AR}</h1>
+            <h2>{TITLE}</h2>
+            <p>Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
         </div>
         """)
         gr.Markdown(DESCRIPTION)
         with gr.Row():
             gr.Markdown(f"""
             <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
+                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
+                <div>Models | النماذج</div>
             </div>
             """)
+            gr.Markdown("""
+            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
+                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
+                <div>Test Samples | عينات</div>
             </div>
             """)
             gr.Markdown("""
             <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
+                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
+                <div>Categories | الفئات</div>
             </div>
             """)
+        status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}")
         with gr.Tabs():
+            with gr.TabItem("🏆 Leaderboard"):
+                leaderboard_df = gr.DataFrame(
+                    value=get_leaderboard_df(),
+                    interactive=False
                 )
+                def refresh_leaderboard():
+                    return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"
+                refresh_btn = gr.Button("🔄 Refresh | تحديث")
+                refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])
+            with gr.TabItem("📊 About"):
+                gr.Markdown("""
+                ## Evaluation Categories
+                | Category | Samples | Description |
+                |----------|---------|-------------|
+                | Simple | ~20 | Single function call |
+                | Multiple | ~20 | Select from multiple functions |
+                | Parallel | ~20 | Multiple calls |
+                | Parallel Multiple | ~20 | Complex multi-call |
+                | Irrelevance | ~20 | Should not call |
+                | Dialect | ~15 | Egyptian/Gulf/Levantine |
+                ## Dataset
+                📊 [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
+                """)
         gr.Markdown("""
         ---
+        <div style="text-align: center; color: #666;">
+            Built for the Arabic NLP community | بُني لمجتمع معالجة اللغة العربية
         </div>
         """)
+        # Start evaluation in background
+        if not LEADERBOARD_DATA:
+            Thread(target=run_evaluation, daemon=True).start()
     return app
+app = create_app()
 if __name__ == "__main__":
     app.launch()

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ Arabic Function Calling Leaderboard - HuggingFace Space Entry Point
 import sys
 sys.path.insert(0, ".")
-from afcl.app import create_app
-app = create_app()
-app.launch()

 import sys
 sys.path.insert(0, ".")
+from afcl.app import app
+if __name__ == "__main__":
+    app.launch()

requirements.txt CHANGED Viewed

@@ -2,4 +2,4 @@ gradio==4.44.0
 huggingface_hub==0.25.0
 datasets>=2.14.0
 pandas>=2.0.0
-plotly>=5.18.0

 huggingface_hub==0.25.0
 datasets>=2.14.0
 pandas>=2.0.0
+requests>=2.28.0