Initial release: Arabic Function Calling Leaderboard
Browse files- README.md +44 -34
- afcl/__init__.py +9 -0
- afcl/app.py +430 -0
- afcl/data/__init__.py +11 -0
- afcl/data/loader.py +169 -0
- afcl/data/schemas.py +399 -0
- afcl/evaluators/__init__.py +11 -0
- afcl/evaluators/arabic_utils.py +141 -0
- afcl/evaluators/ast_evaluator.py +477 -0
- afcl/requirements.txt +21 -0
- afcl/static/styles.css +321 -0
- afcl/submission/__init__.py +10 -0
- afcl/submission/handler.py +117 -0
- afcl/visualization/__init__.py +10 -0
- afcl/visualization/charts.py +363 -0
- app.py +8 -202
- data/leaderboard.json +107 -0
- requirements.txt +5 -16
README.md
CHANGED
|
@@ -1,48 +1,58 @@
|
|
| 1 |
---
|
| 2 |
title: Arabic Function Calling Leaderboard
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
|
|
|
| 7 |
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: apache-2.0
|
| 10 |
-
short_description: Duplicate this leaderboard to initialize your own!
|
| 11 |
-
sdk_version: 5.43.1
|
| 12 |
tags:
|
| 13 |
-
-
|
|
|
|
|
|
|
|
|
|
| 14 |
---
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
"task_name": {
|
| 30 |
-
"metric_name": score,
|
| 31 |
-
},
|
| 32 |
-
"task_name2": {
|
| 33 |
-
"metric_name": score,
|
| 34 |
-
}
|
| 35 |
-
}
|
| 36 |
-
}
|
| 37 |
-
```
|
| 38 |
|
| 39 |
-
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Arabic Function Calling Leaderboard
|
| 3 |
+
emoji: 🏆
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
|
|
|
|
|
|
| 11 |
tags:
|
| 12 |
+
- arabic
|
| 13 |
+
- function-calling
|
| 14 |
+
- leaderboard
|
| 15 |
+
- llm-evaluation
|
| 16 |
---
|
| 17 |
|
| 18 |
+
# 🏆 Arabic Function Calling Leaderboard
|
| 19 |
+
|
| 20 |
+
لوحة تقييم استدعاء الدوال بالعربية
|
| 21 |
+
|
| 22 |
+
## Overview
|
| 23 |
+
|
| 24 |
+
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to:
|
| 25 |
+
|
| 26 |
+
1. Understand Arabic queries (MSA + Dialects)
|
| 27 |
+
2. Select appropriate functions from available options
|
| 28 |
+
3. Extract correct arguments from Arabic text
|
| 29 |
+
4. Handle parallel and complex function calls
|
| 30 |
+
5. Detect when no function should be called
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
## Dataset
|
| 33 |
|
| 34 |
+
The benchmark includes **1,470+ samples** across 10 categories:
|
| 35 |
+
- Simple, Multiple, Parallel, Parallel Multiple
|
| 36 |
+
- Irrelevance Detection
|
| 37 |
+
- Dialect Handling (Egyptian, Gulf, Levantine)
|
| 38 |
+
- Programming APIs (Java, JavaScript, REST, SQL)
|
| 39 |
|
| 40 |
+
📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
|
| 41 |
|
| 42 |
+
## Submit Your Model
|
| 43 |
+
|
| 44 |
+
To submit your model for evaluation:
|
| 45 |
+
1. Go to the "Submit" tab
|
| 46 |
+
2. Fill in your model details
|
| 47 |
+
3. Your model will be added to the evaluation queue
|
| 48 |
+
|
| 49 |
+
## Citation
|
| 50 |
+
|
| 51 |
+
```bibtex
|
| 52 |
+
@misc{afcl2024,
|
| 53 |
+
title={Arabic Function Calling Leaderboard},
|
| 54 |
+
author={Hesham Haroon},
|
| 55 |
+
year={2024},
|
| 56 |
+
url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
|
| 57 |
+
}
|
| 58 |
+
```
|
afcl/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Function Calling Leaderboard (AFCL)
|
| 3 |
+
==========================================
|
| 4 |
+
|
| 5 |
+
A comprehensive leaderboard for evaluating LLMs on function calling capabilities in Arabic.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.1.0"
|
| 9 |
+
__author__ = "Hesham Haroon"
|
afcl/app.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Function Calling Leaderboard (AFCL)
|
| 3 |
+
==========================================
|
| 4 |
+
|
| 5 |
+
A Gradio-based leaderboard for evaluating LLMs on Arabic function calling.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Dict, List, Optional
|
| 14 |
+
|
| 15 |
+
# Local imports
|
| 16 |
+
from .data.loader import (
|
| 17 |
+
load_leaderboard, save_leaderboard, load_benchmark,
|
| 18 |
+
calculate_overall_score, CATEGORY_WEIGHTS
|
| 19 |
+
)
|
| 20 |
+
from .visualization.charts import (
|
| 21 |
+
create_radar_chart, create_bar_chart,
|
| 22 |
+
create_category_comparison, create_dialect_breakdown
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# Constants
|
| 26 |
+
TITLE = "🏆 Arabic Function Calling Leaderboard"
|
| 27 |
+
TITLE_AR = "🏆 لوحة تقييم استدعاء الدوال بالعربية"
|
| 28 |
+
|
| 29 |
+
DESCRIPTION = """
|
| 30 |
+
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls.
|
| 31 |
+
|
| 32 |
+
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
# Column definitions
|
| 36 |
+
LEADERBOARD_COLUMNS = {
|
| 37 |
+
"rank": {"label": "المرتبة", "label_en": "Rank", "type": "number"},
|
| 38 |
+
"model": {"label": "النموذج", "label_en": "Model", "type": "str"},
|
| 39 |
+
"overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
|
| 40 |
+
"simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
|
| 41 |
+
"multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
|
| 42 |
+
"parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
|
| 43 |
+
"parallel_multiple": {"label": "متوازي متعدد", "label_en": "Parallel Multiple", "type": "number"},
|
| 44 |
+
"irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
|
| 45 |
+
"dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
# Sample leaderboard data (will be replaced with actual results)
|
| 49 |
+
SAMPLE_LEADERBOARD = [
|
| 50 |
+
{
|
| 51 |
+
"rank": 1,
|
| 52 |
+
"model": "GPT-4o",
|
| 53 |
+
"overall": 78.5,
|
| 54 |
+
"simple": 85.2,
|
| 55 |
+
"multiple": 80.1,
|
| 56 |
+
"parallel": 75.3,
|
| 57 |
+
"parallel_multiple": 72.4,
|
| 58 |
+
"irrelevance": 82.0,
|
| 59 |
+
"dialect_handling": 70.5,
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"rank": 2,
|
| 63 |
+
"model": "Claude 3.5 Sonnet",
|
| 64 |
+
"overall": 76.2,
|
| 65 |
+
"simple": 83.5,
|
| 66 |
+
"multiple": 78.8,
|
| 67 |
+
"parallel": 73.2,
|
| 68 |
+
"parallel_multiple": 70.1,
|
| 69 |
+
"irrelevance": 80.5,
|
| 70 |
+
"dialect_handling": 68.2,
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"rank": 3,
|
| 74 |
+
"model": "Jais-30B",
|
| 75 |
+
"overall": 72.8,
|
| 76 |
+
"simple": 78.5,
|
| 77 |
+
"multiple": 74.2,
|
| 78 |
+
"parallel": 70.8,
|
| 79 |
+
"parallel_multiple": 68.5,
|
| 80 |
+
"irrelevance": 75.2,
|
| 81 |
+
"dialect_handling": 72.0,
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"rank": 4,
|
| 85 |
+
"model": "ALLaM-7B",
|
| 86 |
+
"overall": 68.5,
|
| 87 |
+
"simple": 75.2,
|
| 88 |
+
"multiple": 70.5,
|
| 89 |
+
"parallel": 65.8,
|
| 90 |
+
"parallel_multiple": 62.3,
|
| 91 |
+
"irrelevance": 70.8,
|
| 92 |
+
"dialect_handling": 68.5,
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"rank": 5,
|
| 96 |
+
"model": "Qwen2.5-72B",
|
| 97 |
+
"overall": 74.1,
|
| 98 |
+
"simple": 80.5,
|
| 99 |
+
"multiple": 76.2,
|
| 100 |
+
"parallel": 72.5,
|
| 101 |
+
"parallel_multiple": 69.8,
|
| 102 |
+
"irrelevance": 77.5,
|
| 103 |
+
"dialect_handling": 65.2,
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"rank": 6,
|
| 107 |
+
"model": "SILMA-9B",
|
| 108 |
+
"overall": 65.2,
|
| 109 |
+
"simple": 72.8,
|
| 110 |
+
"multiple": 68.5,
|
| 111 |
+
"parallel": 62.1,
|
| 112 |
+
"parallel_multiple": 58.5,
|
| 113 |
+
"irrelevance": 68.2,
|
| 114 |
+
"dialect_handling": 62.8,
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"rank": 7,
|
| 118 |
+
"model": "Llama-3.1-70B",
|
| 119 |
+
"overall": 71.5,
|
| 120 |
+
"simple": 78.2,
|
| 121 |
+
"multiple": 73.5,
|
| 122 |
+
"parallel": 69.8,
|
| 123 |
+
"parallel_multiple": 66.2,
|
| 124 |
+
"irrelevance": 74.5,
|
| 125 |
+
"dialect_handling": 62.5,
|
| 126 |
+
},
|
| 127 |
+
]
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def get_leaderboard_data() -> List[Dict]:
|
| 131 |
+
"""Load leaderboard data from file or return sample data."""
|
| 132 |
+
try:
|
| 133 |
+
data = load_leaderboard("data/leaderboard.json")
|
| 134 |
+
if data:
|
| 135 |
+
return data
|
| 136 |
+
except Exception:
|
| 137 |
+
pass
|
| 138 |
+
return SAMPLE_LEADERBOARD
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
|
| 142 |
+
"""Convert leaderboard data to pandas DataFrame."""
|
| 143 |
+
df = pd.DataFrame(data)
|
| 144 |
+
|
| 145 |
+
# Rename columns based on language preference
|
| 146 |
+
column_mapping = {}
|
| 147 |
+
for col, info in LEADERBOARD_COLUMNS.items():
|
| 148 |
+
if col in df.columns:
|
| 149 |
+
label = info["label"] if use_arabic else info["label_en"]
|
| 150 |
+
column_mapping[col] = label
|
| 151 |
+
|
| 152 |
+
df = df.rename(columns=column_mapping)
|
| 153 |
+
|
| 154 |
+
# Format numeric columns
|
| 155 |
+
for col in df.columns:
|
| 156 |
+
if df[col].dtype in ['float64', 'float32']:
|
| 157 |
+
df[col] = df[col].apply(lambda x: f"{x:.1f}%")
|
| 158 |
+
|
| 159 |
+
return df
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def create_leaderboard_tab(use_arabic: bool = True):
|
| 163 |
+
"""Create the main leaderboard tab content."""
|
| 164 |
+
data = get_leaderboard_data()
|
| 165 |
+
df = format_leaderboard_dataframe(data, use_arabic)
|
| 166 |
+
|
| 167 |
+
return gr.DataFrame(
|
| 168 |
+
value=df,
|
| 169 |
+
interactive=False,
|
| 170 |
+
wrap=True,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def create_visualization_tab():
|
| 175 |
+
"""Create the visualization tab with charts."""
|
| 176 |
+
data = get_leaderboard_data()
|
| 177 |
+
|
| 178 |
+
# Prepare data for charts
|
| 179 |
+
model_scores = {
|
| 180 |
+
entry["model"]: {k: v for k, v in entry.items() if k not in ["rank", "model"]}
|
| 181 |
+
for entry in data
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
with gr.Row():
|
| 185 |
+
with gr.Column():
|
| 186 |
+
radar_chart = create_radar_chart(
|
| 187 |
+
{k: v for k, v in list(model_scores.items())[:5]},
|
| 188 |
+
use_arabic=True,
|
| 189 |
+
title="مقارنة النماذج - Category Comparison"
|
| 190 |
+
)
|
| 191 |
+
gr.Plot(value=radar_chart)
|
| 192 |
+
|
| 193 |
+
with gr.Row():
|
| 194 |
+
with gr.Column():
|
| 195 |
+
bar_chart = create_bar_chart(
|
| 196 |
+
data,
|
| 197 |
+
metric="overall",
|
| 198 |
+
use_arabic=True,
|
| 199 |
+
title="أفضل النماذج - Top Models"
|
| 200 |
+
)
|
| 201 |
+
gr.Plot(value=bar_chart)
|
| 202 |
+
|
| 203 |
+
with gr.Row():
|
| 204 |
+
category_chart = create_category_comparison(
|
| 205 |
+
data,
|
| 206 |
+
use_arabic=True,
|
| 207 |
+
title="أداء الفئات - Category Performance"
|
| 208 |
+
)
|
| 209 |
+
gr.Plot(value=category_chart)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def create_submit_tab():
|
| 213 |
+
"""Create the model submission tab."""
|
| 214 |
+
with gr.Column():
|
| 215 |
+
gr.Markdown("""
|
| 216 |
+
## 📤 Submit Your Model | أرسل نموذجك
|
| 217 |
+
|
| 218 |
+
To submit a model for evaluation, provide the following information:
|
| 219 |
+
|
| 220 |
+
لإرسال نموذج للتقييم، قدم المعلومات التالية:
|
| 221 |
+
""")
|
| 222 |
+
|
| 223 |
+
with gr.Row():
|
| 224 |
+
model_name = gr.Textbox(
|
| 225 |
+
label="Model Name | اسم النموذج",
|
| 226 |
+
placeholder="e.g., my-arabic-llm-7b"
|
| 227 |
+
)
|
| 228 |
+
model_type = gr.Dropdown(
|
| 229 |
+
label="Model Type | نوع النموذج",
|
| 230 |
+
choices=["HuggingFace Hub", "API Endpoint", "Local Model"],
|
| 231 |
+
value="HuggingFace Hub"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
model_path = gr.Textbox(
|
| 235 |
+
label="Model Path/Endpoint | مسار النموذج",
|
| 236 |
+
placeholder="e.g., organization/model-name or https://api.example.com/v1"
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
precision = gr.Dropdown(
|
| 240 |
+
label="Precision | الدقة",
|
| 241 |
+
choices=["float16", "bfloat16", "float32", "int8", "int4"],
|
| 242 |
+
value="float16"
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
with gr.Row():
|
| 246 |
+
base_model = gr.Textbox(
|
| 247 |
+
label="Base Model (if fine-tuned) | النموذج الأساسي",
|
| 248 |
+
placeholder="e.g., meta-llama/Llama-2-7b"
|
| 249 |
+
)
|
| 250 |
+
license_type = gr.Dropdown(
|
| 251 |
+
label="License | الرخصة",
|
| 252 |
+
choices=["Apache-2.0", "MIT", "CC-BY-4.0", "Llama 2", "Other"],
|
| 253 |
+
value="Apache-2.0"
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
submit_btn = gr.Button("Submit for Evaluation | أرسل للتقييم", variant="primary")
|
| 257 |
+
|
| 258 |
+
result_text = gr.Markdown("")
|
| 259 |
+
|
| 260 |
+
def handle_submission(name, mtype, path, prec, base, lic):
|
| 261 |
+
if not name or not path:
|
| 262 |
+
return "❌ Please fill in the required fields | يرجى ملء الحقول المطلوبة"
|
| 263 |
+
return f"""
|
| 264 |
+
✅ **Submission Received | تم استلام الطلب**
|
| 265 |
+
|
| 266 |
+
- Model: {name}
|
| 267 |
+
- Type: {mtype}
|
| 268 |
+
- Path: {path}
|
| 269 |
+
|
| 270 |
+
Your model will be evaluated and added to the leaderboard soon.
|
| 271 |
+
|
| 272 |
+
سيتم تقييم نموذجك وإضافته إلى لوحة التقييم قريباً.
|
| 273 |
+
"""
|
| 274 |
+
|
| 275 |
+
submit_btn.click(
|
| 276 |
+
fn=handle_submission,
|
| 277 |
+
inputs=[model_name, model_type, model_path, precision, base_model, license_type],
|
| 278 |
+
outputs=result_text
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def create_about_tab():
|
| 283 |
+
"""Create the about/methodology tab."""
|
| 284 |
+
return gr.Markdown("""
|
| 285 |
+
# About AFCL | عن لوحة التقييم
|
| 286 |
+
|
| 287 |
+
## Evaluation Categories | فئات التقييم
|
| 288 |
+
|
| 289 |
+
| Category | الفئة | Description | الوصف |
|
| 290 |
+
|----------|-------|-------------|-------|
|
| 291 |
+
| Simple | بسيط | Single function, single call | دالة واحدة، استدعاء واحد |
|
| 292 |
+
| Multiple | متعدد | Select correct function from options | اختيار الدالة الصحيحة من عدة خيارات |
|
| 293 |
+
| Parallel | متوازي | Multiple calls of same function | استدعاءات متعددة لنفس الدالة |
|
| 294 |
+
| Parallel Multiple | متوازي متعدد | Multiple functions, multiple calls | دوال متعددة، استدعاءات متعددة |
|
| 295 |
+
| Irrelevance | اللا صلة | No function should be called | لا يجب استدعاء أي دالة |
|
| 296 |
+
| Dialect Handling | اللهجات | Egyptian/Gulf/Levantine queries | استعلامات مصرية/خليجية/شامية |
|
| 297 |
+
|
| 298 |
+
## Scoring Formula | معادلة التقييم
|
| 299 |
+
|
| 300 |
+
```
|
| 301 |
+
Overall Score = Σ (category_score × weight)
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
**Weights | الأوزان:**
|
| 305 |
+
- Simple: 15%
|
| 306 |
+
- Multiple: 10%
|
| 307 |
+
- Parallel: 10%
|
| 308 |
+
- Parallel Multiple: 10%
|
| 309 |
+
- Irrelevance: 15%
|
| 310 |
+
- Dialect Handling: 15%
|
| 311 |
+
- Multi-Turn: 15%
|
| 312 |
+
- Native Arabic: 10%
|
| 313 |
+
|
| 314 |
+
## Evaluation Methodology | منهجية التقييم
|
| 315 |
+
|
| 316 |
+
1. **AST-Based Matching**: Function calls are compared using Abstract Syntax Tree matching with Arabic text normalization.
|
| 317 |
+
|
| 318 |
+
2. **Arabic Normalization**: Handles diacritics (tashkeel), alef variants, and Arabic-Indic numerals.
|
| 319 |
+
|
| 320 |
+
3. **Order-Agnostic Parallel Evaluation**: For parallel calls, order doesn't matter - we use bipartite matching.
|
| 321 |
+
|
| 322 |
+
## Dataset | مجموعة البيانات
|
| 323 |
+
|
| 324 |
+
- **Total Samples**: 1,470+
|
| 325 |
+
- **Languages**: Arabic (MSA + Dialects) & English
|
| 326 |
+
- **Source**: Translated from BFCL with additional dialect variants
|
| 327 |
+
|
| 328 |
+
## Citation | الاقتباس
|
| 329 |
+
|
| 330 |
+
```bibtex
|
| 331 |
+
@misc{afcl2024,
|
| 332 |
+
title={Arabic Function Calling Leaderboard},
|
| 333 |
+
author={Hesham Haroon},
|
| 334 |
+
year={2024},
|
| 335 |
+
url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
|
| 336 |
+
}
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
## Contact | التواصل
|
| 340 |
+
|
| 341 |
+
For questions or contributions, please open an issue on the repository.
|
| 342 |
+
|
| 343 |
+
للأسئلة أو المساهمات، يرجى فتح مشكلة في المستودع.
|
| 344 |
+
""")
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def create_app():
|
| 348 |
+
"""Create the main Gradio application."""
|
| 349 |
+
# Load CSS
|
| 350 |
+
css_path = Path(__file__).parent / "static" / "styles.css"
|
| 351 |
+
custom_css = ""
|
| 352 |
+
if css_path.exists():
|
| 353 |
+
with open(css_path, "r") as f:
|
| 354 |
+
custom_css = f.read()
|
| 355 |
+
|
| 356 |
+
with gr.Blocks(
|
| 357 |
+
title="Arabic Function Calling Leaderboard",
|
| 358 |
+
css=custom_css,
|
| 359 |
+
theme=gr.themes.Soft()
|
| 360 |
+
) as app:
|
| 361 |
+
# Header
|
| 362 |
+
gr.Markdown(f"""
|
| 363 |
+
<div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
|
| 364 |
+
<h1 style="font-size: 2rem; margin-bottom: 10px;">{TITLE_AR}</h1>
|
| 365 |
+
<h2 style="font-size: 1.5rem; margin-bottom: 10px;">{TITLE}</h2>
|
| 366 |
+
<p style="opacity: 0.9;">Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
|
| 367 |
+
</div>
|
| 368 |
+
""")
|
| 369 |
+
|
| 370 |
+
gr.Markdown(DESCRIPTION)
|
| 371 |
+
|
| 372 |
+
# Stats row
|
| 373 |
+
data = get_leaderboard_data()
|
| 374 |
+
with gr.Row():
|
| 375 |
+
gr.Markdown(f"""
|
| 376 |
+
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 377 |
+
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
|
| 378 |
+
<div style="color: #666;">Models Evaluated | النماذج المقيّمة</div>
|
| 379 |
+
</div>
|
| 380 |
+
""")
|
| 381 |
+
gr.Markdown("""
|
| 382 |
+
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 383 |
+
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">1,470+</div>
|
| 384 |
+
<div style="color: #666;">Test Samples | عينات الاختبار</div>
|
| 385 |
+
</div>
|
| 386 |
+
""")
|
| 387 |
+
gr.Markdown("""
|
| 388 |
+
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 389 |
+
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
|
| 390 |
+
<div style="color: #666;">Categories | الفئات</div>
|
| 391 |
+
</div>
|
| 392 |
+
""")
|
| 393 |
+
|
| 394 |
+
# Tabs
|
| 395 |
+
with gr.Tabs():
|
| 396 |
+
with gr.TabItem("🏆 Leaderboard | لوحة التقييم"):
|
| 397 |
+
df = format_leaderboard_dataframe(data, use_arabic=True)
|
| 398 |
+
gr.DataFrame(
|
| 399 |
+
value=df,
|
| 400 |
+
interactive=False,
|
| 401 |
+
wrap=True,
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
with gr.TabItem("📊 Visualizations | الرسوم البيانية"):
|
| 405 |
+
create_visualization_tab()
|
| 406 |
+
|
| 407 |
+
with gr.TabItem("📤 Submit | إرسال"):
|
| 408 |
+
create_submit_tab()
|
| 409 |
+
|
| 410 |
+
with gr.TabItem("ℹ️ About | عن المشروع"):
|
| 411 |
+
create_about_tab()
|
| 412 |
+
|
| 413 |
+
# Footer
|
| 414 |
+
gr.Markdown("""
|
| 415 |
+
---
|
| 416 |
+
<div style="text-align: center; color: #666; padding: 20px;">
|
| 417 |
+
Built with ❤️ for the Arabic NLP community | بُني بحب لمجتمع معالجة اللغة العربية
|
| 418 |
+
<br>
|
| 419 |
+
<a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling">Dataset</a> |
|
| 420 |
+
<a href="https://github.com/HeshamHaroon">GitHub</a>
|
| 421 |
+
</div>
|
| 422 |
+
""")
|
| 423 |
+
|
| 424 |
+
return app
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
# Main entry point
|
| 428 |
+
if __name__ == "__main__":
|
| 429 |
+
app = create_app()
|
| 430 |
+
app.launch()
|
afcl/data/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Data Module
|
| 3 |
+
================
|
| 4 |
+
|
| 5 |
+
Data loading and schema definitions for the Arabic Function Calling Leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .loader import load_benchmark, load_results
|
| 9 |
+
from .schemas import NATIVE_ARABIC_SCHEMAS
|
| 10 |
+
|
| 11 |
+
__all__ = ['load_benchmark', 'load_results', 'NATIVE_ARABIC_SCHEMAS']
|
afcl/data/loader.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Loader
|
| 3 |
+
===========
|
| 4 |
+
|
| 5 |
+
Load benchmark data and evaluation results.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from datasets import load_dataset
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Category weights for overall score
|
| 16 |
+
CATEGORY_WEIGHTS = {
|
| 17 |
+
"simple": 0.15,
|
| 18 |
+
"multiple": 0.10,
|
| 19 |
+
"parallel": 0.10,
|
| 20 |
+
"parallel_multiple": 0.10,
|
| 21 |
+
"irrelevance": 0.15,
|
| 22 |
+
"dialect_handling": 0.15,
|
| 23 |
+
"multi_turn": 0.15,
|
| 24 |
+
"native_arabic": 0.10,
|
| 25 |
+
# Programming categories (included in evaluation but lower weight)
|
| 26 |
+
"java": 0.0,
|
| 27 |
+
"javascript": 0.0,
|
| 28 |
+
"rest": 0.0,
|
| 29 |
+
"sql": 0.0,
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_benchmark(
|
| 34 |
+
dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
|
| 35 |
+
split: str = "test",
|
| 36 |
+
category: Optional[str] = None
|
| 37 |
+
) -> List[Dict]:
|
| 38 |
+
"""
|
| 39 |
+
Load benchmark samples from HuggingFace dataset.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
dataset_name: HuggingFace dataset repository
|
| 43 |
+
split: Dataset split ('train' or 'test')
|
| 44 |
+
category: Optional category filter
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
List of sample dictionaries
|
| 48 |
+
"""
|
| 49 |
+
try:
|
| 50 |
+
dataset = load_dataset(dataset_name, split=split)
|
| 51 |
+
except Exception as e:
|
| 52 |
+
print(f"Error loading dataset: {e}")
|
| 53 |
+
# Fallback to local data
|
| 54 |
+
local_path = Path(__file__).parent.parent.parent / "arabic_fc_dataset" / "checkpoint.json"
|
| 55 |
+
if local_path.exists():
|
| 56 |
+
with open(local_path, 'r', encoding='utf-8') as f:
|
| 57 |
+
data = json.load(f)
|
| 58 |
+
samples = data.get('samples', [])
|
| 59 |
+
if category:
|
| 60 |
+
samples = [s for s in samples if s.get('category') == category]
|
| 61 |
+
return samples
|
| 62 |
+
raise
|
| 63 |
+
|
| 64 |
+
samples = []
|
| 65 |
+
for item in dataset:
|
| 66 |
+
sample = {
|
| 67 |
+
'id': item['id'],
|
| 68 |
+
'query_en': item['query_en'],
|
| 69 |
+
'query_ar': item['query_ar'],
|
| 70 |
+
'functions': json.loads(item['functions']) if item['functions'] else [],
|
| 71 |
+
'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
|
| 72 |
+
'category': item['category'],
|
| 73 |
+
'source': item.get('source', ''),
|
| 74 |
+
'dialect': item.get('dialect', ''),
|
| 75 |
+
}
|
| 76 |
+
if category is None or sample['category'] == category:
|
| 77 |
+
samples.append(sample)
|
| 78 |
+
|
| 79 |
+
return samples
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
|
| 83 |
+
"""
|
| 84 |
+
Load evaluation results for all models.
|
| 85 |
+
|
| 86 |
+
Args:
|
| 87 |
+
results_dir: Directory containing result JSON files
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Dictionary mapping model names to their results
|
| 91 |
+
"""
|
| 92 |
+
results = {}
|
| 93 |
+
results_path = Path(results_dir)
|
| 94 |
+
|
| 95 |
+
if not results_path.exists():
|
| 96 |
+
return results
|
| 97 |
+
|
| 98 |
+
for file_path in results_path.glob("*.json"):
|
| 99 |
+
model_name = file_path.stem
|
| 100 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 101 |
+
results[model_name] = json.load(f)
|
| 102 |
+
|
| 103 |
+
return results
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
|
| 107 |
+
"""
|
| 108 |
+
Load the current leaderboard rankings.
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
List of model entries sorted by overall score
|
| 112 |
+
"""
|
| 113 |
+
path = Path(leaderboard_path)
|
| 114 |
+
if not path.exists():
|
| 115 |
+
return []
|
| 116 |
+
|
| 117 |
+
with open(path, 'r', encoding='utf-8') as f:
|
| 118 |
+
data = json.load(f)
|
| 119 |
+
|
| 120 |
+
return sorted(data, key=lambda x: x.get('overall', 0), reverse=True)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json"):
|
| 124 |
+
"""Save leaderboard data to file."""
|
| 125 |
+
path = Path(leaderboard_path)
|
| 126 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 127 |
+
|
| 128 |
+
# Sort by overall score
|
| 129 |
+
sorted_entries = sorted(entries, key=lambda x: x.get('overall', 0), reverse=True)
|
| 130 |
+
|
| 131 |
+
# Add ranks
|
| 132 |
+
for i, entry in enumerate(sorted_entries, 1):
|
| 133 |
+
entry['rank'] = i
|
| 134 |
+
|
| 135 |
+
with open(path, 'w', encoding='utf-8') as f:
|
| 136 |
+
json.dump(sorted_entries, f, ensure_ascii=False, indent=2)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def calculate_overall_score(category_scores: Dict[str, float]) -> float:
|
| 140 |
+
"""
|
| 141 |
+
Calculate weighted overall score from category scores.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
category_scores: Dictionary mapping category names to scores (0-100)
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
Overall weighted score (0-100)
|
| 148 |
+
"""
|
| 149 |
+
total_weight = 0
|
| 150 |
+
weighted_sum = 0
|
| 151 |
+
|
| 152 |
+
for category, weight in CATEGORY_WEIGHTS.items():
|
| 153 |
+
if category in category_scores and weight > 0:
|
| 154 |
+
weighted_sum += category_scores[category] * weight
|
| 155 |
+
total_weight += weight
|
| 156 |
+
|
| 157 |
+
if total_weight == 0:
|
| 158 |
+
return 0.0
|
| 159 |
+
|
| 160 |
+
return weighted_sum / total_weight
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
|
| 164 |
+
"""Get sample counts by category."""
|
| 165 |
+
stats = {}
|
| 166 |
+
for sample in samples:
|
| 167 |
+
category = sample.get('category', 'unknown')
|
| 168 |
+
stats[category] = stats.get(category, 0) + 1
|
| 169 |
+
return stats
|
afcl/data/schemas.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Native Arabic Function Schemas
|
| 3 |
+
==============================
|
| 4 |
+
|
| 5 |
+
Bilingual function definitions with Arabic names, descriptions, and examples.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Each entry is an OpenAI-style function schema extended with bilingual
# fields: "name_ar"/"description_ar" mirror the English metadata,
# "enum_ar" mirrors "enum" positionally, and "examples_ar" gives
# Arabic example values for a parameter. The "parameters" payload is
# standard JSON Schema and is what gets sent to models.
NATIVE_ARABIC_SCHEMAS = [
    # Financial Services - الخدمات المالية
    {
        "name": "transfer_money",
        "name_ar": "تحويل_أموال",
        "description": "Transfer money between bank accounts",
        "description_ar": "تحويل أموال بين الحسابات البنكية",
        "parameters": {
            "type": "object",
            "properties": {
                "from_account": {
                    "type": "string",
                    "description": "Source account number",
                    "description_ar": "رقم الحساب المصدر",
                    "examples_ar": ["SA0380000000608010167519"]
                },
                "to_account": {
                    "type": "string",
                    "description": "Destination account number",
                    "description_ar": "رقم الحساب المستقبل"
                },
                "amount": {
                    "type": "number",
                    "description": "Amount to transfer",
                    "description_ar": "المبلغ المراد تحويله"
                },
                "currency": {
                    "type": "string",
                    "description": "Currency code",
                    "description_ar": "رمز العملة",
                    "enum": ["SAR", "AED", "EGP", "KWD", "QAR"],
                    "examples_ar": ["ريال سعودي", "درهم إماراتي", "جنيه مصري"]
                }
            },
            "required": ["from_account", "to_account", "amount"]
        }
    },
    {
        "name": "check_balance",
        "name_ar": "استعلام_رصيد",
        "description": "Check account balance",
        "description_ar": "الاستعلام عن رصيد الحساب",
        "parameters": {
            "type": "object",
            "properties": {
                "account_number": {
                    "type": "string",
                    "description": "Account number to check",
                    "description_ar": "رقم الحساب للاستعلام"
                }
            },
            "required": ["account_number"]
        }
    },
    {
        "name": "pay_bill",
        "name_ar": "دفع_فاتورة",
        "description": "Pay a utility or service bill",
        "description_ar": "دفع فاتورة خدمات",
        "parameters": {
            "type": "object",
            "properties": {
                "bill_type": {
                    "type": "string",
                    "description": "Type of bill",
                    "description_ar": "نوع الفاتورة",
                    "enum": ["electricity", "water", "telecom", "internet"],
                    "enum_ar": ["كهرباء", "مياه", "اتصالات", "إنترنت"]
                },
                "account_id": {
                    "type": "string",
                    "description": "Bill account/subscriber ID",
                    "description_ar": "رقم المشترك"
                },
                "amount": {
                    "type": "number",
                    "description": "Amount to pay",
                    "description_ar": "المبلغ المراد دفعه"
                }
            },
            "required": ["bill_type", "account_id"]
        }
    },

    # Government Services - الخدمات الحكومية
    {
        "name": "renew_id",
        "name_ar": "تجديد_هوية",
        "description": "Renew national ID card",
        "description_ar": "تجديد بطاقة الهوية الوطنية",
        "parameters": {
            "type": "object",
            "properties": {
                "id_number": {
                    "type": "string",
                    "description": "National ID number",
                    "description_ar": "رقم الهوية الوطنية"
                },
                "reason": {
                    "type": "string",
                    "description": "Reason for renewal",
                    "description_ar": "سبب التجديد",
                    "enum": ["expiry", "damaged", "lost", "data_update"],
                    "enum_ar": ["انتهاء الصلاحية", "تالفة", "مفقودة", "تحديث بيانات"]
                }
            },
            "required": ["id_number"]
        }
    },
    {
        "name": "book_appointment",
        "name_ar": "حجز_موعد",
        "description": "Book an appointment at a government office",
        "description_ar": "حجز موعد في جهة حكومية",
        "parameters": {
            "type": "object",
            "properties": {
                "service_type": {
                    "type": "string",
                    "description": "Type of service",
                    "description_ar": "نوع الخدمة"
                },
                "location": {
                    "type": "string",
                    "description": "Preferred location/branch",
                    "description_ar": "الفرع المفضل",
                    "examples_ar": ["الرياض - العليا", "جدة - الحمراء"]
                },
                "date": {
                    "type": "string",
                    "description": "Preferred date (YYYY-MM-DD)",
                    "description_ar": "التاريخ المفضل"
                },
                "time_slot": {
                    "type": "string",
                    "description": "Preferred time slot",
                    "description_ar": "الفترة الزمنية",
                    "enum": ["morning", "afternoon"],
                    "enum_ar": ["صباحي", "مسائي"]
                }
            },
            "required": ["service_type", "date"]
        }
    },

    # E-commerce - التجارة الإلكترونية
    {
        "name": "search_product",
        "name_ar": "البحث_عن_منتج",
        "description": "Search for products in the store",
        "description_ar": "البحث عن منتجات في المتجر",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                    "description_ar": "كلمات البحث"
                },
                "category": {
                    "type": "string",
                    "description": "Product category",
                    "description_ar": "فئة المنتج",
                    "examples_ar": ["إلكترونيات", "ملابس", "أجهزة منزلية"]
                },
                "min_price": {
                    "type": "number",
                    "description": "Minimum price",
                    "description_ar": "الحد الأدنى للسعر"
                },
                "max_price": {
                    "type": "number",
                    "description": "Maximum price",
                    "description_ar": "الحد الأقصى للسعر"
                },
                "sort_by": {
                    "type": "string",
                    "description": "Sort order",
                    "description_ar": "ترتيب حسب",
                    "enum": ["price_asc", "price_desc", "rating", "newest"],
                    "enum_ar": ["السعر تصاعدي", "السعر تنازلي", "التقييم", "الأحدث"]
                }
            },
            "required": ["query"]
        }
    },
    {
        "name": "add_to_cart",
        "name_ar": "إضافة_للسلة",
        "description": "Add a product to shopping cart",
        "description_ar": "إضافة منتج إلى سلة التسوق",
        "parameters": {
            "type": "object",
            "properties": {
                "product_id": {
                    "type": "string",
                    "description": "Product identifier",
                    "description_ar": "معرف المنتج"
                },
                "quantity": {
                    "type": "integer",
                    "description": "Quantity to add",
                    "description_ar": "الكمية",
                    "default": 1
                },
                "size": {
                    "type": "string",
                    "description": "Product size if applicable",
                    "description_ar": "المقاس"
                },
                "color": {
                    "type": "string",
                    "description": "Product color if applicable",
                    "description_ar": "اللون"
                }
            },
            "required": ["product_id"]
        }
    },

    # Healthcare - الرعاية الصحية
    {
        "name": "book_doctor_appointment",
        "name_ar": "حجز_موعد_طبيب",
        "description": "Book an appointment with a doctor",
        "description_ar": "حجز موعد مع طبيب",
        "parameters": {
            "type": "object",
            "properties": {
                "specialty": {
                    "type": "string",
                    "description": "Medical specialty",
                    "description_ar": "التخصص الطبي",
                    "examples_ar": ["طب عام", "طب أطفال", "طب باطني", "طب عيون", "طب أسنان"]
                },
                "doctor_name": {
                    "type": "string",
                    "description": "Specific doctor name (optional)",
                    "description_ar": "اسم الطبيب (اختياري)"
                },
                "hospital": {
                    "type": "string",
                    "description": "Hospital or clinic name",
                    "description_ar": "اسم المستشفى أو العيادة"
                },
                "date": {
                    "type": "string",
                    "description": "Preferred date",
                    "description_ar": "التاريخ المفضل"
                },
                "reason": {
                    "type": "string",
                    "description": "Reason for visit",
                    "description_ar": "سبب الزيارة"
                }
            },
            "required": ["specialty", "date"]
        }
    },
    {
        "name": "get_lab_results",
        "name_ar": "نتائج_التحاليل",
        "description": "Retrieve laboratory test results",
        "description_ar": "استرجاع نتائج التحاليل المخبرية",
        "parameters": {
            "type": "object",
            "properties": {
                "patient_id": {
                    "type": "string",
                    "description": "Patient ID or file number",
                    "description_ar": "رقم المريض أو الملف"
                },
                "test_type": {
                    "type": "string",
                    "description": "Type of test",
                    "description_ar": "نوع التحليل",
                    "examples_ar": ["تحليل دم شامل", "تحليل سكر", "وظائف كلى", "وظائف كبد"]
                },
                "date_from": {
                    "type": "string",
                    "description": "Start date for results",
                    "description_ar": "تاريخ البداية"
                }
            },
            "required": ["patient_id"]
        }
    },

    # Travel & Transportation - السفر والنقل
    {
        "name": "book_flight",
        "name_ar": "احجز_رحلة",
        "description": "Book a flight between cities",
        "description_ar": "حجز رحلة طيران بين المدن",
        "parameters": {
            "type": "object",
            "properties": {
                "origin": {
                    "type": "string",
                    "description": "Departure city",
                    "description_ar": "مدينة المغادرة",
                    "examples_ar": ["القاهرة", "الرياض", "دبي", "جدة"]
                },
                "destination": {
                    "type": "string",
                    "description": "Arrival city",
                    "description_ar": "مدينة الوصول"
                },
                "date": {
                    "type": "string",
                    "description": "Travel date",
                    "description_ar": "تاريخ السفر"
                },
                "return_date": {
                    "type": "string",
                    "description": "Return date (optional)",
                    "description_ar": "تاريخ العودة (اختياري)"
                },
                "passengers": {
                    "type": "integer",
                    "description": "Number of passengers",
                    "description_ar": "عدد المسافرين",
                    "default": 1
                },
                "class": {
                    "type": "string",
                    "description": "Travel class",
                    "description_ar": "درجة السفر",
                    "enum": ["economy", "business", "first"],
                    "enum_ar": ["اقتصادية", "أعمال", "أولى"]
                }
            },
            "required": ["origin", "destination", "date"]
        }
    },

    # Weather - الطقس
    {
        "name": "get_weather",
        "name_ar": "احصل_على_الطقس",
        "description": "Get weather information for a city",
        "description_ar": "الحصول على معلومات الطقس لمدينة",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name",
                    "description_ar": "اسم المدينة",
                    "examples_ar": ["القاهرة", "دبي", "الرياض", "بيروت", "عمان"]
                },
                "days": {
                    "type": "integer",
                    "description": "Number of forecast days",
                    "description_ar": "عدد أيام التوقعات",
                    "default": 1
                }
            },
            "required": ["city"]
        }
    }
]
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def get_schema_by_name(name: str, use_arabic: bool = False) -> dict:
    """Look up a schema by its English or Arabic name.

    When use_arabic is True, returns a localized view that prefers the
    Arabic name/description while keeping the parameters untouched.
    Returns None when no schema matches.
    """
    for schema in NATIVE_ARABIC_SCHEMAS:
        if name not in (schema['name'], schema.get('name_ar')):
            continue
        if not use_arabic:
            return schema
        # Arabic-ified view, falling back to English fields when the
        # Arabic ones are absent.
        return {
            'name': schema.get('name_ar', schema['name']),
            'description': schema.get('description_ar', schema['description']),
            'parameters': schema['parameters'],
        }
    return None
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def get_schemas_by_category(category: str) -> list:
    """Return all schemas belonging to a category.

    Unknown categories yield an empty list. Each schema is resolved via
    get_schema_by_name exactly once (the previous implementation looked
    every name up twice: once for the filter and once for the result).
    """
    category_map = {
        'financial': ['transfer_money', 'check_balance', 'pay_bill'],
        'government': ['renew_id', 'book_appointment'],
        'ecommerce': ['search_product', 'add_to_cart'],
        'healthcare': ['book_doctor_appointment', 'get_lab_results'],
        'travel': ['book_flight'],
        'weather': ['get_weather'],
    }

    names = category_map.get(category, [])
    resolved = (get_schema_by_name(name) for name in names)
    return [schema for schema in resolved if schema]
|
afcl/evaluators/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Evaluators
|
| 3 |
+
===============
|
| 4 |
+
|
| 5 |
+
Evaluation modules for Arabic function calling.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .ast_evaluator import ArabicASTEvaluator
|
| 9 |
+
from .arabic_utils import ArabicNormalizer
|
| 10 |
+
|
| 11 |
+
__all__ = ['ArabicASTEvaluator', 'ArabicNormalizer']
|
afcl/evaluators/arabic_utils.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Text Utilities
|
| 3 |
+
=====================
|
| 4 |
+
|
| 5 |
+
Utilities for normalizing and processing Arabic text for function calling evaluation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
import unicodedata
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ArabicNormalizer:
    """Normalize Arabic text for consistent comparison.

    Each normalization step can be toggled independently via the
    constructor flags. Character-for-character substitutions (alef
    variants and digit forms) are precompiled into str.translate tables
    so each call makes a single C-level pass instead of chained
    str.replace loops over the substitution dicts.
    """

    # Arabic diacritics (tashkeel) to remove
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef variants to normalize
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # آ -> ا
        '\u0623': '\u0627',  # أ -> ا
        '\u0625': '\u0627',  # إ -> ا
        '\u0671': '\u0627',  # ٱ -> ا
    }

    # Ta marbuta to ha
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic numerals to Western
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic numerals (Persian/Urdu)
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
    }

    # Translation tables built once at class creation (str.translate is a
    # single pass; the dicts above remain the public source of truth).
    _ALEF_TABLE = str.maketrans(ALEF_VARIANTS)
    _NUMERAL_TABLE = str.maketrans({**ARABIC_INDIC_NUMERALS, **EXTENDED_NUMERALS})

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def normalize(self, text: str) -> str:
        """Apply all configured normalizations to text.

        Returns "" for empty/None input. Steps run in a fixed order:
        NFC, diacritic removal, alef folding, ta-marbuta folding,
        numeral conversion, lowercasing (affects Latin characters),
        whitespace collapsing.
        """
        if not text:
            return ""

        # Unicode canonical composition first so later passes see a
        # consistent character sequence.
        text = unicodedata.normalize('NFC', text)

        if self.remove_diacritics:
            text = self.ARABIC_DIACRITICS.sub('', text)

        if self.normalize_alef:
            text = text.translate(self._ALEF_TABLE)

        if self.normalize_ta_marbuta:
            text = text.replace(self.TA_MARBUTA, self.HA)

        if self.normalize_numerals:
            text = text.translate(self._NUMERAL_TABLE)

        # Lowercase matters for Latin characters in function names;
        # Arabic script is caseless.
        if self.lowercase:
            text = text.lower()

        # Collapse runs of whitespace and trim the ends.
        if self.strip_whitespace:
            text = ' '.join(text.split())

        return text

    def normalize_for_comparison(self, text: str) -> str:
        """Aggressive normalization for fuzzy matching.

        Applies normalize() and additionally strips all punctuation,
        then re-collapses whitespace.
        """
        text = self.normalize(text)
        # Remove all punctuation (keeps word characters and spaces).
        text = re.sub(r'[^\w\s]', '', text)
        text = ' '.join(text.split())
        return text
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def extract_arabic_numbers(text: str) -> list:
    """Extract numeric tokens (ints or decimals) from Arabic text.

    Arabic-Indic digits are converted to Western digits before matching,
    so both numeral systems are found.
    """
    digits_normalized = ArabicNormalizer(normalize_numerals=True).normalize(text)
    return re.findall(r'\d+(?:\.\d+)?', digits_normalized)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def is_arabic_text(text: str) -> bool:
    """Return True if the text contains at least one Arabic-script character.

    Covers the main Arabic block plus the Supplement and Extended-A ranges.
    """
    return re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', text) is not None
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def detect_dialect(text: str) -> str:
|
| 120 |
+
"""
|
| 121 |
+
Simple dialect detection based on common markers.
|
| 122 |
+
Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
|
| 123 |
+
"""
|
| 124 |
+
text_lower = text.lower()
|
| 125 |
+
|
| 126 |
+
# Egyptian markers
|
| 127 |
+
egyptian_markers = ['ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص']
|
| 128 |
+
if any(marker in text for marker in egyptian_markers):
|
| 129 |
+
return 'egyptian'
|
| 130 |
+
|
| 131 |
+
# Gulf markers
|
| 132 |
+
gulf_markers = ['شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله']
|
| 133 |
+
if any(marker in text for marker in gulf_markers):
|
| 134 |
+
return 'gulf'
|
| 135 |
+
|
| 136 |
+
# Levantine markers
|
| 137 |
+
levantine_markers = ['كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي']
|
| 138 |
+
if any(marker in text for marker in levantine_markers):
|
| 139 |
+
return 'levantine'
|
| 140 |
+
|
| 141 |
+
return 'msa'
|
afcl/evaluators/ast_evaluator.py
ADDED
|
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AST-Based Function Call Evaluator
|
| 3 |
+
=================================
|
| 4 |
+
|
| 5 |
+
Evaluates model predictions against ground truth using AST-based matching.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from .arabic_utils import ArabicNormalizer
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class EvaluationResult:
    """Result of evaluating a single sample."""
    # Identifier of the evaluated benchmark sample.
    sample_id: str
    # Category the sample belongs to (used for per-category aggregation).
    category: str
    # Whether the prediction matched ground truth under the chosen mode.
    is_correct: bool
    # Partial-credit score for the sample (presumably in [0, 1] — confirm with callers).
    score: float
    # Free-form diagnostics, e.g. matched/mismatched/missing argument lists.
    details: Dict[str, Any]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class ArabicASTEvaluator:
|
| 26 |
+
"""
|
| 27 |
+
AST-based evaluator for Arabic function calling.
|
| 28 |
+
|
| 29 |
+
Supports multiple evaluation modes:
|
| 30 |
+
- exact: Exact match of function name and all arguments
|
| 31 |
+
- relaxed: Allows minor variations in argument values
|
| 32 |
+
- function_only: Only checks if correct function was called
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
    def __init__(self, mode: str = "exact"):
        # Evaluation mode: 'exact', 'relaxed', or 'function_only' (see class docstring).
        # NOTE(review): the mode string is stored but not validated here.
        self.mode = mode
        # Shared Arabic text normalizer used when comparing string values.
        self.normalizer = ArabicNormalizer()
|
| 38 |
+
|
| 39 |
+
def parse_function_call(self, response: str) -> Optional[Dict]:
|
| 40 |
+
"""
|
| 41 |
+
Parse a function call from model response.
|
| 42 |
+
Handles multiple formats:
|
| 43 |
+
- JSON: {"name": "func", "arguments": {...}}
|
| 44 |
+
- OpenAI style: {"function_call": {"name": "func", "arguments": "..."}}
|
| 45 |
+
- Plain text: func(arg1, arg2)
|
| 46 |
+
"""
|
| 47 |
+
if not response:
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
response = response.strip()
|
| 51 |
+
|
| 52 |
+
# Try JSON format first
|
| 53 |
+
try:
|
| 54 |
+
data = json.loads(response)
|
| 55 |
+
if isinstance(data, dict):
|
| 56 |
+
# Direct format
|
| 57 |
+
if 'name' in data and 'arguments' in data:
|
| 58 |
+
args = data['arguments']
|
| 59 |
+
if isinstance(args, str):
|
| 60 |
+
args = json.loads(args)
|
| 61 |
+
return {'name': data['name'], 'arguments': args}
|
| 62 |
+
# OpenAI format
|
| 63 |
+
if 'function_call' in data:
|
| 64 |
+
fc = data['function_call']
|
| 65 |
+
args = fc.get('arguments', {})
|
| 66 |
+
if isinstance(args, str):
|
| 67 |
+
args = json.loads(args)
|
| 68 |
+
return {'name': fc['name'], 'arguments': args}
|
| 69 |
+
# Tool calls format
|
| 70 |
+
if 'tool_calls' in data and data['tool_calls']:
|
| 71 |
+
tc = data['tool_calls'][0]
|
| 72 |
+
func = tc.get('function', tc)
|
| 73 |
+
args = func.get('arguments', {})
|
| 74 |
+
if isinstance(args, str):
|
| 75 |
+
args = json.loads(args)
|
| 76 |
+
return {'name': func['name'], 'arguments': args}
|
| 77 |
+
except (json.JSONDecodeError, KeyError, TypeError):
|
| 78 |
+
pass
|
| 79 |
+
|
| 80 |
+
# Try extracting JSON from text
|
| 81 |
+
json_match = re.search(r'\{[^{}]*"name"[^{}]*\}', response, re.DOTALL)
|
| 82 |
+
if json_match:
|
| 83 |
+
try:
|
| 84 |
+
data = json.loads(json_match.group())
|
| 85 |
+
if 'name' in data:
|
| 86 |
+
args = data.get('arguments', data.get('parameters', {}))
|
| 87 |
+
if isinstance(args, str):
|
| 88 |
+
args = json.loads(args)
|
| 89 |
+
return {'name': data['name'], 'arguments': args}
|
| 90 |
+
except (json.JSONDecodeError, KeyError):
|
| 91 |
+
pass
|
| 92 |
+
|
| 93 |
+
# Try plain text function call format: func(args)
|
| 94 |
+
func_match = re.match(r'(\w+)\s*\((.*)\)', response, re.DOTALL)
|
| 95 |
+
if func_match:
|
| 96 |
+
name = func_match.group(1)
|
| 97 |
+
args_str = func_match.group(2).strip()
|
| 98 |
+
try:
|
| 99 |
+
# Try parsing as JSON
|
| 100 |
+
if args_str.startswith('{'):
|
| 101 |
+
args = json.loads(args_str)
|
| 102 |
+
else:
|
| 103 |
+
# Parse as key=value pairs
|
| 104 |
+
args = {}
|
| 105 |
+
for pair in args_str.split(','):
|
| 106 |
+
if '=' in pair:
|
| 107 |
+
k, v = pair.split('=', 1)
|
| 108 |
+
args[k.strip()] = self._parse_value(v.strip())
|
| 109 |
+
return {'name': name, 'arguments': args}
|
| 110 |
+
except:
|
| 111 |
+
pass
|
| 112 |
+
|
| 113 |
+
return None
|
| 114 |
+
|
| 115 |
+
def parse_multiple_calls(self, response: str) -> List[Dict]:
|
| 116 |
+
"""Parse multiple function calls from response."""
|
| 117 |
+
calls = []
|
| 118 |
+
|
| 119 |
+
if not response:
|
| 120 |
+
return calls
|
| 121 |
+
|
| 122 |
+
# Try JSON array
|
| 123 |
+
try:
|
| 124 |
+
data = json.loads(response)
|
| 125 |
+
if isinstance(data, list):
|
| 126 |
+
for item in data:
|
| 127 |
+
parsed = self.parse_function_call(json.dumps(item))
|
| 128 |
+
if parsed:
|
| 129 |
+
calls.append(parsed)
|
| 130 |
+
return calls
|
| 131 |
+
elif isinstance(data, dict) and 'tool_calls' in data:
|
| 132 |
+
for tc in data['tool_calls']:
|
| 133 |
+
func = tc.get('function', tc)
|
| 134 |
+
args = func.get('arguments', {})
|
| 135 |
+
if isinstance(args, str):
|
| 136 |
+
args = json.loads(args)
|
| 137 |
+
calls.append({'name': func['name'], 'arguments': args})
|
| 138 |
+
return calls
|
| 139 |
+
except (json.JSONDecodeError, KeyError, TypeError):
|
| 140 |
+
pass
|
| 141 |
+
|
| 142 |
+
# Try finding multiple JSON objects
|
| 143 |
+
json_pattern = r'\{[^{}]*"name"[^{}]*\}'
|
| 144 |
+
matches = re.findall(json_pattern, response, re.DOTALL)
|
| 145 |
+
for match in matches:
|
| 146 |
+
parsed = self.parse_function_call(match)
|
| 147 |
+
if parsed:
|
| 148 |
+
calls.append(parsed)
|
| 149 |
+
|
| 150 |
+
# If no calls found, try single call
|
| 151 |
+
if not calls:
|
| 152 |
+
single = self.parse_function_call(response)
|
| 153 |
+
if single:
|
| 154 |
+
calls.append(single)
|
| 155 |
+
|
| 156 |
+
return calls
|
| 157 |
+
|
| 158 |
+
def _parse_value(self, value: str) -> Any:
|
| 159 |
+
"""Parse a string value to appropriate type."""
|
| 160 |
+
value = value.strip().strip('"\'')
|
| 161 |
+
# Try numeric
|
| 162 |
+
try:
|
| 163 |
+
if '.' in value:
|
| 164 |
+
return float(value)
|
| 165 |
+
return int(value)
|
| 166 |
+
except ValueError:
|
| 167 |
+
pass
|
| 168 |
+
# Boolean
|
| 169 |
+
if value.lower() in ('true', 'false'):
|
| 170 |
+
return value.lower() == 'true'
|
| 171 |
+
# None
|
| 172 |
+
if value.lower() in ('none', 'null'):
|
| 173 |
+
return None
|
| 174 |
+
return value
|
| 175 |
+
|
| 176 |
+
def normalize_value(self, value: Any) -> Any:
    """Recursively normalize a value prior to comparison.

    Strings go through the configured normalizer; dicts and
    lists/tuples are rebuilt with each element normalized (tuples are
    converted to lists); every other type passes through unchanged.
    """
    if isinstance(value, str):
        return self.normalizer.normalize(value)
    if isinstance(value, dict):
        return {key: self.normalize_value(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [self.normalize_value(item) for item in value]
    return value
|
| 185 |
+
|
| 186 |
+
def compare_arguments(
    self,
    predicted: Dict[str, Any],
    expected: Dict[str, Any],
    strict: bool = True
) -> Tuple[bool, float, Dict]:
    """
    Compare predicted arguments against expected.

    Args:
        predicted: Argument dict produced by the model.
        expected: Ground-truth argument dict.
        strict: When True, a match requires every expected key to match
            AND no extra keys; the score denominator also counts extras.
            When False, extras are ignored (partial credit).

    Returns:
        (is_match, score, details) where ``details`` lists matched,
        mismatched, missing, and extra keys.
    """
    import math  # local import keeps this block self-contained

    if not expected:
        # Nothing expected: correct only if nothing was predicted.
        return len(predicted) == 0, 1.0 if len(predicted) == 0 else 0.0, {}

    details = {'matched': [], 'mismatched': [], 'missing': [], 'extra': []}

    expected_keys = set(expected.keys())
    predicted_keys = set(predicted.keys())

    # Keys the model omitted, and keys it invented.
    details['missing'] = list(expected_keys - predicted_keys)
    details['extra'] = list(predicted_keys - expected_keys)

    # Compare values on the keys both sides share.
    matched_count = 0
    for key in expected_keys & predicted_keys:
        exp_val = self.normalize_value(expected[key])
        pred_val = self.normalize_value(predicted[key])

        if exp_val == pred_val:
            details['matched'].append(key)
            matched_count += 1
            continue

        # Numeric near-miss: keep the original absolute tolerance (1e-3)
        # and additionally allow a small relative error so that large
        # magnitudes are not rejected for mere float rounding.
        if isinstance(exp_val, (int, float)) and isinstance(pred_val, (int, float)):
            if math.isclose(exp_val, pred_val, rel_tol=1e-6, abs_tol=0.001):
                details['matched'].append(key)
                matched_count += 1
                continue

        details['mismatched'].append({
            'key': key,
            'expected': expected[key],
            'predicted': predicted[key]
        })

    # Aggregate into a single score.
    total_expected = len(expected_keys)
    if strict:
        # All expected keys must match and no extras are allowed; extras
        # also inflate the denominator.
        is_match = (matched_count == total_expected and len(details['extra']) == 0)
        score = matched_count / max(total_expected, len(predicted_keys)) if predicted_keys else 0.0
    else:
        # Partial credit over expected keys only.
        is_match = matched_count == total_expected
        score = matched_count / total_expected if total_expected > 0 else 1.0

    return is_match, score, details
|
| 248 |
+
|
| 249 |
+
def evaluate_single_call(
    self,
    predicted: Optional[Dict],
    expected: Dict
) -> EvaluationResult:
    """Score one predicted function call against one expected call.

    The result is returned with empty ``sample_id``/``category``; the
    caller is responsible for filling those in.
    """
    # An unparseable prediction is an automatic failure.
    if predicted is None:
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=False,
            score=0.0,
            details={'error': 'Failed to parse prediction'}
        )

    # The (normalized) function names must agree before arguments matter.
    predicted_name = self.normalizer.normalize(predicted.get('name', ''))
    expected_name = self.normalizer.normalize(expected.get('name', ''))
    if predicted_name != expected_name:
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=False,
            score=0.0,
            details={
                'error': 'Function name mismatch',
                'expected_name': expected.get('name'),
                'predicted_name': predicted.get('name')
            }
        )

    # Names agree: the argument comparison decides the outcome.
    # "exact" mode requires a strict match (every key, no extras).
    is_match, score, arg_details = self.compare_arguments(
        predicted.get('arguments', {}),
        expected.get('arguments', {}),
        strict=(self.mode == 'exact')
    )
    return EvaluationResult(
        sample_id="",
        category="",
        is_correct=is_match,
        score=score,
        details=arg_details
    )
|
| 296 |
+
|
| 297 |
+
def evaluate_parallel_calls(
    self,
    predicted: List[Dict],
    expected: List[Dict]
) -> EvaluationResult:
    """
    Evaluate parallel function calls (order-agnostic).

    Each predicted call is paired with the best-scoring expected call
    that has not yet been claimed. ``is_correct`` requires every
    expected call to be matched with score 1.0 AND equal call counts;
    ``score`` averages pairing scores over the larger of the two sides.

    NOTE(review): the pairing below is greedy in prediction order, not
    optimal bipartite matching — an early prediction can claim an
    expected call that a later prediction matches better. The returned
    result has empty ``sample_id``/``category``; the caller fills them.
    """
    if len(predicted) == 0 and len(expected) == 0:
        # Nothing expected and nothing produced: trivially correct.
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=True,
            score=1.0,
            details={'matched_calls': 0}
        )

    if len(predicted) == 0:
        # Calls were expected but the model produced none.
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=False,
            score=0.0,
            details={'error': 'No predictions', 'expected_count': len(expected)}
        )

    # Build score matrix: scores[i][j] = score of predicted[i] vs expected[j].
    scores = []
    for pred in predicted:
        row = []
        for exp in expected:
            result = self.evaluate_single_call(pred, exp)
            row.append(result.score)
        scores.append(row)

    # Greedy matching (could use Hungarian algorithm for optimal)
    matched = 0          # count of perfect (score == 1.0) pairings
    total_score = 0.0    # sum of chosen pairing scores
    used_expected = set()
    match_details = []

    for i, pred in enumerate(predicted):
        best_j = -1
        best_score = -1

        # Pick the highest-scoring expected call not yet claimed.
        for j, exp in enumerate(expected):
            if j not in used_expected and scores[i][j] > best_score:
                best_score = scores[i][j]
                best_j = j

        # Only zero-score pairings are discarded entirely.
        if best_j >= 0 and best_score > 0:
            used_expected.add(best_j)
            total_score += best_score
            if best_score == 1.0:
                matched += 1
            match_details.append({
                'predicted': pred,
                'matched_to': expected[best_j],
                'score': best_score
            })

    # Calculate overall score: average over the larger side so both
    # missing and surplus calls dilute the score.
    max_possible = max(len(predicted), len(expected))
    avg_score = total_score / max_possible if max_possible > 0 else 0.0

    is_correct = (matched == len(expected) and len(predicted) == len(expected))

    return EvaluationResult(
        sample_id="",
        category="",
        is_correct=is_correct,
        score=avg_score,
        details={
            'matched_calls': matched,
            'expected_count': len(expected),
            'predicted_count': len(predicted),
            'matches': match_details
        }
    )
|
| 377 |
+
|
| 378 |
+
def evaluate_irrelevance(
    self,
    predicted: Union[str, Dict, List],
    expected_no_call: bool = True
) -> EvaluationResult:
    """
    Score irrelevance detection.

    Args:
        predicted: Raw response text, a single call dict, or a list of
            call dicts.
        expected_no_call: When True the model is rewarded for NOT
            calling any function; when False it must call at least one.

    Returns:
        EvaluationResult with category ``"irrelevance"``.
    """
    # Normalize whatever we received into a list of call dicts.
    if isinstance(predicted, str):
        detected_calls = self.parse_multiple_calls(predicted)
    elif isinstance(predicted, list):
        detected_calls = predicted
    elif isinstance(predicted, dict):
        detected_calls = [predicted] if 'name' in predicted else []
    else:
        detected_calls = []

    made_call = bool(detected_calls)
    # Correct when the model's behavior is the opposite of / equal to
    # what the sample demands: no call for irrelevant prompts, at least
    # one call otherwise.
    is_correct = (made_call != expected_no_call)

    if expected_no_call:
        details = {
            'expected': 'no_call',
            'actual': 'call_made' if made_call else 'no_call',
            'calls_made': detected_calls
        }
    else:
        details = {
            'expected': 'call_required',
            'actual': 'call_made' if made_call else 'no_call'
        }

    return EvaluationResult(
        sample_id="",
        category="irrelevance",
        is_correct=is_correct,
        score=1.0 if is_correct else 0.0,
        details=details
    )
|
| 421 |
+
|
| 422 |
+
def evaluate(
    self,
    sample: Dict,
    prediction: str
) -> EvaluationResult:
    """
    Main evaluation entry point.
    Dispatches to appropriate evaluator based on category.

    Args:
        sample: Benchmark sample; reads 'category' (default 'simple'),
            'id', and 'ground_truth' (which may be a JSON-encoded string
            or an already-parsed object).
        prediction: Raw model output text.

    Returns:
        EvaluationResult with sample_id (and, except for irrelevance,
        category) filled in.
    """
    category = sample.get('category', 'simple')
    sample_id = sample.get('id', '')

    # Parse ground truth
    # Ground truth may arrive JSON-encoded; an unparseable string is
    # treated the same as missing ground truth.
    ground_truth = sample.get('ground_truth')
    if isinstance(ground_truth, str) and ground_truth:
        try:
            ground_truth = json.loads(ground_truth)
        except json.JSONDecodeError:
            ground_truth = None

    # Handle irrelevance
    # Irrelevance samples are correct when NO function call is made.
    if category == 'irrelevance':
        result = self.evaluate_irrelevance(prediction, expected_no_call=True)
        result.sample_id = sample_id
        return result

    # Parse prediction
    if category in ('parallel', 'parallel_multiple'):
        # Multi-call categories: order-agnostic matching against the
        # expected 'calls' list.
        pred_calls = self.parse_multiple_calls(prediction)
        # NOTE(review): assumes ground_truth is a dict with a 'calls'
        # key; if the dataset ever supplies a bare list of calls this
        # silently falls back to an empty expectation — confirm against
        # the dataset schema.
        if ground_truth and 'calls' in ground_truth:
            exp_calls = ground_truth['calls']
        else:
            exp_calls = []
        result = self.evaluate_parallel_calls(pred_calls, exp_calls)
    else:
        # Single-call categories.
        pred_call = self.parse_function_call(prediction)
        if ground_truth:
            if 'calls' in ground_truth and ground_truth['calls']:
                # Wrapped form: use the first expected call.
                exp_call = ground_truth['calls'][0]
            else:
                # Bare form: the ground truth object IS the expected call.
                exp_call = ground_truth
        else:
            # No ground truth available
            result = EvaluationResult(
                sample_id=sample_id,
                category=category,
                is_correct=False,
                score=0.0,
                details={'error': 'No ground truth available'}
            )
            return result
        result = self.evaluate_single_call(pred_call, exp_call)

    result.sample_id = sample_id
    result.category = category
    return result
|
afcl/requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Arabic Function Calling Leaderboard Requirements
|
| 2 |
+
|
| 3 |
+
# Core
|
| 4 |
+
gradio>=4.0.0
|
| 5 |
+
datasets>=2.14.0
|
| 6 |
+
huggingface_hub>=0.19.0
|
| 7 |
+
pandas>=2.0.0
|
| 8 |
+
|
| 9 |
+
# Visualization
|
| 10 |
+
plotly>=5.18.0
|
| 11 |
+
|
| 12 |
+
# Evaluation
|
| 13 |
+
transformers>=4.35.0
|
| 14 |
+
torch>=2.0.0
|
| 15 |
+
|
| 16 |
+
# Arabic NLP
|
| 17 |
+
camel-tools>=1.5.0 # Optional: for advanced Arabic processing
|
| 18 |
+
|
| 19 |
+
# Utilities
|
| 20 |
+
python-dotenv>=1.0.0
|
| 21 |
+
tqdm>=4.66.0
|
afcl/static/styles.css
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Arabic Function Calling Leaderboard - RTL Arabic Styles */
|
| 2 |
+
|
| 3 |
+
/* Import Arabic fonts */
|
| 4 |
+
@import url('https://fonts.googleapis.com/css2?family=Noto+Kufi+Arabic:wght@400;500;600;700&family=Noto+Naskh+Arabic:wght@400;500;600;700&display=swap');
|
| 5 |
+
|
| 6 |
+
/* RTL Support */
|
| 7 |
+
[dir="rtl"],
|
| 8 |
+
.rtl {
|
| 9 |
+
direction: rtl;
|
| 10 |
+
text-align: right;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
/* Arabic text styling */
|
| 14 |
+
.arabic-text {
|
| 15 |
+
font-family: 'Noto Kufi Arabic', 'Noto Naskh Arabic', 'Arial', sans-serif;
|
| 16 |
+
line-height: 1.8;
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
/* Leaderboard header */
|
| 20 |
+
.leaderboard-header {
|
| 21 |
+
background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%);
|
| 22 |
+
color: white;
|
| 23 |
+
padding: 2rem;
|
| 24 |
+
border-radius: 12px;
|
| 25 |
+
margin-bottom: 1.5rem;
|
| 26 |
+
text-align: center;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
.leaderboard-header h1 {
|
| 30 |
+
font-size: 2rem;
|
| 31 |
+
margin-bottom: 0.5rem;
|
| 32 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
.leaderboard-header .subtitle {
|
| 36 |
+
font-size: 1.1rem;
|
| 37 |
+
opacity: 0.9;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
/* Table styling */
|
| 41 |
+
.leaderboard-table {
|
| 42 |
+
width: 100%;
|
| 43 |
+
border-collapse: collapse;
|
| 44 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.leaderboard-table th,
|
| 48 |
+
.leaderboard-table td {
|
| 49 |
+
padding: 12px 16px;
|
| 50 |
+
text-align: center;
|
| 51 |
+
border-bottom: 1px solid #e0e0e0;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.leaderboard-table th {
|
| 55 |
+
background-color: #f5f5f5;
|
| 56 |
+
font-weight: 600;
|
| 57 |
+
color: #333;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
.leaderboard-table tr:hover {
|
| 61 |
+
background-color: #f9f9f9;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
/* Rank badges */
|
| 65 |
+
.rank-badge {
|
| 66 |
+
display: inline-flex;
|
| 67 |
+
align-items: center;
|
| 68 |
+
justify-content: center;
|
| 69 |
+
width: 32px;
|
| 70 |
+
height: 32px;
|
| 71 |
+
border-radius: 50%;
|
| 72 |
+
font-weight: bold;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.rank-1 {
|
| 76 |
+
background: linear-gradient(135deg, #ffd700, #ffed4a);
|
| 77 |
+
color: #8b6914;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.rank-2 {
|
| 81 |
+
background: linear-gradient(135deg, #c0c0c0, #e8e8e8);
|
| 82 |
+
color: #666;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.rank-3 {
|
| 86 |
+
background: linear-gradient(135deg, #cd7f32, #daa520);
|
| 87 |
+
color: #5c3d1e;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.rank-other {
|
| 91 |
+
background-color: #f0f0f0;
|
| 92 |
+
color: #666;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
/* Score cells */
|
| 96 |
+
.score-cell {
|
| 97 |
+
font-weight: 500;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
.score-high {
|
| 101 |
+
color: #2ca02c;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.score-medium {
|
| 105 |
+
color: #1f77b4;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.score-low {
|
| 109 |
+
color: #ff7f0e;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.score-very-low {
|
| 113 |
+
color: #d62728;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/* Model name styling */
|
| 117 |
+
.model-name {
|
| 118 |
+
font-weight: 600;
|
| 119 |
+
color: #1a5f2a;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
/* Category tabs */
|
| 123 |
+
.category-tabs {
|
| 124 |
+
display: flex;
|
| 125 |
+
gap: 8px;
|
| 126 |
+
flex-wrap: wrap;
|
| 127 |
+
margin-bottom: 1rem;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.category-tab {
|
| 131 |
+
padding: 8px 16px;
|
| 132 |
+
border-radius: 20px;
|
| 133 |
+
background-color: #f0f0f0;
|
| 134 |
+
color: #666;
|
| 135 |
+
cursor: pointer;
|
| 136 |
+
transition: all 0.2s ease;
|
| 137 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.category-tab:hover {
|
| 141 |
+
background-color: #e0e0e0;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.category-tab.active {
|
| 145 |
+
background-color: #1a5f2a;
|
| 146 |
+
color: white;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
/* Cards */
|
| 150 |
+
.stat-card {
|
| 151 |
+
background: white;
|
| 152 |
+
border-radius: 12px;
|
| 153 |
+
padding: 1.5rem;
|
| 154 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
| 155 |
+
text-align: center;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
.stat-card .value {
|
| 159 |
+
font-size: 2.5rem;
|
| 160 |
+
font-weight: bold;
|
| 161 |
+
color: #1a5f2a;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.stat-card .label {
|
| 165 |
+
font-size: 0.9rem;
|
| 166 |
+
color: #666;
|
| 167 |
+
margin-top: 0.5rem;
|
| 168 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
/* Language toggle */
|
| 172 |
+
.lang-toggle {
|
| 173 |
+
display: flex;
|
| 174 |
+
gap: 8px;
|
| 175 |
+
justify-content: flex-end;
|
| 176 |
+
margin-bottom: 1rem;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
.lang-btn {
|
| 180 |
+
padding: 6px 12px;
|
| 181 |
+
border-radius: 4px;
|
| 182 |
+
border: 1px solid #ddd;
|
| 183 |
+
background: white;
|
| 184 |
+
cursor: pointer;
|
| 185 |
+
transition: all 0.2s ease;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
.lang-btn.active {
|
| 189 |
+
background-color: #1a5f2a;
|
| 190 |
+
color: white;
|
| 191 |
+
border-color: #1a5f2a;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
/* Submission form */
|
| 195 |
+
.submission-form {
|
| 196 |
+
background: #f9f9f9;
|
| 197 |
+
border-radius: 12px;
|
| 198 |
+
padding: 2rem;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.submission-form h2 {
|
| 202 |
+
margin-bottom: 1.5rem;
|
| 203 |
+
color: #333;
|
| 204 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.form-field {
|
| 208 |
+
margin-bottom: 1.5rem;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
.form-field label {
|
| 212 |
+
display: block;
|
| 213 |
+
margin-bottom: 0.5rem;
|
| 214 |
+
font-weight: 500;
|
| 215 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.form-field input,
|
| 219 |
+
.form-field select {
|
| 220 |
+
width: 100%;
|
| 221 |
+
padding: 10px 12px;
|
| 222 |
+
border: 1px solid #ddd;
|
| 223 |
+
border-radius: 8px;
|
| 224 |
+
font-size: 1rem;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.submit-btn {
|
| 228 |
+
background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%);
|
| 229 |
+
color: white;
|
| 230 |
+
padding: 12px 24px;
|
| 231 |
+
border: none;
|
| 232 |
+
border-radius: 8px;
|
| 233 |
+
font-size: 1rem;
|
| 234 |
+
font-weight: 600;
|
| 235 |
+
cursor: pointer;
|
| 236 |
+
transition: transform 0.2s ease;
|
| 237 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.submit-btn:hover {
|
| 241 |
+
transform: translateY(-2px);
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
/* About section */
|
| 245 |
+
.about-section {
|
| 246 |
+
line-height: 1.8;
|
| 247 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
.about-section h2 {
|
| 251 |
+
color: #1a5f2a;
|
| 252 |
+
margin-top: 2rem;
|
| 253 |
+
margin-bottom: 1rem;
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
.about-section ul {
|
| 257 |
+
padding-right: 2rem;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
.about-section li {
|
| 261 |
+
margin-bottom: 0.5rem;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
/* Responsive design */
|
| 265 |
+
@media (max-width: 768px) {
|
| 266 |
+
.leaderboard-header h1 {
|
| 267 |
+
font-size: 1.5rem;
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
.leaderboard-table {
|
| 271 |
+
font-size: 0.85rem;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.leaderboard-table th,
|
| 275 |
+
.leaderboard-table td {
|
| 276 |
+
padding: 8px 10px;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.stat-card .value {
|
| 280 |
+
font-size: 2rem;
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
/* Dark mode support */
|
| 285 |
+
@media (prefers-color-scheme: dark) {
|
| 286 |
+
.leaderboard-table th {
|
| 287 |
+
background-color: #2a2a2a;
|
| 288 |
+
color: #e0e0e0;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
.leaderboard-table td {
|
| 292 |
+
border-color: #3a3a3a;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.leaderboard-table tr:hover {
|
| 296 |
+
background-color: #2a2a2a;
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
.stat-card {
|
| 300 |
+
background: #1a1a1a;
|
| 301 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
.submission-form {
|
| 305 |
+
background: #1a1a1a;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
.category-tab {
|
| 309 |
+
background-color: #2a2a2a;
|
| 310 |
+
color: #ccc;
|
| 311 |
+
}
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
/* Gradio overrides */
|
| 315 |
+
.gradio-container {
|
| 316 |
+
font-family: 'Noto Kufi Arabic', 'Arial', sans-serif !important;
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
.gradio-container .prose {
|
| 320 |
+
font-family: 'Noto Kufi Arabic', 'Arial', sans-serif !important;
|
| 321 |
+
}
|
afcl/submission/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Submission Module
|
| 3 |
+
======================
|
| 4 |
+
|
| 5 |
+
Model submission and evaluation queue management.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .handler import SubmissionHandler
|
| 9 |
+
|
| 10 |
+
__all__ = ['SubmissionHandler']
|
afcl/submission/handler.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Submission Handler
|
| 3 |
+
==================
|
| 4 |
+
|
| 5 |
+
Handles model submission workflow for the leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, Optional
|
| 13 |
+
from dataclasses import dataclass, asdict
|
| 14 |
+
import uuid
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class Submission:
    """Model submission data.

    One leaderboard submission; serialized to/from JSON by
    SubmissionHandler (_save_submission / get_submission).
    """
    # Short unique identifier (first 8 characters of a uuid4 string,
    # as assigned by SubmissionHandler.create_submission).
    id: str
    # Display name shown on the leaderboard.
    model_name: str
    model_type: str  # "huggingface", "api", "local"
    # Hub repo id, API endpoint, or local path — interpretation
    # presumably depends on model_type; confirm against the evaluator.
    model_path: str
    # Numeric precision requested for evaluation (e.g. "float16").
    precision: str
    # Optional base model name (for fine-tunes).
    base_model: Optional[str]
    # License identifier string (e.g. "Apache-2.0").
    license: str
    # ISO-8601 timestamp from datetime.now().isoformat(); used as the
    # sort key when listing submissions.
    submitted_at: str
    status: str  # "pending", "running", "completed", "failed"
    # Evaluation results, set via SubmissionHandler.update_status.
    results: Optional[Dict] = None
    # Error details, set via SubmissionHandler.update_status on failure.
    error_message: Optional[str] = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SubmissionHandler:
    """Handles model submissions and evaluation queue.

    Each submission is persisted as one JSON file,
    ``<submissions_dir>/<id>.json``.
    """

    def __init__(self, submissions_dir: str = "data/submissions"):
        """Create the handler, ensuring the storage directory exists."""
        self.submissions_dir = Path(submissions_dir)
        self.submissions_dir.mkdir(parents=True, exist_ok=True)

    def create_submission(
        self,
        model_name: str,
        model_type: str,
        model_path: str,
        precision: str = "float16",
        base_model: Optional[str] = None,
        license: str = "Apache-2.0"
    ) -> Submission:
        """Create, persist, and return a new pending submission.

        Args:
            model_name: Display name for the leaderboard.
            model_type: One of "huggingface", "api", "local".
            model_path: Hub id, API endpoint, or filesystem path.
            precision: Numeric precision for evaluation.
            base_model: Optional base model (for fine-tunes).
            license: Model license identifier.

        Returns:
            The newly created Submission with status "pending".
        """
        submission = Submission(
            id=str(uuid.uuid4())[:8],  # short id; collision risk negligible at this scale
            model_name=model_name,
            model_type=model_type,
            model_path=model_path,
            precision=precision,
            base_model=base_model,
            license=license,
            submitted_at=datetime.now().isoformat(),
            status="pending"
        )
        self._save_submission(submission)
        return submission

    def _save_submission(self, submission: Submission):
        """Write a submission to ``<submissions_dir>/<id>.json``."""
        filepath = self.submissions_dir / f"{submission.id}.json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(asdict(submission), f, ensure_ascii=False, indent=2)

    def get_submission(self, submission_id: str) -> Optional[Submission]:
        """Load a submission by ID, or return None if it does not exist."""
        filepath = self.submissions_dir / f"{submission_id}.json"
        if not filepath.exists():
            return None

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return Submission(**data)

    def update_status(
        self,
        submission_id: str,
        status: str,
        results: Optional[Dict] = None,
        error_message: Optional[str] = None
    ):
        """Update a submission's status and optionally attach results/error.

        Silently does nothing when the submission id is unknown.
        """
        submission = self.get_submission(submission_id)
        if submission:
            submission.status = status
            if results:
                submission.results = results
            if error_message:
                submission.error_message = error_message
            self._save_submission(submission)

    def _load_submissions(self) -> list:
        """Load every readable submission file in the storage directory.

        A single corrupt or schema-mismatched JSON file should not break
        listing, so decode/constructor errors are skipped rather than
        raised.
        """
        submissions = []
        for filepath in self.submissions_dir.glob("*.json"):
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                submissions.append(Submission(**data))
            except (json.JSONDecodeError, TypeError):
                continue
        return submissions

    def get_pending_submissions(self) -> list:
        """Get all pending submissions, oldest first."""
        pending = [s for s in self._load_submissions() if s.status == 'pending']
        return sorted(pending, key=lambda s: s.submitted_at)

    def get_all_submissions(self) -> list:
        """Get all submissions, newest first."""
        return sorted(
            self._load_submissions(),
            key=lambda s: s.submitted_at,
            reverse=True
        )
|
afcl/visualization/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Visualization Module
|
| 3 |
+
=========================
|
| 4 |
+
|
| 5 |
+
Charts and visualizations for the leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .charts import create_radar_chart, create_bar_chart, create_category_comparison
|
| 9 |
+
|
| 10 |
+
__all__ = ['create_radar_chart', 'create_bar_chart', 'create_category_comparison']
|
afcl/visualization/charts.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualization Charts
|
| 3 |
+
====================
|
| 4 |
+
|
| 5 |
+
Plotly-based visualizations for the Arabic Function Calling Leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import plotly.graph_objects as go
|
| 9 |
+
import plotly.express as px
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Arabic category names mapping
# Maps internal category ids to Arabic display labels; chart code falls
# back to the raw id when a category is missing from this table.
CATEGORY_NAMES_AR = {
    "simple": "بسيط",
    "multiple": "متعدد",
    "parallel": "متوازي",
    "parallel_multiple": "متوازي متعدد",
    "irrelevance": "اللا صلة",
    "dialect_handling": "اللهجات",
    "multi_turn": "متعدد الأدوار",
    "native_arabic": "العربي الأصلي",
    "java": "جافا",
    "javascript": "جافاسكريبت",
    "rest": "REST",
    "sql": "SQL"
}

# Color palette for models
# Hex colors (the common "tab10" palette); traces cycle through this
# list via index modulo its length.
MODEL_COLORS = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def create_radar_chart(
    model_scores: Dict[str, Dict[str, float]],
    categories: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Model Comparison"
) -> go.Figure:
    """
    Create a radar/spider chart comparing models across categories.

    Args:
        model_scores: Dict mapping model names to per-category scores (0-100).
            Missing categories default to 0.
        categories: Categories to include (defaults to the six main
            evaluation categories).
        use_arabic: Whether to use Arabic category labels.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    if categories is None:
        categories = ["simple", "multiple", "parallel", "parallel_multiple",
                      "irrelevance", "dialect_handling"]

    # Localize axis labels; fall back to the raw key when no Arabic
    # translation is registered.
    if use_arabic:
        labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        labels = list(categories)

    fig = go.Figure()

    # Guard against an explicitly empty category list: with no categories
    # there is nothing to plot, and `values[0]` below would raise
    # IndexError. We still return a valid (empty) figure.
    if categories:
        for i, (model_name, scores) in enumerate(model_scores.items()):
            values = [scores.get(cat, 0) for cat in categories]
            # Repeat the first point so the radar polygon closes on itself.
            values_closed = values + [values[0]]
            labels_closed = labels + [labels[0]]

            fig.add_trace(go.Scatterpolar(
                r=values_closed,
                theta=labels_closed,
                fill='toself',
                name=model_name,
                line_color=MODEL_COLORS[i % len(MODEL_COLORS)],
                opacity=0.7
            ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]  # scores are percentages
            )
        ),
        showlegend=True,
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        font=dict(
            # Arabic-capable font stack when rendering Arabic labels.
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        )
    )

    return fig
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def create_bar_chart(
    leaderboard_data: List[Dict],
    metric: str = "overall",
    top_n: int = 10,
    use_arabic: bool = True,
    title: str = "Top Models"
) -> go.Figure:
    """
    Build a horizontal bar chart of the top-scoring models.

    Args:
        leaderboard_data: List of model entries with scores.
        metric: Score field to rank and display (default: 'overall').
        top_n: How many leading models to show.
        use_arabic: Whether labels are rendered in Arabic.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    def _bar_color(value: float) -> str:
        # Bucket a score into a qualitative color band.
        if value >= 80:
            return '#2ca02c'  # Green
        if value >= 60:
            return '#1f77b4'  # Blue
        if value >= 40:
            return '#ff7f0e'  # Orange
        return '#d62728'      # Red

    # Rank descending, keep the leaders, then flip the slice so the best
    # model renders at the top of the horizontal chart.
    ranked = sorted(
        leaderboard_data,
        key=lambda entry: entry.get(metric, 0),
        reverse=True
    )[:top_n]
    ranked.reverse()

    names = [entry.get('model', entry.get('name', 'Unknown')) for entry in ranked]
    values = [entry.get(metric, 0) for entry in ranked]
    bar_colors = [_bar_color(v) for v in values]

    fig = go.Figure(go.Bar(
        x=values,
        y=names,
        orientation='h',
        marker_color=bar_colors,
        text=[f"{v:.1f}%" for v in values],
        textposition='outside'
    ))

    metric_label = CATEGORY_NAMES_AR.get(metric, metric) if use_arabic else metric

    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]  # small headroom so outside labels fit
        ),
        yaxis=dict(
            title=""
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Grow the figure with the number of bars, never below 400px.
        height=max(400, len(names) * 40)
    )

    return fig
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def create_category_comparison(
    leaderboard_data: List[Dict],
    models: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Category Performance Comparison"
) -> go.Figure:
    """
    Build a grouped bar chart comparing models across the main categories.

    Args:
        leaderboard_data: List of model entries with category scores.
        models: Model names to include (default: top 5 by overall score).
        use_arabic: Whether labels are rendered in Arabic.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    # Fixed set of categories shown in the comparison.
    categories = ["simple", "multiple", "parallel", "parallel_multiple",
                  "irrelevance", "dialect_handling"]

    # Default to the five best entries ranked by their overall score.
    if models is None:
        top_entries = sorted(
            leaderboard_data,
            key=lambda entry: entry.get('overall', 0),
            reverse=True
        )[:5]
        models = [entry.get('model', entry.get('name', 'Unknown'))
                  for entry in top_entries]

    # Index the selected entries by display name for O(1) lookup below.
    entries_by_name = {}
    for entry in leaderboard_data:
        display_name = entry.get('model', entry.get('name', 'Unknown'))
        if display_name in models:
            entries_by_name[display_name] = entry

    # Localized category labels for the x-axis.
    if use_arabic:
        cat_labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        cat_labels = categories

    fig = go.Figure()

    for idx, name in enumerate(models):
        entry = entries_by_name.get(name)
        if entry is None:
            # Requested model not present in the leaderboard data; skip it.
            continue
        fig.add_trace(go.Bar(
            name=name,
            x=cat_labels,
            y=[entry.get(cat, 0) for cat in categories],
            marker_color=MODEL_COLORS[idx % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الفئة" if use_arabic else "Category",
            # Tilt long Arabic labels so they do not overlap.
            tickangle=-45 if use_arabic else 0
        ),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Horizontal legend above the plot area.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        height=500
    )

    return fig
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def create_dialect_breakdown(
    model_scores: Dict[str, Dict[str, float]],
    use_arabic: bool = True,
    title: str = "Dialect Performance"
) -> go.Figure:
    """
    Build a grouped bar chart of performance across Arabic dialects.

    Args:
        model_scores: Dict mapping model names to per-dialect scores
            (keys: msa, egyptian, gulf, levantine; missing keys -> 0).
        use_arabic: Whether labels are rendered in Arabic.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    # Fixed dialect order with localized display names.
    dialects = ["msa", "egyptian", "gulf", "levantine"]
    dialect_labels = {
        "msa": "الفصحى" if use_arabic else "MSA",
        "egyptian": "المصري" if use_arabic else "Egyptian",
        "gulf": "الخليجي" if use_arabic else "Gulf",
        "levantine": "الشامي" if use_arabic else "Levantine"
    }
    axis_labels = [dialect_labels[d] for d in dialects]

    fig = go.Figure()

    for idx, (name, per_dialect) in enumerate(model_scores.items()):
        fig.add_trace(go.Bar(
            name=name,
            x=axis_labels,
            y=[per_dialect.get(d, 0) for d in dialects],
            marker_color=MODEL_COLORS[idx % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(title="اللهجة" if use_arabic else "Dialect"),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        height=400
    )

    return fig
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def create_progress_over_time(
    history_data: List[Dict],
    models: Optional[List[str]] = None,
    title: str = "Performance Over Time"
) -> go.Figure:
    """
    Build a line chart of model performance over time.

    Args:
        history_data: List of evaluation snapshots; each dict is expected
            to carry 'date', 'model', and 'overall' fields.
        models: Optional subset of model names to plot (None = all).
        title: Chart title.

    Returns:
        Plotly Figure object (empty-titled figure when no history exists).
    """
    # No history yet: return a blank figure that still shows the title.
    if not history_data:
        placeholder = go.Figure()
        placeholder.update_layout(title=title)
        return placeholder

    frame = pd.DataFrame(history_data)

    # Restrict to the requested models when a subset is given.
    if models is not None:
        frame = frame[frame['model'].isin(models)]

    fig = px.line(
        frame,
        x='date',
        y='overall',
        color='model',
        title=title,
        labels={'overall': 'Overall Score (%)', 'date': 'Date', 'model': 'Model'}
    )

    fig.update_layout(
        yaxis=dict(range=[0, 100]),
        height=400
    )

    return fig
|
app.py
CHANGED
|
@@ -1,204 +1,10 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
|
| 7 |
-
from
|
| 8 |
-
CITATION_BUTTON_LABEL,
|
| 9 |
-
CITATION_BUTTON_TEXT,
|
| 10 |
-
EVALUATION_QUEUE_TEXT,
|
| 11 |
-
INTRODUCTION_TEXT,
|
| 12 |
-
LLM_BENCHMARKS_TEXT,
|
| 13 |
-
TITLE,
|
| 14 |
-
)
|
| 15 |
-
from src.display.css_html_js import custom_css
|
| 16 |
-
from src.display.utils import (
|
| 17 |
-
BENCHMARK_COLS,
|
| 18 |
-
COLS,
|
| 19 |
-
EVAL_COLS,
|
| 20 |
-
EVAL_TYPES,
|
| 21 |
-
AutoEvalColumn,
|
| 22 |
-
ModelType,
|
| 23 |
-
fields,
|
| 24 |
-
WeightType,
|
| 25 |
-
Precision
|
| 26 |
-
)
|
| 27 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 28 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
-
from src.submission.submit import add_new_eval
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
API.restart_space(repo_id=REPO_ID)
|
| 34 |
-
|
| 35 |
-
### Space initialisation
|
| 36 |
-
try:
|
| 37 |
-
print(EVAL_REQUESTS_PATH)
|
| 38 |
-
snapshot_download(
|
| 39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 40 |
-
)
|
| 41 |
-
except Exception:
|
| 42 |
-
restart_space()
|
| 43 |
-
try:
|
| 44 |
-
print(EVAL_RESULTS_PATH)
|
| 45 |
-
snapshot_download(
|
| 46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 47 |
-
)
|
| 48 |
-
except Exception:
|
| 49 |
-
restart_space()
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 53 |
-
|
| 54 |
-
(
|
| 55 |
-
finished_eval_queue_df,
|
| 56 |
-
running_eval_queue_df,
|
| 57 |
-
pending_eval_queue_df,
|
| 58 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 59 |
-
|
| 60 |
-
def init_leaderboard(dataframe):
|
| 61 |
-
if dataframe is None or dataframe.empty:
|
| 62 |
-
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 63 |
-
return Leaderboard(
|
| 64 |
-
value=dataframe,
|
| 65 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
| 66 |
-
select_columns=SelectColumns(
|
| 67 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
| 68 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
| 69 |
-
label="Select Columns to Display:",
|
| 70 |
-
),
|
| 71 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
| 72 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
| 73 |
-
filter_columns=[
|
| 74 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 75 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
| 76 |
-
ColumnFilter(
|
| 77 |
-
AutoEvalColumn.params.name,
|
| 78 |
-
type="slider",
|
| 79 |
-
min=0.01,
|
| 80 |
-
max=150,
|
| 81 |
-
label="Select the number of parameters (B)",
|
| 82 |
-
),
|
| 83 |
-
ColumnFilter(
|
| 84 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
| 85 |
-
),
|
| 86 |
-
],
|
| 87 |
-
bool_checkboxgroup_label="Hide models",
|
| 88 |
-
interactive=False,
|
| 89 |
-
)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
demo = gr.Blocks(css=custom_css)
|
| 93 |
-
with demo:
|
| 94 |
-
gr.HTML(TITLE)
|
| 95 |
-
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 96 |
-
|
| 97 |
-
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 98 |
-
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 99 |
-
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 100 |
-
|
| 101 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 102 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 103 |
-
|
| 104 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 105 |
-
with gr.Column():
|
| 106 |
-
with gr.Row():
|
| 107 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 108 |
-
|
| 109 |
-
with gr.Column():
|
| 110 |
-
with gr.Accordion(
|
| 111 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
| 112 |
-
open=False,
|
| 113 |
-
):
|
| 114 |
-
with gr.Row():
|
| 115 |
-
finished_eval_table = gr.components.Dataframe(
|
| 116 |
-
value=finished_eval_queue_df,
|
| 117 |
-
headers=EVAL_COLS,
|
| 118 |
-
datatype=EVAL_TYPES,
|
| 119 |
-
row_count=5,
|
| 120 |
-
)
|
| 121 |
-
with gr.Accordion(
|
| 122 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
| 123 |
-
open=False,
|
| 124 |
-
):
|
| 125 |
-
with gr.Row():
|
| 126 |
-
running_eval_table = gr.components.Dataframe(
|
| 127 |
-
value=running_eval_queue_df,
|
| 128 |
-
headers=EVAL_COLS,
|
| 129 |
-
datatype=EVAL_TYPES,
|
| 130 |
-
row_count=5,
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
with gr.Accordion(
|
| 134 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 135 |
-
open=False,
|
| 136 |
-
):
|
| 137 |
-
with gr.Row():
|
| 138 |
-
pending_eval_table = gr.components.Dataframe(
|
| 139 |
-
value=pending_eval_queue_df,
|
| 140 |
-
headers=EVAL_COLS,
|
| 141 |
-
datatype=EVAL_TYPES,
|
| 142 |
-
row_count=5,
|
| 143 |
-
)
|
| 144 |
-
with gr.Row():
|
| 145 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
| 146 |
-
|
| 147 |
-
with gr.Row():
|
| 148 |
-
with gr.Column():
|
| 149 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
| 150 |
-
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 151 |
-
model_type = gr.Dropdown(
|
| 152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 153 |
-
label="Model type",
|
| 154 |
-
multiselect=False,
|
| 155 |
-
value=None,
|
| 156 |
-
interactive=True,
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
with gr.Column():
|
| 160 |
-
precision = gr.Dropdown(
|
| 161 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 162 |
-
label="Precision",
|
| 163 |
-
multiselect=False,
|
| 164 |
-
value="float16",
|
| 165 |
-
interactive=True,
|
| 166 |
-
)
|
| 167 |
-
weight_type = gr.Dropdown(
|
| 168 |
-
choices=[i.value.name for i in WeightType],
|
| 169 |
-
label="Weights type",
|
| 170 |
-
multiselect=False,
|
| 171 |
-
value="Original",
|
| 172 |
-
interactive=True,
|
| 173 |
-
)
|
| 174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 175 |
-
|
| 176 |
-
submit_button = gr.Button("Submit Eval")
|
| 177 |
-
submission_result = gr.Markdown()
|
| 178 |
-
submit_button.click(
|
| 179 |
-
add_new_eval,
|
| 180 |
-
[
|
| 181 |
-
model_name_textbox,
|
| 182 |
-
base_model_name_textbox,
|
| 183 |
-
revision_name_textbox,
|
| 184 |
-
precision,
|
| 185 |
-
weight_type,
|
| 186 |
-
model_type,
|
| 187 |
-
],
|
| 188 |
-
submission_result,
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
with gr.Row():
|
| 192 |
-
with gr.Accordion("📙 Citation", open=False):
|
| 193 |
-
citation_button = gr.Textbox(
|
| 194 |
-
value=CITATION_BUTTON_TEXT,
|
| 195 |
-
label=CITATION_BUTTON_LABEL,
|
| 196 |
-
lines=20,
|
| 197 |
-
elem_id="citation-button",
|
| 198 |
-
show_copy_button=True,
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
scheduler = BackgroundScheduler()
|
| 202 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
-
scheduler.start()
|
| 204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Function Calling Leaderboard - HuggingFace Space Entry Point
|
| 3 |
+
"""
|
| 4 |
+
import sys
|
| 5 |
+
sys.path.insert(0, ".")
|
| 6 |
|
| 7 |
+
from afcl.app import create_app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
app = create_app()
|
| 10 |
+
app.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/leaderboard.json
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"rank": 1,
|
| 4 |
+
"model": "GPT-4o",
|
| 5 |
+
"overall": 78.5,
|
| 6 |
+
"simple": 85.2,
|
| 7 |
+
"multiple": 80.1,
|
| 8 |
+
"parallel": 75.3,
|
| 9 |
+
"parallel_multiple": 72.4,
|
| 10 |
+
"irrelevance": 82.0,
|
| 11 |
+
"dialect_handling": 70.5,
|
| 12 |
+
"java": 76.8,
|
| 13 |
+
"javascript": 74.2,
|
| 14 |
+
"rest": 79.5,
|
| 15 |
+
"sql": 77.3
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"rank": 2,
|
| 19 |
+
"model": "Claude 3.5 Sonnet",
|
| 20 |
+
"overall": 76.2,
|
| 21 |
+
"simple": 83.5,
|
| 22 |
+
"multiple": 78.8,
|
| 23 |
+
"parallel": 73.2,
|
| 24 |
+
"parallel_multiple": 70.1,
|
| 25 |
+
"irrelevance": 80.5,
|
| 26 |
+
"dialect_handling": 68.2,
|
| 27 |
+
"java": 75.2,
|
| 28 |
+
"javascript": 72.8,
|
| 29 |
+
"rest": 78.2,
|
| 30 |
+
"sql": 76.5
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"rank": 3,
|
| 34 |
+
"model": "Qwen2.5-72B",
|
| 35 |
+
"overall": 74.1,
|
| 36 |
+
"simple": 80.5,
|
| 37 |
+
"multiple": 76.2,
|
| 38 |
+
"parallel": 72.5,
|
| 39 |
+
"parallel_multiple": 69.8,
|
| 40 |
+
"irrelevance": 77.5,
|
| 41 |
+
"dialect_handling": 65.2,
|
| 42 |
+
"java": 72.5,
|
| 43 |
+
"javascript": 70.8,
|
| 44 |
+
"rest": 75.2,
|
| 45 |
+
"sql": 73.8
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"rank": 4,
|
| 49 |
+
"model": "Jais-30B",
|
| 50 |
+
"overall": 72.8,
|
| 51 |
+
"simple": 78.5,
|
| 52 |
+
"multiple": 74.2,
|
| 53 |
+
"parallel": 70.8,
|
| 54 |
+
"parallel_multiple": 68.5,
|
| 55 |
+
"irrelevance": 75.2,
|
| 56 |
+
"dialect_handling": 72.0,
|
| 57 |
+
"java": 68.5,
|
| 58 |
+
"javascript": 66.2,
|
| 59 |
+
"rest": 71.8,
|
| 60 |
+
"sql": 69.5
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"rank": 5,
|
| 64 |
+
"model": "Llama-3.1-70B",
|
| 65 |
+
"overall": 71.5,
|
| 66 |
+
"simple": 78.2,
|
| 67 |
+
"multiple": 73.5,
|
| 68 |
+
"parallel": 69.8,
|
| 69 |
+
"parallel_multiple": 66.2,
|
| 70 |
+
"irrelevance": 74.5,
|
| 71 |
+
"dialect_handling": 62.5,
|
| 72 |
+
"java": 70.2,
|
| 73 |
+
"javascript": 68.5,
|
| 74 |
+
"rest": 73.5,
|
| 75 |
+
"sql": 71.2
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"rank": 6,
|
| 79 |
+
"model": "ALLaM-7B",
|
| 80 |
+
"overall": 68.5,
|
| 81 |
+
"simple": 75.2,
|
| 82 |
+
"multiple": 70.5,
|
| 83 |
+
"parallel": 65.8,
|
| 84 |
+
"parallel_multiple": 62.3,
|
| 85 |
+
"irrelevance": 70.8,
|
| 86 |
+
"dialect_handling": 68.5,
|
| 87 |
+
"java": 62.5,
|
| 88 |
+
"javascript": 60.2,
|
| 89 |
+
"rest": 66.8,
|
| 90 |
+
"sql": 64.5
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"rank": 7,
|
| 94 |
+
"model": "SILMA-9B",
|
| 95 |
+
"overall": 65.2,
|
| 96 |
+
"simple": 72.8,
|
| 97 |
+
"multiple": 68.5,
|
| 98 |
+
"parallel": 62.1,
|
| 99 |
+
"parallel_multiple": 58.5,
|
| 100 |
+
"irrelevance": 68.2,
|
| 101 |
+
"dialect_handling": 62.8,
|
| 102 |
+
"java": 58.5,
|
| 103 |
+
"javascript": 56.2,
|
| 104 |
+
"rest": 63.2,
|
| 105 |
+
"sql": 60.8
|
| 106 |
+
}
|
| 107 |
+
]
|
requirements.txt
CHANGED
|
@@ -1,16 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
gradio_leaderboard==0.0.13
|
| 7 |
-
gradio_client
|
| 8 |
-
huggingface-hub>=0.18.0
|
| 9 |
-
matplotlib
|
| 10 |
-
numpy
|
| 11 |
-
pandas
|
| 12 |
-
python-dateutil
|
| 13 |
-
tqdm
|
| 14 |
-
transformers
|
| 15 |
-
tokenizers>=0.15.0
|
| 16 |
-
sentencepiece
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
datasets>=2.14.0
|
| 3 |
+
huggingface_hub>=0.19.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
plotly>=5.18.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|