Initial release: Arabic Function Calling Leaderboard
Browse files- README.md +44 -34
- afcl/__init__.py +9 -0
- afcl/app.py +430 -0
- afcl/data/__init__.py +11 -0
- afcl/data/loader.py +169 -0
- afcl/data/schemas.py +399 -0
- afcl/evaluators/__init__.py +11 -0
- afcl/evaluators/arabic_utils.py +141 -0
- afcl/evaluators/ast_evaluator.py +477 -0
- afcl/requirements.txt +21 -0
- afcl/static/styles.css +321 -0
- afcl/submission/__init__.py +10 -0
- afcl/submission/handler.py +117 -0
- afcl/visualization/__init__.py +10 -0
- afcl/visualization/charts.py +363 -0
- app.py +8 -202
- data/leaderboard.json +107 -0
- requirements.txt +5 -16
README.md
CHANGED
|
@@ -1,48 +1,58 @@
|
|
| 1 |
---
|
| 2 |
title: Arabic Function Calling Leaderboard
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
|
|
|
| 7 |
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: apache-2.0
|
| 10 |
-
short_description: Duplicate this leaderboard to initialize your own!
|
| 11 |
-
sdk_version: 5.43.1
|
| 12 |
tags:
|
| 13 |
-
-
|
|
|
|
|
|
|
|
|
|
| 14 |
---
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
"task_name": {
|
| 30 |
-
"metric_name": score,
|
| 31 |
-
},
|
| 32 |
-
"task_name2": {
|
| 33 |
-
"metric_name": score,
|
| 34 |
-
}
|
| 35 |
-
}
|
| 36 |
-
}
|
| 37 |
-
```
|
| 38 |
|
| 39 |
-
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Arabic Function Calling Leaderboard
|
| 3 |
+
emoji: 🏆
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
|
|
|
|
|
|
| 11 |
tags:
|
| 12 |
+
- arabic
|
| 13 |
+
- function-calling
|
| 14 |
+
- leaderboard
|
| 15 |
+
- llm-evaluation
|
| 16 |
---
|
| 17 |
|
| 18 |
+
# 🏆 Arabic Function Calling Leaderboard
|
| 19 |
+
|
| 20 |
+
لوحة تقييم استدعاء الدوال بالعربية
|
| 21 |
+
|
| 22 |
+
## Overview
|
| 23 |
+
|
| 24 |
+
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to:
|
| 25 |
+
|
| 26 |
+
1. Understand Arabic queries (MSA + Dialects)
|
| 27 |
+
2. Select appropriate functions from available options
|
| 28 |
+
3. Extract correct arguments from Arabic text
|
| 29 |
+
4. Handle parallel and complex function calls
|
| 30 |
+
5. Detect when no function should be called
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
## Dataset
|
| 33 |
|
| 34 |
+
The benchmark includes **1,470+ samples** across 10 categories:
|
| 35 |
+
- Simple, Multiple, Parallel, Parallel Multiple
|
| 36 |
+
- Irrelevance Detection
|
| 37 |
+
- Dialect Handling (Egyptian, Gulf, Levantine)
|
| 38 |
+
- Programming APIs (Java, JavaScript, REST, SQL)
|
| 39 |
|
| 40 |
+
📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
|
| 41 |
|
| 42 |
+
## Submit Your Model
|
| 43 |
+
|
| 44 |
+
To submit your model for evaluation:
|
| 45 |
+
1. Go to the "Submit" tab
|
| 46 |
+
2. Fill in your model details
|
| 47 |
+
3. Your model will be added to the evaluation queue
|
| 48 |
+
|
| 49 |
+
## Citation
|
| 50 |
+
|
| 51 |
+
```bibtex
|
| 52 |
+
@misc{afcl2024,
|
| 53 |
+
title={Arabic Function Calling Leaderboard},
|
| 54 |
+
author={Hesham Haroon},
|
| 55 |
+
year={2024},
|
| 56 |
+
url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
|
| 57 |
+
}
|
| 58 |
+
```
|
afcl/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Function Calling Leaderboard (AFCL)
|
| 3 |
+
==========================================
|
| 4 |
+
|
| 5 |
+
A comprehensive leaderboard for evaluating LLMs on function calling capabilities in Arabic.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.1.0"
|
| 9 |
+
__author__ = "Hesham Haroon"
|
afcl/app.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Function Calling Leaderboard (AFCL)
|
| 3 |
+
==========================================
|
| 4 |
+
|
| 5 |
+
A Gradio-based leaderboard for evaluating LLMs on Arabic function calling.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Dict, List, Optional
|
| 14 |
+
|
| 15 |
+
# Local imports
|
| 16 |
+
from .data.loader import (
|
| 17 |
+
load_leaderboard, save_leaderboard, load_benchmark,
|
| 18 |
+
calculate_overall_score, CATEGORY_WEIGHTS
|
| 19 |
+
)
|
| 20 |
+
from .visualization.charts import (
|
| 21 |
+
create_radar_chart, create_bar_chart,
|
| 22 |
+
create_category_comparison, create_dialect_breakdown
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# Constants
|
| 26 |
+
TITLE = "🏆 Arabic Function Calling Leaderboard"
|
| 27 |
+
TITLE_AR = "🏆 لوحة تقييم استدعاء الدوال بالعربية"
|
| 28 |
+
|
| 29 |
+
DESCRIPTION = """
|
| 30 |
+
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls.
|
| 31 |
+
|
| 32 |
+
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
# Column definitions
|
| 36 |
+
LEADERBOARD_COLUMNS = {
|
| 37 |
+
"rank": {"label": "المرتبة", "label_en": "Rank", "type": "number"},
|
| 38 |
+
"model": {"label": "النموذج", "label_en": "Model", "type": "str"},
|
| 39 |
+
"overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
|
| 40 |
+
"simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
|
| 41 |
+
"multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
|
| 42 |
+
"parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
|
| 43 |
+
"parallel_multiple": {"label": "متوازي متعدد", "label_en": "Parallel Multiple", "type": "number"},
|
| 44 |
+
"irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
|
| 45 |
+
"dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
# Sample leaderboard data (will be replaced with actual results)
|
| 49 |
+
SAMPLE_LEADERBOARD = [
|
| 50 |
+
{
|
| 51 |
+
"rank": 1,
|
| 52 |
+
"model": "GPT-4o",
|
| 53 |
+
"overall": 78.5,
|
| 54 |
+
"simple": 85.2,
|
| 55 |
+
"multiple": 80.1,
|
| 56 |
+
"parallel": 75.3,
|
| 57 |
+
"parallel_multiple": 72.4,
|
| 58 |
+
"irrelevance": 82.0,
|
| 59 |
+
"dialect_handling": 70.5,
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"rank": 2,
|
| 63 |
+
"model": "Claude 3.5 Sonnet",
|
| 64 |
+
"overall": 76.2,
|
| 65 |
+
"simple": 83.5,
|
| 66 |
+
"multiple": 78.8,
|
| 67 |
+
"parallel": 73.2,
|
| 68 |
+
"parallel_multiple": 70.1,
|
| 69 |
+
"irrelevance": 80.5,
|
| 70 |
+
"dialect_handling": 68.2,
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"rank": 3,
|
| 74 |
+
"model": "Jais-30B",
|
| 75 |
+
"overall": 72.8,
|
| 76 |
+
"simple": 78.5,
|
| 77 |
+
"multiple": 74.2,
|
| 78 |
+
"parallel": 70.8,
|
| 79 |
+
"parallel_multiple": 68.5,
|
| 80 |
+
"irrelevance": 75.2,
|
| 81 |
+
"dialect_handling": 72.0,
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"rank": 4,
|
| 85 |
+
"model": "ALLaM-7B",
|
| 86 |
+
"overall": 68.5,
|
| 87 |
+
"simple": 75.2,
|
| 88 |
+
"multiple": 70.5,
|
| 89 |
+
"parallel": 65.8,
|
| 90 |
+
"parallel_multiple": 62.3,
|
| 91 |
+
"irrelevance": 70.8,
|
| 92 |
+
"dialect_handling": 68.5,
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"rank": 5,
|
| 96 |
+
"model": "Qwen2.5-72B",
|
| 97 |
+
"overall": 74.1,
|
| 98 |
+
"simple": 80.5,
|
| 99 |
+
"multiple": 76.2,
|
| 100 |
+
"parallel": 72.5,
|
| 101 |
+
"parallel_multiple": 69.8,
|
| 102 |
+
"irrelevance": 77.5,
|
| 103 |
+
"dialect_handling": 65.2,
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"rank": 6,
|
| 107 |
+
"model": "SILMA-9B",
|
| 108 |
+
"overall": 65.2,
|
| 109 |
+
"simple": 72.8,
|
| 110 |
+
"multiple": 68.5,
|
| 111 |
+
"parallel": 62.1,
|
| 112 |
+
"parallel_multiple": 58.5,
|
| 113 |
+
"irrelevance": 68.2,
|
| 114 |
+
"dialect_handling": 62.8,
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"rank": 7,
|
| 118 |
+
"model": "Llama-3.1-70B",
|
| 119 |
+
"overall": 71.5,
|
| 120 |
+
"simple": 78.2,
|
| 121 |
+
"multiple": 73.5,
|
| 122 |
+
"parallel": 69.8,
|
| 123 |
+
"parallel_multiple": 66.2,
|
| 124 |
+
"irrelevance": 74.5,
|
| 125 |
+
"dialect_handling": 62.5,
|
| 126 |
+
},
|
| 127 |
+
]
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def get_leaderboard_data() -> List[Dict]:
|
| 131 |
+
"""Load leaderboard data from file or return sample data."""
|
| 132 |
+
try:
|
| 133 |
+
data = load_leaderboard("data/leaderboard.json")
|
| 134 |
+
if data:
|
| 135 |
+
return data
|
| 136 |
+
except Exception:
|
| 137 |
+
pass
|
| 138 |
+
return SAMPLE_LEADERBOARD
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
|
| 142 |
+
"""Convert leaderboard data to pandas DataFrame."""
|
| 143 |
+
df = pd.DataFrame(data)
|
| 144 |
+
|
| 145 |
+
# Rename columns based on language preference
|
| 146 |
+
column_mapping = {}
|
| 147 |
+
for col, info in LEADERBOARD_COLUMNS.items():
|
| 148 |
+
if col in df.columns:
|
| 149 |
+
label = info["label"] if use_arabic else info["label_en"]
|
| 150 |
+
column_mapping[col] = label
|
| 151 |
+
|
| 152 |
+
df = df.rename(columns=column_mapping)
|
| 153 |
+
|
| 154 |
+
# Format numeric columns
|
| 155 |
+
for col in df.columns:
|
| 156 |
+
if df[col].dtype in ['float64', 'float32']:
|
| 157 |
+
df[col] = df[col].apply(lambda x: f"{x:.1f}%")
|
| 158 |
+
|
| 159 |
+
return df
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def create_leaderboard_tab(use_arabic: bool = True):
|
| 163 |
+
"""Create the main leaderboard tab content."""
|
| 164 |
+
data = get_leaderboard_data()
|
| 165 |
+
df = format_leaderboard_dataframe(data, use_arabic)
|
| 166 |
+
|
| 167 |
+
return gr.DataFrame(
|
| 168 |
+
value=df,
|
| 169 |
+
interactive=False,
|
| 170 |
+
wrap=True,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def create_visualization_tab():
|
| 175 |
+
"""Create the visualization tab with charts."""
|
| 176 |
+
data = get_leaderboard_data()
|
| 177 |
+
|
| 178 |
+
# Prepare data for charts
|
| 179 |
+
model_scores = {
|
| 180 |
+
entry["model"]: {k: v for k, v in entry.items() if k not in ["rank", "model"]}
|
| 181 |
+
for entry in data
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
with gr.Row():
|
| 185 |
+
with gr.Column():
|
| 186 |
+
radar_chart = create_radar_chart(
|
| 187 |
+
{k: v for k, v in list(model_scores.items())[:5]},
|
| 188 |
+
use_arabic=True,
|
| 189 |
+
title="مقارنة النماذج - Category Comparison"
|
| 190 |
+
)
|
| 191 |
+
gr.Plot(value=radar_chart)
|
| 192 |
+
|
| 193 |
+
with gr.Row():
|
| 194 |
+
with gr.Column():
|
| 195 |
+
bar_chart = create_bar_chart(
|
| 196 |
+
data,
|
| 197 |
+
metric="overall",
|
| 198 |
+
use_arabic=True,
|
| 199 |
+
title="أفضل النماذج - Top Models"
|
| 200 |
+
)
|
| 201 |
+
gr.Plot(value=bar_chart)
|
| 202 |
+
|
| 203 |
+
with gr.Row():
|
| 204 |
+
category_chart = create_category_comparison(
|
| 205 |
+
data,
|
| 206 |
+
use_arabic=True,
|
| 207 |
+
title="أداء الفئات - Category Performance"
|
| 208 |
+
)
|
| 209 |
+
gr.Plot(value=category_chart)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def create_submit_tab():
|
| 213 |
+
"""Create the model submission tab."""
|
| 214 |
+
with gr.Column():
|
| 215 |
+
gr.Markdown("""
|
| 216 |
+
## 📤 Submit Your Model | أرسل نموذجك
|
| 217 |
+
|
| 218 |
+
To submit a model for evaluation, provide the following information:
|
| 219 |
+
|
| 220 |
+
لإرسال نموذج للتقييم، قدم المعلومات التالية:
|
| 221 |
+
""")
|
| 222 |
+
|
| 223 |
+
with gr.Row():
|
| 224 |
+
model_name = gr.Textbox(
|
| 225 |
+
label="Model Name | اسم النموذج",
|
| 226 |
+
placeholder="e.g., my-arabic-llm-7b"
|
| 227 |
+
)
|
| 228 |
+
model_type = gr.Dropdown(
|
| 229 |
+
label="Model Type | نوع النموذج",
|
| 230 |
+
choices=["HuggingFace Hub", "API Endpoint", "Local Model"],
|
| 231 |
+
value="HuggingFace Hub"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
model_path = gr.Textbox(
|
| 235 |
+
label="Model Path/Endpoint | مسار النموذج",
|
| 236 |
+
placeholder="e.g., organization/model-name or https://api.example.com/v1"
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
precision = gr.Dropdown(
|
| 240 |
+
label="Precision | الدقة",
|
| 241 |
+
choices=["float16", "bfloat16", "float32", "int8", "int4"],
|
| 242 |
+
value="float16"
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
with gr.Row():
|
| 246 |
+
base_model = gr.Textbox(
|
| 247 |
+
label="Base Model (if fine-tuned) | النموذج الأساسي",
|
| 248 |
+
placeholder="e.g., meta-llama/Llama-2-7b"
|
| 249 |
+
)
|
| 250 |
+
license_type = gr.Dropdown(
|
| 251 |
+
label="License | الرخصة",
|
| 252 |
+
choices=["Apache-2.0", "MIT", "CC-BY-4.0", "Llama 2", "Other"],
|
| 253 |
+
value="Apache-2.0"
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
submit_btn = gr.Button("Submit for Evaluation | أرسل للتقييم", variant="primary")
|
| 257 |
+
|
| 258 |
+
result_text = gr.Markdown("")
|
| 259 |
+
|
| 260 |
+
def handle_submission(name, mtype, path, prec, base, lic):
|
| 261 |
+
if not name or not path:
|
| 262 |
+
return "❌ Please fill in the required fields | يرجى ملء الحقول المطلوبة"
|
| 263 |
+
return f"""
|
| 264 |
+
✅ **Submission Received | تم استلام الطلب**
|
| 265 |
+
|
| 266 |
+
- Model: {name}
|
| 267 |
+
- Type: {mtype}
|
| 268 |
+
- Path: {path}
|
| 269 |
+
|
| 270 |
+
Your model will be evaluated and added to the leaderboard soon.
|
| 271 |
+
|
| 272 |
+
سيتم تقييم نموذجك وإضافته إلى لوحة التقييم قريباً.
|
| 273 |
+
"""
|
| 274 |
+
|
| 275 |
+
submit_btn.click(
|
| 276 |
+
fn=handle_submission,
|
| 277 |
+
inputs=[model_name, model_type, model_path, precision, base_model, license_type],
|
| 278 |
+
outputs=result_text
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def create_about_tab():
|
| 283 |
+
"""Create the about/methodology tab."""
|
| 284 |
+
return gr.Markdown("""
|
| 285 |
+
# About AFCL | عن لوحة التقييم
|
| 286 |
+
|
| 287 |
+
## Evaluation Categories | فئات التقييم
|
| 288 |
+
|
| 289 |
+
| Category | الفئة | Description | الوصف |
|
| 290 |
+
|----------|-------|-------------|-------|
|
| 291 |
+
| Simple | بسيط | Single function, single call | دالة واحدة، استدعاء واحد |
|
| 292 |
+
| Multiple | متعدد | Select correct function from options | اختيار الدالة الصحيحة من عدة خيارات |
|
| 293 |
+
| Parallel | متوازي | Multiple calls of same function | استدعاءات متعددة لنفس الدالة |
|
| 294 |
+
| Parallel Multiple | متوازي متعدد | Multiple functions, multiple calls | دوال متعددة، استدعاءات متعددة |
|
| 295 |
+
| Irrelevance | اللا صلة | No function should be called | لا يجب استدعاء أي دالة |
|
| 296 |
+
| Dialect Handling | اللهجات | Egyptian/Gulf/Levantine queries | استعلامات مصرية/خليجية/شامية |
|
| 297 |
+
|
| 298 |
+
## Scoring Formula | معادلة التقييم
|
| 299 |
+
|
| 300 |
+
```
|
| 301 |
+
Overall Score = Σ (category_score × weight)
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
**Weights | الأوزان:**
|
| 305 |
+
- Simple: 15%
|
| 306 |
+
- Multiple: 10%
|
| 307 |
+
- Parallel: 10%
|
| 308 |
+
- Parallel Multiple: 10%
|
| 309 |
+
- Irrelevance: 15%
|
| 310 |
+
- Dialect Handling: 15%
|
| 311 |
+
- Multi-Turn: 15%
|
| 312 |
+
- Native Arabic: 10%
|
| 313 |
+
|
| 314 |
+
## Evaluation Methodology | منهجية التقييم
|
| 315 |
+
|
| 316 |
+
1. **AST-Based Matching**: Function calls are compared using Abstract Syntax Tree matching with Arabic text normalization.
|
| 317 |
+
|
| 318 |
+
2. **Arabic Normalization**: Handles diacritics (tashkeel), alef variants, and Arabic-Indic numerals.
|
| 319 |
+
|
| 320 |
+
3. **Order-Agnostic Parallel Evaluation**: For parallel calls, order doesn't matter - we use bipartite matching.
|
| 321 |
+
|
| 322 |
+
## Dataset | مجموعة البيانات
|
| 323 |
+
|
| 324 |
+
- **Total Samples**: 1,470+
|
| 325 |
+
- **Languages**: Arabic (MSA + Dialects) & English
|
| 326 |
+
- **Source**: Translated from BFCL with additional dialect variants
|
| 327 |
+
|
| 328 |
+
## Citation | الاقتباس
|
| 329 |
+
|
| 330 |
+
```bibtex
|
| 331 |
+
@misc{afcl2024,
|
| 332 |
+
title={Arabic Function Calling Leaderboard},
|
| 333 |
+
author={Hesham Haroon},
|
| 334 |
+
year={2024},
|
| 335 |
+
url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
|
| 336 |
+
}
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
## Contact | التواصل
|
| 340 |
+
|
| 341 |
+
For questions or contributions, please open an issue on the repository.
|
| 342 |
+
|
| 343 |
+
للأسئلة أو المساهمات، يرجى فتح مشكلة في المستودع.
|
| 344 |
+
""")
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def create_app():
|
| 348 |
+
"""Create the main Gradio application."""
|
| 349 |
+
# Load CSS
|
| 350 |
+
css_path = Path(__file__).parent / "static" / "styles.css"
|
| 351 |
+
custom_css = ""
|
| 352 |
+
if css_path.exists():
|
| 353 |
+
with open(css_path, "r") as f:
|
| 354 |
+
custom_css = f.read()
|
| 355 |
+
|
| 356 |
+
with gr.Blocks(
|
| 357 |
+
title="Arabic Function Calling Leaderboard",
|
| 358 |
+
css=custom_css,
|
| 359 |
+
theme=gr.themes.Soft()
|
| 360 |
+
) as app:
|
| 361 |
+
# Header
|
| 362 |
+
gr.Markdown(f"""
|
| 363 |
+
<div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
|
| 364 |
+
<h1 style="font-size: 2rem; margin-bottom: 10px;">{TITLE_AR}</h1>
|
| 365 |
+
<h2 style="font-size: 1.5rem; margin-bottom: 10px;">{TITLE}</h2>
|
| 366 |
+
<p style="opacity: 0.9;">Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
|
| 367 |
+
</div>
|
| 368 |
+
""")
|
| 369 |
+
|
| 370 |
+
gr.Markdown(DESCRIPTION)
|
| 371 |
+
|
| 372 |
+
# Stats row
|
| 373 |
+
data = get_leaderboard_data()
|
| 374 |
+
with gr.Row():
|
| 375 |
+
gr.Markdown(f"""
|
| 376 |
+
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 377 |
+
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
|
| 378 |
+
<div style="color: #666;">Models Evaluated | النماذج المقيّمة</div>
|
| 379 |
+
</div>
|
| 380 |
+
""")
|
| 381 |
+
gr.Markdown("""
|
| 382 |
+
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 383 |
+
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">1,470+</div>
|
| 384 |
+
<div style="color: #666;">Test Samples | عينات الاختبار</div>
|
| 385 |
+
</div>
|
| 386 |
+
""")
|
| 387 |
+
gr.Markdown("""
|
| 388 |
+
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 389 |
+
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
|
| 390 |
+
<div style="color: #666;">Categories | الفئات</div>
|
| 391 |
+
</div>
|
| 392 |
+
""")
|
| 393 |
+
|
| 394 |
+
# Tabs
|
| 395 |
+
with gr.Tabs():
|
| 396 |
+
with gr.TabItem("🏆 Leaderboard | لوحة التقييم"):
|
| 397 |
+
df = format_leaderboard_dataframe(data, use_arabic=True)
|
| 398 |
+
gr.DataFrame(
|
| 399 |
+
value=df,
|
| 400 |
+
interactive=False,
|
| 401 |
+
wrap=True,
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
with gr.TabItem("📊 Visualizations | الرسوم البيانية"):
|
| 405 |
+
create_visualization_tab()
|
| 406 |
+
|
| 407 |
+
with gr.TabItem("📤 Submit | إرسال"):
|
| 408 |
+
create_submit_tab()
|
| 409 |
+
|
| 410 |
+
with gr.TabItem("ℹ️ About | عن المشروع"):
|
| 411 |
+
create_about_tab()
|
| 412 |
+
|
| 413 |
+
# Footer
|
| 414 |
+
gr.Markdown("""
|
| 415 |
+
---
|
| 416 |
+
<div style="text-align: center; color: #666; padding: 20px;">
|
| 417 |
+
Built with ❤️ for the Arabic NLP community | بُني بحب لمجتمع معالجة اللغة العربية
|
| 418 |
+
<br>
|
| 419 |
+
<a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling">Dataset</a> |
|
| 420 |
+
<a href="https://github.com/HeshamHaroon">GitHub</a>
|
| 421 |
+
</div>
|
| 422 |
+
""")
|
| 423 |
+
|
| 424 |
+
return app
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
# Main entry point
|
| 428 |
+
if __name__ == "__main__":
|
| 429 |
+
app = create_app()
|
| 430 |
+
app.launch()
|
afcl/data/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Data Module
|
| 3 |
+
================
|
| 4 |
+
|
| 5 |
+
Data loading and schema definitions for the Arabic Function Calling Leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .loader import load_benchmark, load_results
|
| 9 |
+
from .schemas import NATIVE_ARABIC_SCHEMAS
|
| 10 |
+
|
| 11 |
+
__all__ = ['load_benchmark', 'load_results', 'NATIVE_ARABIC_SCHEMAS']
|
afcl/data/loader.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Loader
|
| 3 |
+
===========
|
| 4 |
+
|
| 5 |
+
Load benchmark data and evaluation results.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from datasets import load_dataset
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Category weights for overall score
|
| 16 |
+
CATEGORY_WEIGHTS = {
|
| 17 |
+
"simple": 0.15,
|
| 18 |
+
"multiple": 0.10,
|
| 19 |
+
"parallel": 0.10,
|
| 20 |
+
"parallel_multiple": 0.10,
|
| 21 |
+
"irrelevance": 0.15,
|
| 22 |
+
"dialect_handling": 0.15,
|
| 23 |
+
"multi_turn": 0.15,
|
| 24 |
+
"native_arabic": 0.10,
|
| 25 |
+
# Programming categories (included in evaluation but lower weight)
|
| 26 |
+
"java": 0.0,
|
| 27 |
+
"javascript": 0.0,
|
| 28 |
+
"rest": 0.0,
|
| 29 |
+
"sql": 0.0,
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_benchmark(
|
| 34 |
+
dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
|
| 35 |
+
split: str = "test",
|
| 36 |
+
category: Optional[str] = None
|
| 37 |
+
) -> List[Dict]:
|
| 38 |
+
"""
|
| 39 |
+
Load benchmark samples from HuggingFace dataset.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
dataset_name: HuggingFace dataset repository
|
| 43 |
+
split: Dataset split ('train' or 'test')
|
| 44 |
+
category: Optional category filter
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
List of sample dictionaries
|
| 48 |
+
"""
|
| 49 |
+
try:
|
| 50 |
+
dataset = load_dataset(dataset_name, split=split)
|
| 51 |
+
except Exception as e:
|
| 52 |
+
print(f"Error loading dataset: {e}")
|
| 53 |
+
# Fallback to local data
|
| 54 |
+
local_path = Path(__file__).parent.parent.parent / "arabic_fc_dataset" / "checkpoint.json"
|
| 55 |
+
if local_path.exists():
|
| 56 |
+
with open(local_path, 'r', encoding='utf-8') as f:
|
| 57 |
+
data = json.load(f)
|
| 58 |
+
samples = data.get('samples', [])
|
| 59 |
+
if category:
|
| 60 |
+
samples = [s for s in samples if s.get('category') == category]
|
| 61 |
+
return samples
|
| 62 |
+
raise
|
| 63 |
+
|
| 64 |
+
samples = []
|
| 65 |
+
for item in dataset:
|
| 66 |
+
sample = {
|
| 67 |
+
'id': item['id'],
|
| 68 |
+
'query_en': item['query_en'],
|
| 69 |
+
'query_ar': item['query_ar'],
|
| 70 |
+
'functions': json.loads(item['functions']) if item['functions'] else [],
|
| 71 |
+
'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
|
| 72 |
+
'category': item['category'],
|
| 73 |
+
'source': item.get('source', ''),
|
| 74 |
+
'dialect': item.get('dialect', ''),
|
| 75 |
+
}
|
| 76 |
+
if category is None or sample['category'] == category:
|
| 77 |
+
samples.append(sample)
|
| 78 |
+
|
| 79 |
+
return samples
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
|
| 83 |
+
"""
|
| 84 |
+
Load evaluation results for all models.
|
| 85 |
+
|
| 86 |
+
Args:
|
| 87 |
+
results_dir: Directory containing result JSON files
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Dictionary mapping model names to their results
|
| 91 |
+
"""
|
| 92 |
+
results = {}
|
| 93 |
+
results_path = Path(results_dir)
|
| 94 |
+
|
| 95 |
+
if not results_path.exists():
|
| 96 |
+
return results
|
| 97 |
+
|
| 98 |
+
for file_path in results_path.glob("*.json"):
|
| 99 |
+
model_name = file_path.stem
|
| 100 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 101 |
+
results[model_name] = json.load(f)
|
| 102 |
+
|
| 103 |
+
return results
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
|
| 107 |
+
"""
|
| 108 |
+
Load the current leaderboard rankings.
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
List of model entries sorted by overall score
|
| 112 |
+
"""
|
| 113 |
+
path = Path(leaderboard_path)
|
| 114 |
+
if not path.exists():
|
| 115 |
+
return []
|
| 116 |
+
|
| 117 |
+
with open(path, 'r', encoding='utf-8') as f:
|
| 118 |
+
data = json.load(f)
|
| 119 |
+
|
| 120 |
+
return sorted(data, key=lambda x: x.get('overall', 0), reverse=True)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json"):
|
| 124 |
+
"""Save leaderboard data to file."""
|
| 125 |
+
path = Path(leaderboard_path)
|
| 126 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 127 |
+
|
| 128 |
+
# Sort by overall score
|
| 129 |
+
sorted_entries = sorted(entries, key=lambda x: x.get('overall', 0), reverse=True)
|
| 130 |
+
|
| 131 |
+
# Add ranks
|
| 132 |
+
for i, entry in enumerate(sorted_entries, 1):
|
| 133 |
+
entry['rank'] = i
|
| 134 |
+
|
| 135 |
+
with open(path, 'w', encoding='utf-8') as f:
|
| 136 |
+
json.dump(sorted_entries, f, ensure_ascii=False, indent=2)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def calculate_overall_score(category_scores: Dict[str, float]) -> float:
|
| 140 |
+
"""
|
| 141 |
+
Calculate weighted overall score from category scores.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
category_scores: Dictionary mapping category names to scores (0-100)
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
Overall weighted score (0-100)
|
| 148 |
+
"""
|
| 149 |
+
total_weight = 0
|
| 150 |
+
weighted_sum = 0
|
| 151 |
+
|
| 152 |
+
for category, weight in CATEGORY_WEIGHTS.items():
|
| 153 |
+
if category in category_scores and weight > 0:
|
| 154 |
+
weighted_sum += category_scores[category] * weight
|
| 155 |
+
total_weight += weight
|
| 156 |
+
|
| 157 |
+
if total_weight == 0:
|
| 158 |
+
return 0.0
|
| 159 |
+
|
| 160 |
+
return weighted_sum / total_weight
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
|
| 164 |
+
"""Get sample counts by category."""
|
| 165 |
+
stats = {}
|
| 166 |
+
for sample in samples:
|
| 167 |
+
category = sample.get('category', 'unknown')
|
| 168 |
+
stats[category] = stats.get(category, 0) + 1
|
| 169 |
+
return stats
|
afcl/data/schemas.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Native Arabic Function Schemas
|
| 3 |
+
==============================
|
| 4 |
+
|
| 5 |
+
Bilingual function definitions with Arabic names, descriptions, and examples.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Each entry is an OpenAI-style function schema extended with bilingual
# fields: "name_ar"/"description_ar" mirror the English metadata,
# "enum_ar" mirrors "enum" positionally, and "examples_ar" gives
# Arabic example values for a parameter. The "parameters" payload is
# standard JSON Schema and is what gets sent to models.
NATIVE_ARABIC_SCHEMAS = [
    # Financial Services - الخدمات المالية
    {
        "name": "transfer_money",
        "name_ar": "تحويل_أموال",
        "description": "Transfer money between bank accounts",
        "description_ar": "تحويل أموال بين الحسابات البنكية",
        "parameters": {
            "type": "object",
            "properties": {
                "from_account": {
                    "type": "string",
                    "description": "Source account number",
                    "description_ar": "رقم الحساب المصدر",
                    "examples_ar": ["SA0380000000608010167519"]
                },
                "to_account": {
                    "type": "string",
                    "description": "Destination account number",
                    "description_ar": "رقم الحساب المستقبل"
                },
                "amount": {
                    "type": "number",
                    "description": "Amount to transfer",
                    "description_ar": "المبلغ المراد تحويله"
                },
                "currency": {
                    "type": "string",
                    "description": "Currency code",
                    "description_ar": "رمز العملة",
                    "enum": ["SAR", "AED", "EGP", "KWD", "QAR"],
                    "examples_ar": ["ريال سعودي", "درهم إماراتي", "جنيه مصري"]
                }
            },
            "required": ["from_account", "to_account", "amount"]
        }
    },
    {
        "name": "check_balance",
        "name_ar": "استعلام_رصيد",
        "description": "Check account balance",
        "description_ar": "الاستعلام عن رصيد الحساب",
        "parameters": {
            "type": "object",
            "properties": {
                "account_number": {
                    "type": "string",
                    "description": "Account number to check",
                    "description_ar": "رقم الحساب للاستعلام"
                }
            },
            "required": ["account_number"]
        }
    },
    {
        "name": "pay_bill",
        "name_ar": "دفع_فاتورة",
        "description": "Pay a utility or service bill",
        "description_ar": "دفع فاتورة خدمات",
        "parameters": {
            "type": "object",
            "properties": {
                "bill_type": {
                    "type": "string",
                    "description": "Type of bill",
                    "description_ar": "نوع الفاتورة",
                    "enum": ["electricity", "water", "telecom", "internet"],
                    "enum_ar": ["كهرباء", "مياه", "اتصالات", "إنترنت"]
                },
                "account_id": {
                    "type": "string",
                    "description": "Bill account/subscriber ID",
                    "description_ar": "رقم المشترك"
                },
                "amount": {
                    "type": "number",
                    "description": "Amount to pay",
                    "description_ar": "المبلغ المراد دفعه"
                }
            },
            "required": ["bill_type", "account_id"]
        }
    },

    # Government Services - الخدمات الحكومية
    {
        "name": "renew_id",
        "name_ar": "تجديد_هوية",
        "description": "Renew national ID card",
        "description_ar": "تجديد بطاقة الهوية الوطنية",
        "parameters": {
            "type": "object",
            "properties": {
                "id_number": {
                    "type": "string",
                    "description": "National ID number",
                    "description_ar": "رقم الهوية الوطنية"
                },
                "reason": {
                    "type": "string",
                    "description": "Reason for renewal",
                    "description_ar": "سبب التجديد",
                    "enum": ["expiry", "damaged", "lost", "data_update"],
                    "enum_ar": ["انتهاء الصلاحية", "تالفة", "مفقودة", "تحديث بيانات"]
                }
            },
            "required": ["id_number"]
        }
    },
    {
        "name": "book_appointment",
        "name_ar": "حجز_موعد",
        "description": "Book an appointment at a government office",
        "description_ar": "حجز موعد في جهة حكومية",
        "parameters": {
            "type": "object",
            "properties": {
                "service_type": {
                    "type": "string",
                    "description": "Type of service",
                    "description_ar": "نوع الخدمة"
                },
                "location": {
                    "type": "string",
                    "description": "Preferred location/branch",
                    "description_ar": "الفرع المفضل",
                    "examples_ar": ["الرياض - العليا", "جدة - الحمراء"]
                },
                "date": {
                    "type": "string",
                    "description": "Preferred date (YYYY-MM-DD)",
                    "description_ar": "التاريخ المفضل"
                },
                "time_slot": {
                    "type": "string",
                    "description": "Preferred time slot",
                    "description_ar": "الفترة الزمنية",
                    "enum": ["morning", "afternoon"],
                    "enum_ar": ["صباحي", "مسائي"]
                }
            },
            "required": ["service_type", "date"]
        }
    },

    # E-commerce - التجارة الإلكترونية
    {
        "name": "search_product",
        "name_ar": "البحث_عن_منتج",
        "description": "Search for products in the store",
        "description_ar": "البحث عن منتجات في المتجر",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                    "description_ar": "كلمات البحث"
                },
                "category": {
                    "type": "string",
                    "description": "Product category",
                    "description_ar": "فئة المنتج",
                    "examples_ar": ["إلكترونيات", "ملابس", "أجهزة منزلية"]
                },
                "min_price": {
                    "type": "number",
                    "description": "Minimum price",
                    "description_ar": "الحد الأدنى للسعر"
                },
                "max_price": {
                    "type": "number",
                    "description": "Maximum price",
                    "description_ar": "الحد الأقصى للسعر"
                },
                "sort_by": {
                    "type": "string",
                    "description": "Sort order",
                    "description_ar": "ترتيب حسب",
                    "enum": ["price_asc", "price_desc", "rating", "newest"],
                    "enum_ar": ["السعر تصاعدي", "السعر تنازلي", "التقييم", "الأحدث"]
                }
            },
            "required": ["query"]
        }
    },
    {
        "name": "add_to_cart",
        "name_ar": "إضافة_للسلة",
        "description": "Add a product to shopping cart",
        "description_ar": "إضافة منتج إلى سلة التسوق",
        "parameters": {
            "type": "object",
            "properties": {
                "product_id": {
                    "type": "string",
                    "description": "Product identifier",
                    "description_ar": "معرف المنتج"
                },
                "quantity": {
                    "type": "integer",
                    "description": "Quantity to add",
                    "description_ar": "الكمية",
                    "default": 1
                },
                "size": {
                    "type": "string",
                    "description": "Product size if applicable",
                    "description_ar": "المقاس"
                },
                "color": {
                    "type": "string",
                    "description": "Product color if applicable",
                    "description_ar": "اللون"
                }
            },
            "required": ["product_id"]
        }
    },

    # Healthcare - الرعاية الصحية
    {
        "name": "book_doctor_appointment",
        "name_ar": "حجز_موعد_طبيب",
        "description": "Book an appointment with a doctor",
        "description_ar": "حجز موعد مع طبيب",
        "parameters": {
            "type": "object",
            "properties": {
                "specialty": {
                    "type": "string",
                    "description": "Medical specialty",
                    "description_ar": "التخصص الطبي",
                    "examples_ar": ["طب عام", "طب أطفال", "طب باطني", "طب عيون", "طب أسنان"]
                },
                "doctor_name": {
                    "type": "string",
                    "description": "Specific doctor name (optional)",
                    "description_ar": "اسم الطبيب (اختياري)"
                },
                "hospital": {
                    "type": "string",
                    "description": "Hospital or clinic name",
                    "description_ar": "اسم المستشفى أو العيادة"
                },
                "date": {
                    "type": "string",
                    "description": "Preferred date",
                    "description_ar": "التاريخ المفضل"
                },
                "reason": {
                    "type": "string",
                    "description": "Reason for visit",
                    "description_ar": "سبب الزيارة"
                }
            },
            "required": ["specialty", "date"]
        }
    },
    {
        "name": "get_lab_results",
        "name_ar": "نتائج_التحاليل",
        "description": "Retrieve laboratory test results",
        "description_ar": "استرجاع نتائج التحاليل المخبرية",
        "parameters": {
            "type": "object",
            "properties": {
                "patient_id": {
                    "type": "string",
                    "description": "Patient ID or file number",
                    "description_ar": "رقم المريض أو الملف"
                },
                "test_type": {
                    "type": "string",
                    "description": "Type of test",
                    "description_ar": "نوع التحليل",
                    "examples_ar": ["تحليل دم شامل", "تحليل سكر", "وظائف كلى", "وظائف كبد"]
                },
                "date_from": {
                    "type": "string",
                    "description": "Start date for results",
                    "description_ar": "تاريخ البداية"
                }
            },
            "required": ["patient_id"]
        }
    },

    # Travel & Transportation - السفر والنقل
    {
        "name": "book_flight",
        "name_ar": "احجز_رحلة",
        "description": "Book a flight between cities",
        "description_ar": "حجز رحلة طيران بين المدن",
        "parameters": {
            "type": "object",
            "properties": {
                "origin": {
                    "type": "string",
                    "description": "Departure city",
                    "description_ar": "مدينة المغادرة",
                    "examples_ar": ["القاهرة", "الرياض", "دبي", "جدة"]
                },
                "destination": {
                    "type": "string",
                    "description": "Arrival city",
                    "description_ar": "مدينة الوصول"
                },
                "date": {
                    "type": "string",
                    "description": "Travel date",
                    "description_ar": "تاريخ السفر"
                },
                "return_date": {
                    "type": "string",
                    "description": "Return date (optional)",
                    "description_ar": "تاريخ العودة (اختياري)"
                },
                "passengers": {
                    "type": "integer",
                    "description": "Number of passengers",
                    "description_ar": "عدد المسافرين",
                    "default": 1
                },
                "class": {
                    "type": "string",
                    "description": "Travel class",
                    "description_ar": "درجة السفر",
                    "enum": ["economy", "business", "first"],
                    "enum_ar": ["اقتصادية", "أعمال", "أولى"]
                }
            },
            "required": ["origin", "destination", "date"]
        }
    },

    # Weather - الطقس
    {
        "name": "get_weather",
        "name_ar": "احصل_على_الطقس",
        "description": "Get weather information for a city",
        "description_ar": "الحصول على معلومات الطقس لمدينة",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name",
                    "description_ar": "اسم المدينة",
                    "examples_ar": ["القاهرة", "دبي", "الرياض", "بيروت", "عمان"]
                },
                "days": {
                    "type": "integer",
                    "description": "Number of forecast days",
                    "description_ar": "عدد أيام التوقعات",
                    "default": 1
                }
            },
            "required": ["city"]
        }
    }
]
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def get_schema_by_name(name: str, use_arabic: bool = False) -> dict:
    """Look up a schema by its English or Arabic name.

    When use_arabic is True, returns a localized view that prefers the
    Arabic name/description while keeping the parameters untouched.
    Returns None when no schema matches.
    """
    for schema in NATIVE_ARABIC_SCHEMAS:
        if name not in (schema['name'], schema.get('name_ar')):
            continue
        if not use_arabic:
            return schema
        # Arabic-ified view, falling back to English fields when the
        # Arabic ones are absent.
        return {
            'name': schema.get('name_ar', schema['name']),
            'description': schema.get('description_ar', schema['description']),
            'parameters': schema['parameters'],
        }
    return None
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def get_schemas_by_category(category: str) -> list:
    """Return all schemas belonging to a category.

    Unknown categories yield an empty list. Each schema is resolved via
    get_schema_by_name exactly once (the previous implementation looked
    every name up twice: once for the filter and once for the result).
    """
    category_map = {
        'financial': ['transfer_money', 'check_balance', 'pay_bill'],
        'government': ['renew_id', 'book_appointment'],
        'ecommerce': ['search_product', 'add_to_cart'],
        'healthcare': ['book_doctor_appointment', 'get_lab_results'],
        'travel': ['book_flight'],
        'weather': ['get_weather'],
    }

    names = category_map.get(category, [])
    resolved = (get_schema_by_name(name) for name in names)
    return [schema for schema in resolved if schema]
|
afcl/evaluators/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Evaluators
|
| 3 |
+
===============
|
| 4 |
+
|
| 5 |
+
Evaluation modules for Arabic function calling.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .ast_evaluator import ArabicASTEvaluator
|
| 9 |
+
from .arabic_utils import ArabicNormalizer
|
| 10 |
+
|
| 11 |
+
__all__ = ['ArabicASTEvaluator', 'ArabicNormalizer']
|
afcl/evaluators/arabic_utils.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Text Utilities
|
| 3 |
+
=====================
|
| 4 |
+
|
| 5 |
+
Utilities for normalizing and processing Arabic text for function calling evaluation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
import unicodedata
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ArabicNormalizer:
    """Normalize Arabic text for consistent comparison.

    Each normalization step can be toggled independently via the
    constructor flags. Character-for-character substitutions (alef
    variants and digit forms) are precompiled into str.translate tables
    so each call makes a single C-level pass instead of chained
    str.replace loops over the substitution dicts.
    """

    # Arabic diacritics (tashkeel) to remove
    ARABIC_DIACRITICS = re.compile(r'[\u064B-\u065F\u0670]')

    # Alef variants to normalize
    ALEF_VARIANTS = {
        '\u0622': '\u0627',  # آ -> ا
        '\u0623': '\u0627',  # أ -> ا
        '\u0625': '\u0627',  # إ -> ا
        '\u0671': '\u0627',  # ٱ -> ا
    }

    # Ta marbuta to ha
    TA_MARBUTA = '\u0629'
    HA = '\u0647'

    # Arabic-Indic numerals to Western
    ARABIC_INDIC_NUMERALS = {
        '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
        '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9',
    }

    # Extended Arabic-Indic numerals (Persian/Urdu)
    EXTENDED_NUMERALS = {
        '\u06F0': '0', '\u06F1': '1', '\u06F2': '2', '\u06F3': '3', '\u06F4': '4',
        '\u06F5': '5', '\u06F6': '6', '\u06F7': '7', '\u06F8': '8', '\u06F9': '9',
    }

    # Translation tables built once at class creation (str.translate is a
    # single pass; the dicts above remain the public source of truth).
    _ALEF_TABLE = str.maketrans(ALEF_VARIANTS)
    _NUMERAL_TABLE = str.maketrans({**ARABIC_INDIC_NUMERALS, **EXTENDED_NUMERALS})

    def __init__(
        self,
        remove_diacritics: bool = True,
        normalize_alef: bool = True,
        normalize_ta_marbuta: bool = False,
        normalize_numerals: bool = True,
        lowercase: bool = True,
        strip_whitespace: bool = True
    ):
        self.remove_diacritics = remove_diacritics
        self.normalize_alef = normalize_alef
        self.normalize_ta_marbuta = normalize_ta_marbuta
        self.normalize_numerals = normalize_numerals
        self.lowercase = lowercase
        self.strip_whitespace = strip_whitespace

    def normalize(self, text: str) -> str:
        """Apply all configured normalizations to text.

        Returns "" for empty/None input. Steps run in a fixed order:
        NFC, diacritic removal, alef folding, ta-marbuta folding,
        numeral conversion, lowercasing (affects Latin characters),
        whitespace collapsing.
        """
        if not text:
            return ""

        # Unicode canonical composition first so later passes see a
        # consistent character sequence.
        text = unicodedata.normalize('NFC', text)

        if self.remove_diacritics:
            text = self.ARABIC_DIACRITICS.sub('', text)

        if self.normalize_alef:
            text = text.translate(self._ALEF_TABLE)

        if self.normalize_ta_marbuta:
            text = text.replace(self.TA_MARBUTA, self.HA)

        if self.normalize_numerals:
            text = text.translate(self._NUMERAL_TABLE)

        # Lowercase matters for Latin characters in function names;
        # Arabic script is caseless.
        if self.lowercase:
            text = text.lower()

        # Collapse runs of whitespace and trim the ends.
        if self.strip_whitespace:
            text = ' '.join(text.split())

        return text

    def normalize_for_comparison(self, text: str) -> str:
        """Aggressive normalization for fuzzy matching.

        Applies normalize() and additionally strips all punctuation,
        then re-collapses whitespace.
        """
        text = self.normalize(text)
        # Remove all punctuation (keeps word characters and spaces).
        text = re.sub(r'[^\w\s]', '', text)
        text = ' '.join(text.split())
        return text
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def extract_arabic_numbers(text: str) -> list:
    """Extract numeric tokens (ints or decimals) from Arabic text.

    Arabic-Indic digits are converted to Western digits before matching,
    so both numeral systems are found.
    """
    digits_normalized = ArabicNormalizer(normalize_numerals=True).normalize(text)
    return re.findall(r'\d+(?:\.\d+)?', digits_normalized)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def is_arabic_text(text: str) -> bool:
    """Return True if the text contains at least one Arabic-script character.

    Covers the main Arabic block plus the Supplement and Extended-A ranges.
    """
    return re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', text) is not None
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def detect_dialect(text: str) -> str:
|
| 120 |
+
"""
|
| 121 |
+
Simple dialect detection based on common markers.
|
| 122 |
+
Returns: 'egyptian', 'gulf', 'levantine', or 'msa' (Modern Standard Arabic)
|
| 123 |
+
"""
|
| 124 |
+
text_lower = text.lower()
|
| 125 |
+
|
| 126 |
+
# Egyptian markers
|
| 127 |
+
egyptian_markers = ['ازاي', 'عايز', 'كده', 'ده', 'دي', 'بتاع', 'اوي', 'خالص']
|
| 128 |
+
if any(marker in text for marker in egyptian_markers):
|
| 129 |
+
return 'egyptian'
|
| 130 |
+
|
| 131 |
+
# Gulf markers
|
| 132 |
+
gulf_markers = ['شلون', 'ابي', 'ابغى', 'وايد', 'زين', 'حيل', 'يالله']
|
| 133 |
+
if any(marker in text for marker in gulf_markers):
|
| 134 |
+
return 'gulf'
|
| 135 |
+
|
| 136 |
+
# Levantine markers
|
| 137 |
+
levantine_markers = ['كيفك', 'شو', 'هيك', 'منيح', 'كتير', 'هلق', 'بدي']
|
| 138 |
+
if any(marker in text for marker in levantine_markers):
|
| 139 |
+
return 'levantine'
|
| 140 |
+
|
| 141 |
+
return 'msa'
|
afcl/evaluators/ast_evaluator.py
ADDED
|
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AST-Based Function Call Evaluator
|
| 3 |
+
=================================
|
| 4 |
+
|
| 5 |
+
Evaluates model predictions against ground truth using AST-based matching.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from .arabic_utils import ArabicNormalizer
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class EvaluationResult:
    """Result of evaluating a single sample."""
    # Identifier of the evaluated benchmark sample.
    sample_id: str
    # Category the sample belongs to (used for per-category aggregation).
    category: str
    # Whether the prediction matched ground truth under the chosen mode.
    is_correct: bool
    # Partial-credit score for the sample (presumably in [0, 1] — confirm with callers).
    score: float
    # Free-form diagnostics, e.g. matched/mismatched/missing argument lists.
    details: Dict[str, Any]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class ArabicASTEvaluator:
|
| 26 |
+
"""
|
| 27 |
+
AST-based evaluator for Arabic function calling.
|
| 28 |
+
|
| 29 |
+
Supports multiple evaluation modes:
|
| 30 |
+
- exact: Exact match of function name and all arguments
|
| 31 |
+
- relaxed: Allows minor variations in argument values
|
| 32 |
+
- function_only: Only checks if correct function was called
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
    def __init__(self, mode: str = "exact"):
        # Evaluation mode: 'exact', 'relaxed', or 'function_only' (see class docstring).
        # NOTE(review): the mode string is stored but not validated here.
        self.mode = mode
        # Shared Arabic text normalizer used when comparing string values.
        self.normalizer = ArabicNormalizer()
|
| 38 |
+
|
| 39 |
+
def parse_function_call(self, response: str) -> Optional[Dict]:
|
| 40 |
+
"""
|
| 41 |
+
Parse a function call from model response.
|
| 42 |
+
Handles multiple formats:
|
| 43 |
+
- JSON: {"name": "func", "arguments": {...}}
|
| 44 |
+
- OpenAI style: {"function_call": {"name": "func", "arguments": "..."}}
|
| 45 |
+
- Plain text: func(arg1, arg2)
|
| 46 |
+
"""
|
| 47 |
+
if not response:
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
response = response.strip()
|
| 51 |
+
|
| 52 |
+
# Try JSON format first
|
| 53 |
+
try:
|
| 54 |
+
data = json.loads(response)
|
| 55 |
+
if isinstance(data, dict):
|
| 56 |
+
# Direct format
|
| 57 |
+
if 'name' in data and 'arguments' in data:
|
| 58 |
+
args = data['arguments']
|
| 59 |
+
if isinstance(args, str):
|
| 60 |
+
args = json.loads(args)
|
| 61 |
+
return {'name': data['name'], 'arguments': args}
|
| 62 |
+
# OpenAI format
|
| 63 |
+
if 'function_call' in data:
|
| 64 |
+
fc = data['function_call']
|
| 65 |
+
args = fc.get('arguments', {})
|
| 66 |
+
if isinstance(args, str):
|
| 67 |
+
args = json.loads(args)
|
| 68 |
+
return {'name': fc['name'], 'arguments': args}
|
| 69 |
+
# Tool calls format
|
| 70 |
+
if 'tool_calls' in data and data['tool_calls']:
|
| 71 |
+
tc = data['tool_calls'][0]
|
| 72 |
+
func = tc.get('function', tc)
|
| 73 |
+
args = func.get('arguments', {})
|
| 74 |
+
if isinstance(args, str):
|
| 75 |
+
args = json.loads(args)
|
| 76 |
+
return {'name': func['name'], 'arguments': args}
|
| 77 |
+
except (json.JSONDecodeError, KeyError, TypeError):
|
| 78 |
+
pass
|
| 79 |
+
|
| 80 |
+
# Try extracting JSON from text
|
| 81 |
+
json_match = re.search(r'\{[^{}]*"name"[^{}]*\}', response, re.DOTALL)
|
| 82 |
+
if json_match:
|
| 83 |
+
try:
|
| 84 |
+
data = json.loads(json_match.group())
|
| 85 |
+
if 'name' in data:
|
| 86 |
+
args = data.get('arguments', data.get('parameters', {}))
|
| 87 |
+
if isinstance(args, str):
|
| 88 |
+
args = json.loads(args)
|
| 89 |
+
return {'name': data['name'], 'arguments': args}
|
| 90 |
+
except (json.JSONDecodeError, KeyError):
|
| 91 |
+
pass
|
| 92 |
+
|
| 93 |
+
# Try plain text function call format: func(args)
|
| 94 |
+
func_match = re.match(r'(\w+)\s*\((.*)\)', response, re.DOTALL)
|
| 95 |
+
if func_match:
|
| 96 |
+
name = func_match.group(1)
|
| 97 |
+
args_str = func_match.group(2).strip()
|
| 98 |
+
try:
|
| 99 |
+
# Try parsing as JSON
|
| 100 |
+
if args_str.startswith('{'):
|
| 101 |
+
args = json.loads(args_str)
|
| 102 |
+
else:
|
| 103 |
+
# Parse as key=value pairs
|
| 104 |
+
args = {}
|
| 105 |
+
for pair in args_str.split(','):
|
| 106 |
+
if '=' in pair:
|
| 107 |
+
k, v = pair.split('=', 1)
|
| 108 |
+
args[k.strip()] = self._parse_value(v.strip())
|
| 109 |
+
return {'name': name, 'arguments': args}
|
| 110 |
+
except:
|
| 111 |
+
pass
|
| 112 |
+
|
| 113 |
+
return None
|
| 114 |
+
|
| 115 |
+
def parse_multiple_calls(self, response: str) -> List[Dict]:
|
| 116 |
+
"""Parse multiple function calls from response."""
|
| 117 |
+
calls = []
|
| 118 |
+
|
| 119 |
+
if not response:
|
| 120 |
+
return calls
|
| 121 |
+
|
| 122 |
+
# Try JSON array
|
| 123 |
+
try:
|
| 124 |
+
data = json.loads(response)
|
| 125 |
+
if isinstance(data, list):
|
| 126 |
+
for item in data:
|
| 127 |
+
parsed = self.parse_function_call(json.dumps(item))
|
| 128 |
+
if parsed:
|
| 129 |
+
calls.append(parsed)
|
| 130 |
+
return calls
|
| 131 |
+
elif isinstance(data, dict) and 'tool_calls' in data:
|
| 132 |
+
for tc in data['tool_calls']:
|
| 133 |
+
func = tc.get('function', tc)
|
| 134 |
+
args = func.get('arguments', {})
|
| 135 |
+
if isinstance(args, str):
|
| 136 |
+
args = json.loads(args)
|
| 137 |
+
calls.append({'name': func['name'], 'arguments': args})
|
| 138 |
+
return calls
|
| 139 |
+
except (json.JSONDecodeError, KeyError, TypeError):
|
| 140 |
+
pass
|
| 141 |
+
|
| 142 |
+
# Try finding multiple JSON objects
|
| 143 |
+
json_pattern = r'\{[^{}]*"name"[^{}]*\}'
|
| 144 |
+
matches = re.findall(json_pattern, response, re.DOTALL)
|
| 145 |
+
for match in matches:
|
| 146 |
+
parsed = self.parse_function_call(match)
|
| 147 |
+
if parsed:
|
| 148 |
+
calls.append(parsed)
|
| 149 |
+
|
| 150 |
+
# If no calls found, try single call
|
| 151 |
+
if not calls:
|
| 152 |
+
single = self.parse_function_call(response)
|
| 153 |
+
if single:
|
| 154 |
+
calls.append(single)
|
| 155 |
+
|
| 156 |
+
return calls
|
| 157 |
+
|
| 158 |
+
def _parse_value(self, value: str) -> Any:
|
| 159 |
+
"""Parse a string value to appropriate type."""
|
| 160 |
+
value = value.strip().strip('"\'')
|
| 161 |
+
# Try numeric
|
| 162 |
+
try:
|
| 163 |
+
if '.' in value:
|
| 164 |
+
return float(value)
|
| 165 |
+
return int(value)
|
| 166 |
+
except ValueError:
|
| 167 |
+
pass
|
| 168 |
+
# Boolean
|
| 169 |
+
if value.lower() in ('true', 'false'):
|
| 170 |
+
return value.lower() == 'true'
|
| 171 |
+
# None
|
| 172 |
+
if value.lower() in ('none', 'null'):
|
| 173 |
+
return None
|
| 174 |
+
return value
|
| 175 |
+
|
| 176 |
+
def normalize_value(self, value: Any) -> Any:
    """Recursively normalize a value prior to comparison.

    Strings go through the configured normalizer; dicts and
    lists/tuples are rebuilt with each element normalized (tuples are
    converted to lists); every other type passes through unchanged.
    """
    if isinstance(value, str):
        return self.normalizer.normalize(value)
    if isinstance(value, dict):
        return {key: self.normalize_value(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [self.normalize_value(item) for item in value]
    return value
|
| 185 |
+
|
| 186 |
+
def compare_arguments(
    self,
    predicted: Dict[str, Any],
    expected: Dict[str, Any],
    strict: bool = True
) -> Tuple[bool, float, Dict]:
    """
    Compare predicted arguments against expected.

    Args:
        predicted: Argument dict produced by the model.
        expected: Ground-truth argument dict.
        strict: When True, a match requires every expected key to match
            AND no extra keys; the score denominator also counts extras.
            When False, extras are ignored (partial credit).

    Returns:
        (is_match, score, details) where ``details`` lists matched,
        mismatched, missing, and extra keys.
    """
    import math  # local import keeps this block self-contained

    if not expected:
        # Nothing expected: correct only if nothing was predicted.
        return len(predicted) == 0, 1.0 if len(predicted) == 0 else 0.0, {}

    details = {'matched': [], 'mismatched': [], 'missing': [], 'extra': []}

    expected_keys = set(expected.keys())
    predicted_keys = set(predicted.keys())

    # Keys the model omitted, and keys it invented.
    details['missing'] = list(expected_keys - predicted_keys)
    details['extra'] = list(predicted_keys - expected_keys)

    # Compare values on the keys both sides share.
    matched_count = 0
    for key in expected_keys & predicted_keys:
        exp_val = self.normalize_value(expected[key])
        pred_val = self.normalize_value(predicted[key])

        if exp_val == pred_val:
            details['matched'].append(key)
            matched_count += 1
            continue

        # Numeric near-miss: keep the original absolute tolerance (1e-3)
        # and additionally allow a small relative error so that large
        # magnitudes are not rejected for mere float rounding.
        if isinstance(exp_val, (int, float)) and isinstance(pred_val, (int, float)):
            if math.isclose(exp_val, pred_val, rel_tol=1e-6, abs_tol=0.001):
                details['matched'].append(key)
                matched_count += 1
                continue

        details['mismatched'].append({
            'key': key,
            'expected': expected[key],
            'predicted': predicted[key]
        })

    # Aggregate into a single score.
    total_expected = len(expected_keys)
    if strict:
        # All expected keys must match and no extras are allowed; extras
        # also inflate the denominator.
        is_match = (matched_count == total_expected and len(details['extra']) == 0)
        score = matched_count / max(total_expected, len(predicted_keys)) if predicted_keys else 0.0
    else:
        # Partial credit over expected keys only.
        is_match = matched_count == total_expected
        score = matched_count / total_expected if total_expected > 0 else 1.0

    return is_match, score, details
|
| 248 |
+
|
| 249 |
+
def evaluate_single_call(
    self,
    predicted: Optional[Dict],
    expected: Dict
) -> EvaluationResult:
    """Score one predicted function call against one expected call.

    The result is returned with empty ``sample_id``/``category``; the
    caller is responsible for filling those in.
    """
    # An unparseable prediction is an automatic failure.
    if predicted is None:
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=False,
            score=0.0,
            details={'error': 'Failed to parse prediction'}
        )

    # The (normalized) function names must agree before arguments matter.
    predicted_name = self.normalizer.normalize(predicted.get('name', ''))
    expected_name = self.normalizer.normalize(expected.get('name', ''))
    if predicted_name != expected_name:
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=False,
            score=0.0,
            details={
                'error': 'Function name mismatch',
                'expected_name': expected.get('name'),
                'predicted_name': predicted.get('name')
            }
        )

    # Names agree: the argument comparison decides the outcome.
    # "exact" mode requires a strict match (every key, no extras).
    is_match, score, arg_details = self.compare_arguments(
        predicted.get('arguments', {}),
        expected.get('arguments', {}),
        strict=(self.mode == 'exact')
    )
    return EvaluationResult(
        sample_id="",
        category="",
        is_correct=is_match,
        score=score,
        details=arg_details
    )
|
| 296 |
+
|
| 297 |
+
def evaluate_parallel_calls(
    self,
    predicted: List[Dict],
    expected: List[Dict]
) -> EvaluationResult:
    """
    Evaluate parallel function calls (order-agnostic).

    Each predicted call is paired with the best-scoring expected call
    that has not yet been claimed. ``is_correct`` requires every
    expected call to be matched with score 1.0 AND equal call counts;
    ``score`` averages pairing scores over the larger of the two sides.

    NOTE(review): the pairing below is greedy in prediction order, not
    optimal bipartite matching — an early prediction can claim an
    expected call that a later prediction matches better. The returned
    result has empty ``sample_id``/``category``; the caller fills them.
    """
    if len(predicted) == 0 and len(expected) == 0:
        # Nothing expected and nothing produced: trivially correct.
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=True,
            score=1.0,
            details={'matched_calls': 0}
        )

    if len(predicted) == 0:
        # Calls were expected but the model produced none.
        return EvaluationResult(
            sample_id="",
            category="",
            is_correct=False,
            score=0.0,
            details={'error': 'No predictions', 'expected_count': len(expected)}
        )

    # Build score matrix: scores[i][j] = score of predicted[i] vs expected[j].
    scores = []
    for pred in predicted:
        row = []
        for exp in expected:
            result = self.evaluate_single_call(pred, exp)
            row.append(result.score)
        scores.append(row)

    # Greedy matching (could use Hungarian algorithm for optimal)
    matched = 0          # count of perfect (score == 1.0) pairings
    total_score = 0.0    # sum of chosen pairing scores
    used_expected = set()
    match_details = []

    for i, pred in enumerate(predicted):
        best_j = -1
        best_score = -1

        # Pick the highest-scoring expected call not yet claimed.
        for j, exp in enumerate(expected):
            if j not in used_expected and scores[i][j] > best_score:
                best_score = scores[i][j]
                best_j = j

        # Only zero-score pairings are discarded entirely.
        if best_j >= 0 and best_score > 0:
            used_expected.add(best_j)
            total_score += best_score
            if best_score == 1.0:
                matched += 1
            match_details.append({
                'predicted': pred,
                'matched_to': expected[best_j],
                'score': best_score
            })

    # Calculate overall score: average over the larger side so both
    # missing and surplus calls dilute the score.
    max_possible = max(len(predicted), len(expected))
    avg_score = total_score / max_possible if max_possible > 0 else 0.0

    is_correct = (matched == len(expected) and len(predicted) == len(expected))

    return EvaluationResult(
        sample_id="",
        category="",
        is_correct=is_correct,
        score=avg_score,
        details={
            'matched_calls': matched,
            'expected_count': len(expected),
            'predicted_count': len(predicted),
            'matches': match_details
        }
    )
|
| 377 |
+
|
| 378 |
+
def evaluate_irrelevance(
    self,
    predicted: Union[str, Dict, List],
    expected_no_call: bool = True
) -> EvaluationResult:
    """
    Score irrelevance detection.

    Args:
        predicted: Raw response text, a single call dict, or a list of
            call dicts.
        expected_no_call: When True the model is rewarded for NOT
            calling any function; when False it must call at least one.

    Returns:
        EvaluationResult with category ``"irrelevance"``.
    """
    # Normalize whatever we received into a list of call dicts.
    if isinstance(predicted, str):
        detected_calls = self.parse_multiple_calls(predicted)
    elif isinstance(predicted, list):
        detected_calls = predicted
    elif isinstance(predicted, dict):
        detected_calls = [predicted] if 'name' in predicted else []
    else:
        detected_calls = []

    made_call = bool(detected_calls)
    # Correct when the model's behavior is the opposite of / equal to
    # what the sample demands: no call for irrelevant prompts, at least
    # one call otherwise.
    is_correct = (made_call != expected_no_call)

    if expected_no_call:
        details = {
            'expected': 'no_call',
            'actual': 'call_made' if made_call else 'no_call',
            'calls_made': detected_calls
        }
    else:
        details = {
            'expected': 'call_required',
            'actual': 'call_made' if made_call else 'no_call'
        }

    return EvaluationResult(
        sample_id="",
        category="irrelevance",
        is_correct=is_correct,
        score=1.0 if is_correct else 0.0,
        details=details
    )
|
| 421 |
+
|
| 422 |
+
def evaluate(
    self,
    sample: Dict,
    prediction: str
) -> EvaluationResult:
    """
    Main evaluation entry point.
    Dispatches to appropriate evaluator based on category.

    Args:
        sample: Benchmark sample; reads 'category' (default 'simple'),
            'id', and 'ground_truth' (which may be a JSON-encoded string
            or an already-parsed object).
        prediction: Raw model output text.

    Returns:
        EvaluationResult with sample_id (and, except for irrelevance,
        category) filled in.
    """
    category = sample.get('category', 'simple')
    sample_id = sample.get('id', '')

    # Parse ground truth
    # Ground truth may arrive JSON-encoded; an unparseable string is
    # treated the same as missing ground truth.
    ground_truth = sample.get('ground_truth')
    if isinstance(ground_truth, str) and ground_truth:
        try:
            ground_truth = json.loads(ground_truth)
        except json.JSONDecodeError:
            ground_truth = None

    # Handle irrelevance
    # Irrelevance samples are correct when NO function call is made.
    if category == 'irrelevance':
        result = self.evaluate_irrelevance(prediction, expected_no_call=True)
        result.sample_id = sample_id
        return result

    # Parse prediction
    if category in ('parallel', 'parallel_multiple'):
        # Multi-call categories: order-agnostic matching against the
        # expected 'calls' list.
        pred_calls = self.parse_multiple_calls(prediction)
        # NOTE(review): assumes ground_truth is a dict with a 'calls'
        # key; if the dataset ever supplies a bare list of calls this
        # silently falls back to an empty expectation — confirm against
        # the dataset schema.
        if ground_truth and 'calls' in ground_truth:
            exp_calls = ground_truth['calls']
        else:
            exp_calls = []
        result = self.evaluate_parallel_calls(pred_calls, exp_calls)
    else:
        # Single-call categories.
        pred_call = self.parse_function_call(prediction)
        if ground_truth:
            if 'calls' in ground_truth and ground_truth['calls']:
                # Wrapped form: use the first expected call.
                exp_call = ground_truth['calls'][0]
            else:
                # Bare form: the ground truth object IS the expected call.
                exp_call = ground_truth
        else:
            # No ground truth available
            result = EvaluationResult(
                sample_id=sample_id,
                category=category,
                is_correct=False,
                score=0.0,
                details={'error': 'No ground truth available'}
            )
            return result
        result = self.evaluate_single_call(pred_call, exp_call)

    result.sample_id = sample_id
    result.category = category
    return result
|
afcl/requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Arabic Function Calling Leaderboard Requirements
|
| 2 |
+
|
| 3 |
+
# Core
|
| 4 |
+
gradio>=4.0.0
|
| 5 |
+
datasets>=2.14.0
|
| 6 |
+
huggingface_hub>=0.19.0
|
| 7 |
+
pandas>=2.0.0
|
| 8 |
+
|
| 9 |
+
# Visualization
|
| 10 |
+
plotly>=5.18.0
|
| 11 |
+
|
| 12 |
+
# Evaluation
|
| 13 |
+
transformers>=4.35.0
|
| 14 |
+
torch>=2.0.0
|
| 15 |
+
|
| 16 |
+
# Arabic NLP
|
| 17 |
+
camel-tools>=1.5.0 # Optional: for advanced Arabic processing
|
| 18 |
+
|
| 19 |
+
# Utilities
|
| 20 |
+
python-dotenv>=1.0.0
|
| 21 |
+
tqdm>=4.66.0
|
afcl/static/styles.css
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Arabic Function Calling Leaderboard - RTL Arabic Styles */
|
| 2 |
+
|
| 3 |
+
/* Import Arabic fonts */
|
| 4 |
+
@import url('https://fonts.googleapis.com/css2?family=Noto+Kufi+Arabic:wght@400;500;600;700&family=Noto+Naskh+Arabic:wght@400;500;600;700&display=swap');
|
| 5 |
+
|
| 6 |
+
/* RTL Support */
|
| 7 |
+
[dir="rtl"],
|
| 8 |
+
.rtl {
|
| 9 |
+
direction: rtl;
|
| 10 |
+
text-align: right;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
/* Arabic text styling */
|
| 14 |
+
.arabic-text {
|
| 15 |
+
font-family: 'Noto Kufi Arabic', 'Noto Naskh Arabic', 'Arial', sans-serif;
|
| 16 |
+
line-height: 1.8;
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
/* Leaderboard header */
|
| 20 |
+
.leaderboard-header {
|
| 21 |
+
background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%);
|
| 22 |
+
color: white;
|
| 23 |
+
padding: 2rem;
|
| 24 |
+
border-radius: 12px;
|
| 25 |
+
margin-bottom: 1.5rem;
|
| 26 |
+
text-align: center;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
.leaderboard-header h1 {
|
| 30 |
+
font-size: 2rem;
|
| 31 |
+
margin-bottom: 0.5rem;
|
| 32 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
.leaderboard-header .subtitle {
|
| 36 |
+
font-size: 1.1rem;
|
| 37 |
+
opacity: 0.9;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
/* Table styling */
|
| 41 |
+
.leaderboard-table {
|
| 42 |
+
width: 100%;
|
| 43 |
+
border-collapse: collapse;
|
| 44 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.leaderboard-table th,
|
| 48 |
+
.leaderboard-table td {
|
| 49 |
+
padding: 12px 16px;
|
| 50 |
+
text-align: center;
|
| 51 |
+
border-bottom: 1px solid #e0e0e0;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.leaderboard-table th {
|
| 55 |
+
background-color: #f5f5f5;
|
| 56 |
+
font-weight: 600;
|
| 57 |
+
color: #333;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
.leaderboard-table tr:hover {
|
| 61 |
+
background-color: #f9f9f9;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
/* Rank badges */
|
| 65 |
+
.rank-badge {
|
| 66 |
+
display: inline-flex;
|
| 67 |
+
align-items: center;
|
| 68 |
+
justify-content: center;
|
| 69 |
+
width: 32px;
|
| 70 |
+
height: 32px;
|
| 71 |
+
border-radius: 50%;
|
| 72 |
+
font-weight: bold;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.rank-1 {
|
| 76 |
+
background: linear-gradient(135deg, #ffd700, #ffed4a);
|
| 77 |
+
color: #8b6914;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.rank-2 {
|
| 81 |
+
background: linear-gradient(135deg, #c0c0c0, #e8e8e8);
|
| 82 |
+
color: #666;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.rank-3 {
|
| 86 |
+
background: linear-gradient(135deg, #cd7f32, #daa520);
|
| 87 |
+
color: #5c3d1e;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.rank-other {
|
| 91 |
+
background-color: #f0f0f0;
|
| 92 |
+
color: #666;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
/* Score cells */
|
| 96 |
+
.score-cell {
|
| 97 |
+
font-weight: 500;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
.score-high {
|
| 101 |
+
color: #2ca02c;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.score-medium {
|
| 105 |
+
color: #1f77b4;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.score-low {
|
| 109 |
+
color: #ff7f0e;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.score-very-low {
|
| 113 |
+
color: #d62728;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/* Model name styling */
|
| 117 |
+
.model-name {
|
| 118 |
+
font-weight: 600;
|
| 119 |
+
color: #1a5f2a;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
/* Category tabs */
|
| 123 |
+
.category-tabs {
|
| 124 |
+
display: flex;
|
| 125 |
+
gap: 8px;
|
| 126 |
+
flex-wrap: wrap;
|
| 127 |
+
margin-bottom: 1rem;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.category-tab {
|
| 131 |
+
padding: 8px 16px;
|
| 132 |
+
border-radius: 20px;
|
| 133 |
+
background-color: #f0f0f0;
|
| 134 |
+
color: #666;
|
| 135 |
+
cursor: pointer;
|
| 136 |
+
transition: all 0.2s ease;
|
| 137 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.category-tab:hover {
|
| 141 |
+
background-color: #e0e0e0;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.category-tab.active {
|
| 145 |
+
background-color: #1a5f2a;
|
| 146 |
+
color: white;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
/* Cards */
|
| 150 |
+
.stat-card {
|
| 151 |
+
background: white;
|
| 152 |
+
border-radius: 12px;
|
| 153 |
+
padding: 1.5rem;
|
| 154 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
| 155 |
+
text-align: center;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
.stat-card .value {
|
| 159 |
+
font-size: 2.5rem;
|
| 160 |
+
font-weight: bold;
|
| 161 |
+
color: #1a5f2a;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.stat-card .label {
|
| 165 |
+
font-size: 0.9rem;
|
| 166 |
+
color: #666;
|
| 167 |
+
margin-top: 0.5rem;
|
| 168 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
/* Language toggle */
|
| 172 |
+
.lang-toggle {
|
| 173 |
+
display: flex;
|
| 174 |
+
gap: 8px;
|
| 175 |
+
justify-content: flex-end;
|
| 176 |
+
margin-bottom: 1rem;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
.lang-btn {
|
| 180 |
+
padding: 6px 12px;
|
| 181 |
+
border-radius: 4px;
|
| 182 |
+
border: 1px solid #ddd;
|
| 183 |
+
background: white;
|
| 184 |
+
cursor: pointer;
|
| 185 |
+
transition: all 0.2s ease;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
.lang-btn.active {
|
| 189 |
+
background-color: #1a5f2a;
|
| 190 |
+
color: white;
|
| 191 |
+
border-color: #1a5f2a;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
/* Submission form */
|
| 195 |
+
.submission-form {
|
| 196 |
+
background: #f9f9f9;
|
| 197 |
+
border-radius: 12px;
|
| 198 |
+
padding: 2rem;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.submission-form h2 {
|
| 202 |
+
margin-bottom: 1.5rem;
|
| 203 |
+
color: #333;
|
| 204 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.form-field {
|
| 208 |
+
margin-bottom: 1.5rem;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
.form-field label {
|
| 212 |
+
display: block;
|
| 213 |
+
margin-bottom: 0.5rem;
|
| 214 |
+
font-weight: 500;
|
| 215 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.form-field input,
|
| 219 |
+
.form-field select {
|
| 220 |
+
width: 100%;
|
| 221 |
+
padding: 10px 12px;
|
| 222 |
+
border: 1px solid #ddd;
|
| 223 |
+
border-radius: 8px;
|
| 224 |
+
font-size: 1rem;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.submit-btn {
|
| 228 |
+
background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%);
|
| 229 |
+
color: white;
|
| 230 |
+
padding: 12px 24px;
|
| 231 |
+
border: none;
|
| 232 |
+
border-radius: 8px;
|
| 233 |
+
font-size: 1rem;
|
| 234 |
+
font-weight: 600;
|
| 235 |
+
cursor: pointer;
|
| 236 |
+
transition: transform 0.2s ease;
|
| 237 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.submit-btn:hover {
|
| 241 |
+
transform: translateY(-2px);
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
/* About section */
|
| 245 |
+
.about-section {
|
| 246 |
+
line-height: 1.8;
|
| 247 |
+
font-family: 'Noto Kufi Arabic', sans-serif;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
.about-section h2 {
|
| 251 |
+
color: #1a5f2a;
|
| 252 |
+
margin-top: 2rem;
|
| 253 |
+
margin-bottom: 1rem;
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
.about-section ul {
|
| 257 |
+
padding-right: 2rem;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
.about-section li {
|
| 261 |
+
margin-bottom: 0.5rem;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
/* Responsive design */
|
| 265 |
+
@media (max-width: 768px) {
|
| 266 |
+
.leaderboard-header h1 {
|
| 267 |
+
font-size: 1.5rem;
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
.leaderboard-table {
|
| 271 |
+
font-size: 0.85rem;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.leaderboard-table th,
|
| 275 |
+
.leaderboard-table td {
|
| 276 |
+
padding: 8px 10px;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.stat-card .value {
|
| 280 |
+
font-size: 2rem;
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
/* Dark mode support */
|
| 285 |
+
@media (prefers-color-scheme: dark) {
|
| 286 |
+
.leaderboard-table th {
|
| 287 |
+
background-color: #2a2a2a;
|
| 288 |
+
color: #e0e0e0;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
.leaderboard-table td {
|
| 292 |
+
border-color: #3a3a3a;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.leaderboard-table tr:hover {
|
| 296 |
+
background-color: #2a2a2a;
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
.stat-card {
|
| 300 |
+
background: #1a1a1a;
|
| 301 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
.submission-form {
|
| 305 |
+
background: #1a1a1a;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
.category-tab {
|
| 309 |
+
background-color: #2a2a2a;
|
| 310 |
+
color: #ccc;
|
| 311 |
+
}
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
/* Gradio overrides */
|
| 315 |
+
.gradio-container {
|
| 316 |
+
font-family: 'Noto Kufi Arabic', 'Arial', sans-serif !important;
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
.gradio-container .prose {
|
| 320 |
+
font-family: 'Noto Kufi Arabic', 'Arial', sans-serif !important;
|
| 321 |
+
}
|
afcl/submission/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Submission Module
|
| 3 |
+
======================
|
| 4 |
+
|
| 5 |
+
Model submission and evaluation queue management.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .handler import SubmissionHandler
|
| 9 |
+
|
| 10 |
+
__all__ = ['SubmissionHandler']
|
afcl/submission/handler.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Submission Handler
|
| 3 |
+
==================
|
| 4 |
+
|
| 5 |
+
Handles model submission workflow for the leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, Optional
|
| 13 |
+
from dataclasses import dataclass, asdict
|
| 14 |
+
import uuid
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class Submission:
    """Model submission data.

    One leaderboard submission; serialized to/from JSON by
    SubmissionHandler (_save_submission / get_submission).
    """
    # Short unique identifier (first 8 characters of a uuid4 string,
    # as assigned by SubmissionHandler.create_submission).
    id: str
    # Display name shown on the leaderboard.
    model_name: str
    model_type: str  # "huggingface", "api", "local"
    # Hub repo id, API endpoint, or local path — interpretation
    # presumably depends on model_type; confirm against the evaluator.
    model_path: str
    # Numeric precision requested for evaluation (e.g. "float16").
    precision: str
    # Optional base model name (for fine-tunes).
    base_model: Optional[str]
    # License identifier string (e.g. "Apache-2.0").
    license: str
    # ISO-8601 timestamp from datetime.now().isoformat(); used as the
    # sort key when listing submissions.
    submitted_at: str
    status: str  # "pending", "running", "completed", "failed"
    # Evaluation results, set via SubmissionHandler.update_status.
    results: Optional[Dict] = None
    # Error details, set via SubmissionHandler.update_status on failure.
    error_message: Optional[str] = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SubmissionHandler:
    """Handles model submissions and evaluation queue.

    Each submission is persisted as one JSON file,
    ``<submissions_dir>/<id>.json``.
    """

    def __init__(self, submissions_dir: str = "data/submissions"):
        """Create the handler, ensuring the storage directory exists."""
        self.submissions_dir = Path(submissions_dir)
        self.submissions_dir.mkdir(parents=True, exist_ok=True)

    def create_submission(
        self,
        model_name: str,
        model_type: str,
        model_path: str,
        precision: str = "float16",
        base_model: Optional[str] = None,
        license: str = "Apache-2.0"
    ) -> Submission:
        """Create, persist, and return a new pending submission.

        Args:
            model_name: Display name for the leaderboard.
            model_type: One of "huggingface", "api", "local".
            model_path: Hub id, API endpoint, or filesystem path.
            precision: Numeric precision for evaluation.
            base_model: Optional base model (for fine-tunes).
            license: Model license identifier.

        Returns:
            The newly created Submission with status "pending".
        """
        submission = Submission(
            id=str(uuid.uuid4())[:8],  # short id; collision risk negligible at this scale
            model_name=model_name,
            model_type=model_type,
            model_path=model_path,
            precision=precision,
            base_model=base_model,
            license=license,
            submitted_at=datetime.now().isoformat(),
            status="pending"
        )
        self._save_submission(submission)
        return submission

    def _save_submission(self, submission: Submission):
        """Write a submission to ``<submissions_dir>/<id>.json``."""
        filepath = self.submissions_dir / f"{submission.id}.json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(asdict(submission), f, ensure_ascii=False, indent=2)

    def get_submission(self, submission_id: str) -> Optional[Submission]:
        """Load a submission by ID, or return None if it does not exist."""
        filepath = self.submissions_dir / f"{submission_id}.json"
        if not filepath.exists():
            return None

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return Submission(**data)

    def update_status(
        self,
        submission_id: str,
        status: str,
        results: Optional[Dict] = None,
        error_message: Optional[str] = None
    ):
        """Update a submission's status and optionally attach results/error.

        Silently does nothing when the submission id is unknown.
        """
        submission = self.get_submission(submission_id)
        if submission:
            submission.status = status
            if results:
                submission.results = results
            if error_message:
                submission.error_message = error_message
            self._save_submission(submission)

    def _load_submissions(self) -> list:
        """Load every readable submission file in the storage directory.

        A single corrupt or schema-mismatched JSON file should not break
        listing, so decode/constructor errors are skipped rather than
        raised.
        """
        submissions = []
        for filepath in self.submissions_dir.glob("*.json"):
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                submissions.append(Submission(**data))
            except (json.JSONDecodeError, TypeError):
                continue
        return submissions

    def get_pending_submissions(self) -> list:
        """Get all pending submissions, oldest first."""
        pending = [s for s in self._load_submissions() if s.status == 'pending']
        return sorted(pending, key=lambda s: s.submitted_at)

    def get_all_submissions(self) -> list:
        """Get all submissions, newest first."""
        return sorted(
            self._load_submissions(),
            key=lambda s: s.submitted_at,
            reverse=True
        )
|
afcl/visualization/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AFCL Visualization Module
|
| 3 |
+
=========================
|
| 4 |
+
|
| 5 |
+
Charts and visualizations for the leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .charts import create_radar_chart, create_bar_chart, create_category_comparison
|
| 9 |
+
|
| 10 |
+
__all__ = ['create_radar_chart', 'create_bar_chart', 'create_category_comparison']
|
afcl/visualization/charts.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualization Charts
|
| 3 |
+
====================
|
| 4 |
+
|
| 5 |
+
Plotly-based visualizations for the Arabic Function Calling Leaderboard.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import plotly.graph_objects as go
|
| 9 |
+
import plotly.express as px
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Arabic category names mapping
# Maps internal category ids to Arabic display labels; chart code falls
# back to the raw id when a category is missing from this table.
CATEGORY_NAMES_AR = {
    "simple": "بسيط",
    "multiple": "متعدد",
    "parallel": "متوازي",
    "parallel_multiple": "متوازي متعدد",
    "irrelevance": "اللا صلة",
    "dialect_handling": "اللهجات",
    "multi_turn": "متعدد الأدوار",
    "native_arabic": "العربي الأصلي",
    "java": "جافا",
    "javascript": "جافاسكريبت",
    "rest": "REST",
    "sql": "SQL"
}

# Color palette for models
# Hex colors (the common "tab10" palette); traces cycle through this
# list via index modulo its length.
MODEL_COLORS = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def create_radar_chart(
    model_scores: Dict[str, Dict[str, float]],
    categories: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Model Comparison"
) -> go.Figure:
    """
    Create a radar/spider chart comparing models across categories.

    Args:
        model_scores: Dict mapping model names to per-category scores (0-100).
            Missing categories default to 0.
        categories: Categories to include (defaults to the six main
            evaluation categories).
        use_arabic: Whether to use Arabic category labels.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    if categories is None:
        categories = ["simple", "multiple", "parallel", "parallel_multiple",
                      "irrelevance", "dialect_handling"]

    # Localize axis labels; fall back to the raw key when no Arabic
    # translation is registered.
    if use_arabic:
        labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        labels = list(categories)

    fig = go.Figure()

    # Guard against an explicitly empty category list: with no categories
    # there is nothing to plot, and `values[0]` below would raise
    # IndexError. We still return a valid (empty) figure.
    if categories:
        for i, (model_name, scores) in enumerate(model_scores.items()):
            values = [scores.get(cat, 0) for cat in categories]
            # Repeat the first point so the radar polygon closes on itself.
            values_closed = values + [values[0]]
            labels_closed = labels + [labels[0]]

            fig.add_trace(go.Scatterpolar(
                r=values_closed,
                theta=labels_closed,
                fill='toself',
                name=model_name,
                line_color=MODEL_COLORS[i % len(MODEL_COLORS)],
                opacity=0.7
            ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]  # scores are percentages
            )
        ),
        showlegend=True,
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        font=dict(
            # Arabic-capable font stack when rendering Arabic labels.
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        )
    )

    return fig
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def create_bar_chart(
    leaderboard_data: List[Dict],
    metric: str = "overall",
    top_n: int = 10,
    use_arabic: bool = True,
    title: str = "Top Models"
) -> go.Figure:
    """
    Build a horizontal bar chart of the top-scoring models.

    Args:
        leaderboard_data: List of model entries with scores.
        metric: Score field to rank and display (default: 'overall').
        top_n: How many leading models to show.
        use_arabic: Whether labels are rendered in Arabic.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    def _bar_color(value: float) -> str:
        # Bucket a score into a qualitative color band.
        if value >= 80:
            return '#2ca02c'  # Green
        if value >= 60:
            return '#1f77b4'  # Blue
        if value >= 40:
            return '#ff7f0e'  # Orange
        return '#d62728'      # Red

    # Rank descending, keep the leaders, then flip the slice so the best
    # model renders at the top of the horizontal chart.
    ranked = sorted(
        leaderboard_data,
        key=lambda entry: entry.get(metric, 0),
        reverse=True
    )[:top_n]
    ranked.reverse()

    names = [entry.get('model', entry.get('name', 'Unknown')) for entry in ranked]
    values = [entry.get(metric, 0) for entry in ranked]
    bar_colors = [_bar_color(v) for v in values]

    fig = go.Figure(go.Bar(
        x=values,
        y=names,
        orientation='h',
        marker_color=bar_colors,
        text=[f"{v:.1f}%" for v in values],
        textposition='outside'
    ))

    metric_label = CATEGORY_NAMES_AR.get(metric, metric) if use_arabic else metric

    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]  # small headroom so outside labels fit
        ),
        yaxis=dict(
            title=""
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Grow the figure with the number of bars, never below 400px.
        height=max(400, len(names) * 40)
    )

    return fig
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def create_category_comparison(
    leaderboard_data: List[Dict],
    models: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Category Performance Comparison"
) -> go.Figure:
    """
    Build a grouped bar chart comparing models across the main categories.

    Args:
        leaderboard_data: List of model entries with category scores.
        models: Model names to include (default: top 5 by overall score).
        use_arabic: Whether labels are rendered in Arabic.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    # Fixed set of categories shown in the comparison.
    categories = ["simple", "multiple", "parallel", "parallel_multiple",
                  "irrelevance", "dialect_handling"]

    # Default to the five best entries ranked by their overall score.
    if models is None:
        top_entries = sorted(
            leaderboard_data,
            key=lambda entry: entry.get('overall', 0),
            reverse=True
        )[:5]
        models = [entry.get('model', entry.get('name', 'Unknown'))
                  for entry in top_entries]

    # Index the selected entries by display name for O(1) lookup below.
    entries_by_name = {}
    for entry in leaderboard_data:
        display_name = entry.get('model', entry.get('name', 'Unknown'))
        if display_name in models:
            entries_by_name[display_name] = entry

    # Localized category labels for the x-axis.
    if use_arabic:
        cat_labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        cat_labels = categories

    fig = go.Figure()

    for idx, name in enumerate(models):
        entry = entries_by_name.get(name)
        if entry is None:
            # Requested model not present in the leaderboard data; skip it.
            continue
        fig.add_trace(go.Bar(
            name=name,
            x=cat_labels,
            y=[entry.get(cat, 0) for cat in categories],
            marker_color=MODEL_COLORS[idx % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الفئة" if use_arabic else "Category",
            # Tilt long Arabic labels so they do not overlap.
            tickangle=-45 if use_arabic else 0
        ),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Horizontal legend above the plot area.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        height=500
    )

    return fig
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def create_dialect_breakdown(
    model_scores: Dict[str, Dict[str, float]],
    use_arabic: bool = True,
    title: str = "Dialect Performance"
) -> go.Figure:
    """
    Build a grouped bar chart of performance across Arabic dialects.

    Args:
        model_scores: Dict mapping model names to per-dialect scores
            (keys: msa, egyptian, gulf, levantine; missing keys -> 0).
        use_arabic: Whether labels are rendered in Arabic.
        title: Chart title.

    Returns:
        Plotly Figure object.
    """
    # Fixed dialect order with localized display names.
    dialects = ["msa", "egyptian", "gulf", "levantine"]
    dialect_labels = {
        "msa": "الفصحى" if use_arabic else "MSA",
        "egyptian": "المصري" if use_arabic else "Egyptian",
        "gulf": "الخليجي" if use_arabic else "Gulf",
        "levantine": "الشامي" if use_arabic else "Levantine"
    }
    axis_labels = [dialect_labels[d] for d in dialects]

    fig = go.Figure()

    for idx, (name, per_dialect) in enumerate(model_scores.items()):
        fig.add_trace(go.Bar(
            name=name,
            x=axis_labels,
            y=[per_dialect.get(d, 0) for d in dialects],
            marker_color=MODEL_COLORS[idx % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(title="اللهجة" if use_arabic else "Dialect"),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        height=400
    )

    return fig
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def create_progress_over_time(
    history_data: List[Dict],
    models: Optional[List[str]] = None,
    title: str = "Performance Over Time"
) -> go.Figure:
    """
    Build a line chart of model performance over time.

    Args:
        history_data: List of evaluation snapshots; each dict is expected
            to carry 'date', 'model', and 'overall' fields.
        models: Optional subset of model names to plot (None = all).
        title: Chart title.

    Returns:
        Plotly Figure object (empty-titled figure when no history exists).
    """
    # No history yet: return a blank figure that still shows the title.
    if not history_data:
        placeholder = go.Figure()
        placeholder.update_layout(title=title)
        return placeholder

    frame = pd.DataFrame(history_data)

    # Restrict to the requested models when a subset is given.
    if models is not None:
        frame = frame[frame['model'].isin(models)]

    fig = px.line(
        frame,
        x='date',
        y='overall',
        color='model',
        title=title,
        labels={'overall': 'Overall Score (%)', 'date': 'Date', 'model': 'Model'}
    )

    fig.update_layout(
        yaxis=dict(range=[0, 100]),
        height=400
    )

    return fig
|
app.py
CHANGED
|
@@ -1,204 +1,10 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
|
| 7 |
-
from
|
| 8 |
-
CITATION_BUTTON_LABEL,
|
| 9 |
-
CITATION_BUTTON_TEXT,
|
| 10 |
-
EVALUATION_QUEUE_TEXT,
|
| 11 |
-
INTRODUCTION_TEXT,
|
| 12 |
-
LLM_BENCHMARKS_TEXT,
|
| 13 |
-
TITLE,
|
| 14 |
-
)
|
| 15 |
-
from src.display.css_html_js import custom_css
|
| 16 |
-
from src.display.utils import (
|
| 17 |
-
BENCHMARK_COLS,
|
| 18 |
-
COLS,
|
| 19 |
-
EVAL_COLS,
|
| 20 |
-
EVAL_TYPES,
|
| 21 |
-
AutoEvalColumn,
|
| 22 |
-
ModelType,
|
| 23 |
-
fields,
|
| 24 |
-
WeightType,
|
| 25 |
-
Precision
|
| 26 |
-
)
|
| 27 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 28 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
-
from src.submission.submit import add_new_eval
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
API.restart_space(repo_id=REPO_ID)
|
| 34 |
-
|
| 35 |
-
### Space initialisation
|
| 36 |
-
try:
|
| 37 |
-
print(EVAL_REQUESTS_PATH)
|
| 38 |
-
snapshot_download(
|
| 39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 40 |
-
)
|
| 41 |
-
except Exception:
|
| 42 |
-
restart_space()
|
| 43 |
-
try:
|
| 44 |
-
print(EVAL_RESULTS_PATH)
|
| 45 |
-
snapshot_download(
|
| 46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 47 |
-
)
|
| 48 |
-
except Exception:
|
| 49 |
-
restart_space()
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 53 |
-
|
| 54 |
-
(
|
| 55 |
-
finished_eval_queue_df,
|
| 56 |
-
running_eval_queue_df,
|
| 57 |
-
pending_eval_queue_df,
|
| 58 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 59 |
-
|
| 60 |
-
def init_leaderboard(dataframe):
|
| 61 |
-
if dataframe is None or dataframe.empty:
|
| 62 |
-
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 63 |
-
return Leaderboard(
|
| 64 |
-
value=dataframe,
|
| 65 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
| 66 |
-
select_columns=SelectColumns(
|
| 67 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
| 68 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
| 69 |
-
label="Select Columns to Display:",
|
| 70 |
-
),
|
| 71 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
| 72 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
| 73 |
-
filter_columns=[
|
| 74 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 75 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
| 76 |
-
ColumnFilter(
|
| 77 |
-
AutoEvalColumn.params.name,
|
| 78 |
-
type="slider",
|
| 79 |
-
min=0.01,
|
| 80 |
-
max=150,
|
| 81 |
-
label="Select the number of parameters (B)",
|
| 82 |
-
),
|
| 83 |
-
ColumnFilter(
|
| 84 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
| 85 |
-
),
|
| 86 |
-
],
|
| 87 |
-
bool_checkboxgroup_label="Hide models",
|
| 88 |
-
interactive=False,
|
| 89 |
-
)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
demo = gr.Blocks(css=custom_css)
|
| 93 |
-
with demo:
|
| 94 |
-
gr.HTML(TITLE)
|
| 95 |
-
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 96 |
-
|
| 97 |
-
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 98 |
-
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 99 |
-
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 100 |
-
|
| 101 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 102 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 103 |
-
|
| 104 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 105 |
-
with gr.Column():
|
| 106 |
-
with gr.Row():
|
| 107 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 108 |
-
|
| 109 |
-
with gr.Column():
|
| 110 |
-
with gr.Accordion(
|
| 111 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
| 112 |
-
open=False,
|
| 113 |
-
):
|
| 114 |
-
with gr.Row():
|
| 115 |
-
finished_eval_table = gr.components.Dataframe(
|
| 116 |
-
value=finished_eval_queue_df,
|
| 117 |
-
headers=EVAL_COLS,
|
| 118 |
-
datatype=EVAL_TYPES,
|
| 119 |
-
row_count=5,
|
| 120 |
-
)
|
| 121 |
-
with gr.Accordion(
|
| 122 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
| 123 |
-
open=False,
|
| 124 |
-
):
|
| 125 |
-
with gr.Row():
|
| 126 |
-
running_eval_table = gr.components.Dataframe(
|
| 127 |
-
value=running_eval_queue_df,
|
| 128 |
-
headers=EVAL_COLS,
|
| 129 |
-
datatype=EVAL_TYPES,
|
| 130 |
-
row_count=5,
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
with gr.Accordion(
|
| 134 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 135 |
-
open=False,
|
| 136 |
-
):
|
| 137 |
-
with gr.Row():
|
| 138 |
-
pending_eval_table = gr.components.Dataframe(
|
| 139 |
-
value=pending_eval_queue_df,
|
| 140 |
-
headers=EVAL_COLS,
|
| 141 |
-
datatype=EVAL_TYPES,
|
| 142 |
-
row_count=5,
|
| 143 |
-
)
|
| 144 |
-
with gr.Row():
|
| 145 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
| 146 |
-
|
| 147 |
-
with gr.Row():
|
| 148 |
-
with gr.Column():
|
| 149 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
| 150 |
-
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 151 |
-
model_type = gr.Dropdown(
|
| 152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 153 |
-
label="Model type",
|
| 154 |
-
multiselect=False,
|
| 155 |
-
value=None,
|
| 156 |
-
interactive=True,
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
with gr.Column():
|
| 160 |
-
precision = gr.Dropdown(
|
| 161 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 162 |
-
label="Precision",
|
| 163 |
-
multiselect=False,
|
| 164 |
-
value="float16",
|
| 165 |
-
interactive=True,
|
| 166 |
-
)
|
| 167 |
-
weight_type = gr.Dropdown(
|
| 168 |
-
choices=[i.value.name for i in WeightType],
|
| 169 |
-
label="Weights type",
|
| 170 |
-
multiselect=False,
|
| 171 |
-
value="Original",
|
| 172 |
-
interactive=True,
|
| 173 |
-
)
|
| 174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 175 |
-
|
| 176 |
-
submit_button = gr.Button("Submit Eval")
|
| 177 |
-
submission_result = gr.Markdown()
|
| 178 |
-
submit_button.click(
|
| 179 |
-
add_new_eval,
|
| 180 |
-
[
|
| 181 |
-
model_name_textbox,
|
| 182 |
-
base_model_name_textbox,
|
| 183 |
-
revision_name_textbox,
|
| 184 |
-
precision,
|
| 185 |
-
weight_type,
|
| 186 |
-
model_type,
|
| 187 |
-
],
|
| 188 |
-
submission_result,
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
with gr.Row():
|
| 192 |
-
with gr.Accordion("📙 Citation", open=False):
|
| 193 |
-
citation_button = gr.Textbox(
|
| 194 |
-
value=CITATION_BUTTON_TEXT,
|
| 195 |
-
label=CITATION_BUTTON_LABEL,
|
| 196 |
-
lines=20,
|
| 197 |
-
elem_id="citation-button",
|
| 198 |
-
show_copy_button=True,
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
scheduler = BackgroundScheduler()
|
| 202 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
-
scheduler.start()
|
| 204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Arabic Function Calling Leaderboard - HuggingFace Space Entry Point
|
| 3 |
+
"""
|
| 4 |
+
import sys
|
| 5 |
+
sys.path.insert(0, ".")
|
| 6 |
|
| 7 |
+
from afcl.app import create_app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
app = create_app()
|
| 10 |
+
app.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/leaderboard.json
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"rank": 1,
|
| 4 |
+
"model": "GPT-4o",
|
| 5 |
+
"overall": 78.5,
|
| 6 |
+
"simple": 85.2,
|
| 7 |
+
"multiple": 80.1,
|
| 8 |
+
"parallel": 75.3,
|
| 9 |
+
"parallel_multiple": 72.4,
|
| 10 |
+
"irrelevance": 82.0,
|
| 11 |
+
"dialect_handling": 70.5,
|
| 12 |
+
"java": 76.8,
|
| 13 |
+
"javascript": 74.2,
|
| 14 |
+
"rest": 79.5,
|
| 15 |
+
"sql": 77.3
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"rank": 2,
|
| 19 |
+
"model": "Claude 3.5 Sonnet",
|
| 20 |
+
"overall": 76.2,
|
| 21 |
+
"simple": 83.5,
|
| 22 |
+
"multiple": 78.8,
|
| 23 |
+
"parallel": 73.2,
|
| 24 |
+
"parallel_multiple": 70.1,
|
| 25 |
+
"irrelevance": 80.5,
|
| 26 |
+
"dialect_handling": 68.2,
|
| 27 |
+
"java": 75.2,
|
| 28 |
+
"javascript": 72.8,
|
| 29 |
+
"rest": 78.2,
|
| 30 |
+
"sql": 76.5
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"rank": 3,
|
| 34 |
+
"model": "Qwen2.5-72B",
|
| 35 |
+
"overall": 74.1,
|
| 36 |
+
"simple": 80.5,
|
| 37 |
+
"multiple": 76.2,
|
| 38 |
+
"parallel": 72.5,
|
| 39 |
+
"parallel_multiple": 69.8,
|
| 40 |
+
"irrelevance": 77.5,
|
| 41 |
+
"dialect_handling": 65.2,
|
| 42 |
+
"java": 72.5,
|
| 43 |
+
"javascript": 70.8,
|
| 44 |
+
"rest": 75.2,
|
| 45 |
+
"sql": 73.8
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"rank": 4,
|
| 49 |
+
"model": "Jais-30B",
|
| 50 |
+
"overall": 72.8,
|
| 51 |
+
"simple": 78.5,
|
| 52 |
+
"multiple": 74.2,
|
| 53 |
+
"parallel": 70.8,
|
| 54 |
+
"parallel_multiple": 68.5,
|
| 55 |
+
"irrelevance": 75.2,
|
| 56 |
+
"dialect_handling": 72.0,
|
| 57 |
+
"java": 68.5,
|
| 58 |
+
"javascript": 66.2,
|
| 59 |
+
"rest": 71.8,
|
| 60 |
+
"sql": 69.5
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"rank": 5,
|
| 64 |
+
"model": "Llama-3.1-70B",
|
| 65 |
+
"overall": 71.5,
|
| 66 |
+
"simple": 78.2,
|
| 67 |
+
"multiple": 73.5,
|
| 68 |
+
"parallel": 69.8,
|
| 69 |
+
"parallel_multiple": 66.2,
|
| 70 |
+
"irrelevance": 74.5,
|
| 71 |
+
"dialect_handling": 62.5,
|
| 72 |
+
"java": 70.2,
|
| 73 |
+
"javascript": 68.5,
|
| 74 |
+
"rest": 73.5,
|
| 75 |
+
"sql": 71.2
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"rank": 6,
|
| 79 |
+
"model": "ALLaM-7B",
|
| 80 |
+
"overall": 68.5,
|
| 81 |
+
"simple": 75.2,
|
| 82 |
+
"multiple": 70.5,
|
| 83 |
+
"parallel": 65.8,
|
| 84 |
+
"parallel_multiple": 62.3,
|
| 85 |
+
"irrelevance": 70.8,
|
| 86 |
+
"dialect_handling": 68.5,
|
| 87 |
+
"java": 62.5,
|
| 88 |
+
"javascript": 60.2,
|
| 89 |
+
"rest": 66.8,
|
| 90 |
+
"sql": 64.5
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"rank": 7,
|
| 94 |
+
"model": "SILMA-9B",
|
| 95 |
+
"overall": 65.2,
|
| 96 |
+
"simple": 72.8,
|
| 97 |
+
"multiple": 68.5,
|
| 98 |
+
"parallel": 62.1,
|
| 99 |
+
"parallel_multiple": 58.5,
|
| 100 |
+
"irrelevance": 68.2,
|
| 101 |
+
"dialect_handling": 62.8,
|
| 102 |
+
"java": 58.5,
|
| 103 |
+
"javascript": 56.2,
|
| 104 |
+
"rest": 63.2,
|
| 105 |
+
"sql": 60.8
|
| 106 |
+
}
|
| 107 |
+
]
|
requirements.txt
CHANGED
|
@@ -1,16 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
gradio_leaderboard==0.0.13
|
| 7 |
-
gradio_client
|
| 8 |
-
huggingface-hub>=0.18.0
|
| 9 |
-
matplotlib
|
| 10 |
-
numpy
|
| 11 |
-
pandas
|
| 12 |
-
python-dateutil
|
| 13 |
-
tqdm
|
| 14 |
-
transformers
|
| 15 |
-
tokenizers>=0.15.0
|
| 16 |
-
sentencepiece
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
datasets>=2.14.0
|
| 3 |
+
huggingface_hub>=0.19.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
plotly>=5.18.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|