HeshamHaroon committed on
Commit
a5f8ac7
·
verified ·
1 Parent(s): 6add5d0

Update: Auto-evaluation on Space startup

Browse files
Files changed (4) hide show
  1. README.md +10 -9
  2. afcl/app.py +274 -290
  3. app.py +3 -3
  4. requirements.txt +1 -1
README.md CHANGED
@@ -29,22 +29,23 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
29
  4. Handle parallel and complex function calls
30
  5. Detect when no function should be called
31
 
 
 
 
 
 
32
  ## Dataset
33
 
34
- The benchmark includes **1,470+ samples** across 10 categories:
 
 
35
  - Simple, Multiple, Parallel, Parallel Multiple
36
  - Irrelevance Detection
37
  - Dialect Handling (Egyptian, Gulf, Levantine)
38
- - Programming APIs (Java, JavaScript, REST, SQL)
39
-
40
- 📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
41
 
42
- ## Submit Your Model
43
 
44
- To submit your model for evaluation:
45
- 1. Go to the "Submit" tab
46
- 2. Fill in your model details
47
- 3. Your model will be added to the evaluation queue
48
 
49
  ## Citation
50
 
 
29
  4. Handle parallel and complex function calls
30
  5. Detect when no function should be called
31
 
32
+ ## Models Evaluated
33
+
34
+ - **Arabic-Native**: Jais, ALLaM, SILMA, AceGPT
35
+ - **Multilingual**: Qwen, Llama, Gemma, Mistral, Phi, BLOOMZ, Aya
36
+
37
  ## Dataset
38
 
39
+ 📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
40
+
41
+ - **147 test samples** across 10 categories
42
  - Simple, Multiple, Parallel, Parallel Multiple
43
  - Irrelevance Detection
44
  - Dialect Handling (Egyptian, Gulf, Levantine)
 
 
 
45
 
46
+ ## Evaluation
47
 
48
+ The leaderboard automatically evaluates models using the HuggingFace Inference API when the Space starts.
 
 
 
49
 
50
  ## Citation
51
 
afcl/app.py CHANGED
@@ -2,21 +2,22 @@
2
  Arabic Function Calling Leaderboard (AFCL)
3
  ==========================================
4
 
5
- A Gradio-based leaderboard for evaluating LLMs on Arabic function calling.
 
6
  """
7
 
8
  import gradio as gr
9
  import pandas as pd
10
  import json
11
  import os
 
 
 
12
  from pathlib import Path
13
  from typing import Dict, List, Optional
14
-
15
- # Local imports
16
- from .data.loader import (
17
- load_leaderboard, save_leaderboard, load_benchmark,
18
- calculate_overall_score, CATEGORY_WEIGHTS
19
- )
20
 
21
  # Constants
22
  TITLE = "🏆 Arabic Function Calling Leaderboard"
@@ -28,330 +29,313 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
28
  **لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
29
  """
30
 
31
- # Column definitions
32
- LEADERBOARD_COLUMNS = {
33
- "rank": {"label": "#", "label_en": "#", "type": "number"},
34
- "model": {"label": "النموذج", "label_en": "Model", "type": "str"},
35
- "organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
36
- "overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
37
- "simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
38
- "multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
39
- "parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
40
- "parallel_multiple": {"label": "متوازي متعدد", "label_en": "P. Multiple", "type": "number"},
41
- "irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
42
- "dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
43
- "status": {"label": "الحالة", "label_en": "Status", "type": "str"},
44
- }
45
-
46
- # Empty sample - will load from file
47
- SAMPLE_LEADERBOARD = []
48
-
49
-
50
- def get_leaderboard_data() -> List[Dict]:
51
- """Load leaderboard data from file or return sample data."""
 
52
  try:
53
- data = load_leaderboard("data/leaderboard.json")
54
- if data:
55
- return data
56
- except Exception:
57
- pass
58
- return SAMPLE_LEADERBOARD
59
-
60
-
61
- def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
62
- """Convert leaderboard data to pandas DataFrame."""
63
- if not data:
64
- return pd.DataFrame()
65
-
66
- df = pd.DataFrame(data)
67
-
68
- # Select columns to display (fewer columns for cleaner view)
69
- display_cols = ["rank", "model", "organization", "overall", "status"]
70
- df = df[[c for c in display_cols if c in df.columns]]
71
-
72
- # Rename columns based on language preference
73
- column_mapping = {}
74
- for col, info in LEADERBOARD_COLUMNS.items():
75
- if col in df.columns:
76
- label = info["label"] if use_arabic else info["label_en"]
77
- column_mapping[col] = label
78
-
79
- df = df.rename(columns=column_mapping)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # Format numeric columns (show as percentage, but mark 0.0 as "-")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  for col in df.columns:
83
- if df[col].dtype in ['float64', 'float32']:
84
- df[col] = df[col].apply(lambda x: "-" if x == 0.0 else f"{x:.1f}%")
85
-
86
- # Format status column
87
- status_col = "الحالة" if use_arabic else "Status"
88
- if status_col in df.columns:
89
- df[status_col] = df[status_col].apply(
90
- lambda x: "⏳ قيد الانتظار" if x == "pending" else "✅ مكتمل"
91
- if use_arabic else "⏳ Pending" if x == "pending" else "✅ Done"
92
- )
93
 
94
  return df
95
 
96
 
97
- def create_models_list_tab():
98
- """Create the models list tab showing all models to be evaluated."""
99
- data = get_leaderboard_data()
100
-
101
- # Group by organization
102
- orgs = {}
103
- for entry in data:
104
- org = entry.get("organization", "Other")
105
- if org not in orgs:
106
- orgs[org] = []
107
- orgs[org].append(entry)
108
-
109
- # Create markdown content
110
- md_content = """
111
- ## 📋 Models Queue | قائمة النماذج للتقييم
112
-
113
- The following **{total}** models are queued for evaluation on the Arabic Function Calling benchmark:
114
-
115
- النماذج التالية (**{total}** نموذج) في قائمة الانتظار للتقييم:
116
-
117
- ---
118
-
119
- """.format(total=len(data))
120
-
121
- for org, models in sorted(orgs.items()):
122
- md_content += f"### {org}\n"
123
- for m in models:
124
- model_url = m.get("model_url", "#")
125
- md_content += f"- [{m['model']}]({model_url}) - ⏳ Pending\n"
126
- md_content += "\n"
127
-
128
- return gr.Markdown(md_content)
129
-
130
-
131
- def create_submit_tab():
132
- """Create the model submission tab."""
133
- with gr.Column():
134
- gr.Markdown("""
135
- ## 📤 Submit Your Model | أرسل نموذجك
136
-
137
- To submit a model for evaluation, provide the following information:
138
-
139
- لإرسال نموذج للتقييم، قدم المعلومات التالية:
140
- """)
141
-
142
- with gr.Row():
143
- model_name = gr.Textbox(
144
- label="Model Name | اسم النموذج",
145
- placeholder="e.g., my-arabic-llm-7b"
146
- )
147
- model_type = gr.Dropdown(
148
- label="Model Type | نوع النموذج",
149
- choices=["HuggingFace Hub", "API Endpoint", "Local Model"],
150
- value="HuggingFace Hub"
151
- )
152
-
153
- model_path = gr.Textbox(
154
- label="Model Path/Endpoint | مسار النموذج",
155
- placeholder="e.g., organization/model-name or https://api.example.com/v1"
156
- )
157
-
158
- precision = gr.Dropdown(
159
- label="Precision | الدقة",
160
- choices=["float16", "bfloat16", "float32", "int8", "int4"],
161
- value="float16"
162
- )
163
-
164
- with gr.Row():
165
- base_model = gr.Textbox(
166
- label="Base Model (if fine-tuned) | النموذج الأساسي",
167
- placeholder="e.g., meta-llama/Llama-2-7b"
168
- )
169
- license_type = gr.Dropdown(
170
- label="License | الرخصة",
171
- choices=["Apache-2.0", "MIT", "CC-BY-4.0", "Llama 2", "Other"],
172
- value="Apache-2.0"
173
- )
174
-
175
- submit_btn = gr.Button("Submit for Evaluation | أرسل للتقييم", variant="primary")
176
-
177
- result_text = gr.Markdown("")
178
-
179
- def handle_submission(name, mtype, path, prec, base, lic):
180
- if not name or not path:
181
- return "❌ Please fill in the required fields | يرجى ملء الحقول المطلوبة"
182
- return f"""
183
- ✅ **Submission Received | تم استلام الطلب**
184
-
185
- - Model: {name}
186
- - Type: {mtype}
187
- - Path: {path}
188
-
189
- Your model will be evaluated and added to the leaderboard soon.
190
-
191
- سيتم تقييم نموذجك وإضافته إلى لوحة التقييم قريباً.
192
- """
193
-
194
- submit_btn.click(
195
- fn=handle_submission,
196
- inputs=[model_name, model_type, model_path, precision, base_model, license_type],
197
- outputs=result_text
198
- )
199
-
200
-
201
- def create_about_tab():
202
- """Create the about/methodology tab."""
203
- return gr.Markdown("""
204
- # About AFCL | عن لوحة التقييم
205
-
206
- ## Evaluation Categories | فئات التقييم
207
-
208
- | Category | الفئة | Samples | Description |
209
- |----------|-------|---------|-------------|
210
- | Simple | بسيط | 200 | Single function, single call |
211
- | Multiple | متعدد | 200 | Select correct function from options |
212
- | Parallel | متوازي | 200 | Multiple calls of same function |
213
- | Parallel Multiple | متوازي متعدد | 200 | Multiple functions, multiple calls |
214
- | Irrelevance | اللا صلة | 200 | No function should be called |
215
- | Dialect Handling | اللهجات | 150 | Egyptian/Gulf/Levantine queries |
216
- | Java | جافا | 100 | Java API function calls |
217
- | JavaScript | جافاسكريبت | 50 | JS function calls |
218
- | REST | REST | 70 | REST API calls |
219
- | SQL | SQL | 100 | SQL query generation |
220
-
221
- **Total: 1,470 samples**
222
-
223
- ## Scoring Formula | معادلة التقييم
224
-
225
- ```
226
- Overall Score = Σ (category_score × weight)
227
- ```
228
-
229
- **Weights | الأوزان:**
230
- - Simple: 15%
231
- - Multiple: 10%
232
- - Parallel: 10%
233
- - Parallel Multiple: 10%
234
- - Irrelevance: 15%
235
- - Dialect Handling: 15%
236
- - Multi-Turn: 15%
237
- - Native Arabic: 10%
238
-
239
- ## Dataset | مجموعة البيانات
240
-
241
- 📊 **[HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)**
242
-
243
- - **Total Samples**: 1,470
244
- - **Languages**: Arabic (MSA + Dialects) & English
245
- - **Categories**: 10 evaluation categories
246
- - **Source**: Translated from BFCL with dialect variants
247
-
248
- ## Citation | الاقتباس
249
-
250
- ```bibtex
251
- @misc{afcl2024,
252
- title={Arabic Function Calling Leaderboard},
253
- author={Hesham Haroon},
254
- year={2024},
255
- url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
256
- }
257
- ```
258
- """)
259
-
260
-
261
  def create_app():
262
- """Create the main Gradio application."""
263
- # Load CSS
264
- css_path = Path(__file__).parent / "static" / "styles.css"
265
- custom_css = ""
266
- if css_path.exists():
267
- with open(css_path, "r") as f:
268
- custom_css = f.read()
269
-
270
- with gr.Blocks(
271
- title="Arabic Function Calling Leaderboard",
272
- css=custom_css,
273
- theme=gr.themes.Soft()
274
- ) as app:
275
- # Header
276
  gr.Markdown(f"""
277
  <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
278
- <h1 style="font-size: 2rem; margin-bottom: 10px;">{TITLE_AR}</h1>
279
- <h2 style="font-size: 1.5rem; margin-bottom: 10px;">{TITLE}</h2>
280
- <p style="opacity: 0.9;">Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
281
  </div>
282
  """)
283
 
284
  gr.Markdown(DESCRIPTION)
285
 
286
- # Stats row
287
- data = get_leaderboard_data()
288
- evaluated = len([d for d in data if d.get("status") != "pending"])
289
- pending = len([d for d in data if d.get("status") == "pending"])
290
-
291
  with gr.Row():
292
  gr.Markdown(f"""
293
  <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
294
- <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
295
- <div style="color: #666;">Total Models | إجمالي النماذج</div>
296
  </div>
297
  """)
298
- gr.Markdown(f"""
299
- <div style="text-align: center; padding: 15px; background: #fff3cd; border-radius: 8px;">
300
- <div style="font-size: 2rem; font-weight: bold; color: #856404;">{pending}</div>
301
- <div style="color: #856404;">⏳ Pending | قيد الانتظار</div>
302
  </div>
303
  """)
304
  gr.Markdown("""
305
  <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
306
- <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">1,470</div>
307
- <div style="color: #666;">Test Samples | عينات الاختبار</div>
308
  </div>
309
  """)
310
 
311
- # Notice about pending evaluation
312
- if pending > 0:
313
- gr.Markdown(f"""
314
- <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 8px; margin: 15px 0;">
315
- ⏳ <strong>Evaluation in Progress | التقييم قيد التنفيذ</strong><br>
316
- {pending} models are waiting to be evaluated. Results will be updated as evaluations complete.<br>
317
- {pending} نموذج في انتظار التقييم. سيتم تحديث النتائج فور اكتمال التقييم.
318
- </div>
319
- """)
320
 
321
- # Tabs
322
  with gr.Tabs():
323
- with gr.TabItem("🏆 Leaderboard | لوحة التقييم"):
324
- df = format_leaderboard_dataframe(data, use_arabic=True)
325
- gr.DataFrame(
326
- value=df,
327
- interactive=False,
328
- wrap=True,
329
  )
330
 
331
- with gr.TabItem("📋 Models | النماذج"):
332
- create_models_list_tab()
 
 
 
 
 
 
 
333
 
334
- with gr.TabItem("📤 Submit | إرسال"):
335
- create_submit_tab()
 
 
 
 
 
 
336
 
337
- with gr.TabItem("ℹ️ About | عن المشروع"):
338
- create_about_tab()
 
339
 
340
- # Footer
341
  gr.Markdown("""
342
  ---
343
- <div style="text-align: center; color: #666; padding: 20px;">
344
- Built with ❤️ for the Arabic NLP community | بُني بحب لمجتمع معالجة اللغة العربية
345
- <br>
346
- <a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling">Dataset</a> |
347
- <a href="https://github.com/HeshamHaroon">GitHub</a>
348
  </div>
349
  """)
350
 
 
 
 
 
351
  return app
352
 
353
 
354
- # Main entry point
 
355
  if __name__ == "__main__":
356
- app = create_app()
357
  app.launch()
 
2
  Arabic Function Calling Leaderboard (AFCL)
3
  ==========================================
4
 
5
+ A Gradio-based leaderboard that evaluates LLMs on Arabic function calling.
6
+ Evaluation runs on HuggingFace Space infrastructure.
7
  """
8
 
9
  import gradio as gr
10
  import pandas as pd
11
  import json
12
  import os
13
+ import re
14
+ import time
15
+ import requests
16
  from pathlib import Path
17
  from typing import Dict, List, Optional
18
+ from threading import Thread
19
+ from datasets import load_dataset
20
+ import huggingface_hub
 
 
 
21
 
22
  # Constants
23
  TITLE = "🏆 Arabic Function Calling Leaderboard"
 
29
  **لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
30
  """
31
 
32
+ # Models to evaluate
33
+ MODELS_TO_EVALUATE = [
34
+ {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
35
+ {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
36
+ {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
37
+ {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
38
+ {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
39
+ {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
40
+ {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
41
+ {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
42
+ {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
43
+ {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
44
+ {"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
45
+ ]
46
+
47
+ # Global state
48
+ LEADERBOARD_DATA = []
49
+ EVALUATION_STATUS = "Not started"
50
+
51
+
52
+ def load_evaluation_dataset():
53
+ """Load the Arabic FC dataset from HuggingFace."""
54
  try:
55
+ dataset = load_dataset("HeshamHaroon/Arabic_Function_Calling", split="test")
56
+ samples = []
57
+ for item in dataset:
58
+ sample = {
59
+ 'id': item['id'],
60
+ 'query_ar': item['query_ar'],
61
+ 'functions': json.loads(item['functions']) if item['functions'] else [],
62
+ 'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
63
+ 'category': item['category'],
64
+ }
65
+ samples.append(sample)
66
+ return samples
67
+ except Exception as e:
68
+ print(f"Error loading dataset: {e}")
69
+ return []
70
+
71
+
72
+ def create_prompt(query: str, functions: List[Dict]) -> str:
73
+ """Create evaluation prompt."""
74
+ func_desc = "You are a function calling AI. Given the user query and available functions, respond with a JSON function call.\n\nAvailable functions:\n"
75
+ for f in functions:
76
+ func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
77
+
78
+ return f"""{func_desc}
79
+
80
+ User Query (Arabic): {query}
81
+
82
+ Respond ONLY with a JSON object:
83
+ {{"name": "function_name", "arguments": {{"param1": "value1"}}}}
84
+
85
+ If no function should be called:
86
+ {{"name": null, "arguments": {{}}}}
87
+
88
+ JSON Response:"""
89
+
90
+
91
+ def call_model(model_id: str, prompt: str) -> str:
92
+ """Call model via HuggingFace Inference API."""
93
+ token = os.getenv("HF_TOKEN", "")
94
+ headers = {"Authorization": f"Bearer {token}"}
95
+ url = f"https://api-inference.huggingface.co/models/{model_id}"
96
+
97
+ payload = {
98
+ "inputs": prompt,
99
+ "parameters": {"max_new_tokens": 200, "temperature": 0.1}
100
+ }
101
 
102
+ try:
103
+ response = requests.post(url, headers=headers, json=payload, timeout=60)
104
+ if response.status_code == 503:
105
+ time.sleep(20)
106
+ response = requests.post(url, headers=headers, json=payload, timeout=60)
107
+
108
+ result = response.json()
109
+ if isinstance(result, list) and result:
110
+ return result[0].get("generated_text", "")
111
+ return str(result)
112
+ except:
113
+ return ""
114
+
115
+
116
+ def parse_response(response: str) -> Optional[Dict]:
117
+ """Parse function call from response."""
118
+ if not response:
119
+ return None
120
+ try:
121
+ return json.loads(response.strip())
122
+ except:
123
+ pass
124
+ match = re.search(r'\{[^{}]*"name"[^{}]*\}', response)
125
+ if match:
126
+ try:
127
+ return json.loads(match.group())
128
+ except:
129
+ pass
130
+ if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
131
+ return {"name": None}
132
+ return None
133
+
134
+
135
+ def evaluate_sample(model_id: str, sample: Dict) -> float:
136
+ """Evaluate single sample."""
137
+ query = sample.get('query_ar', '')
138
+ functions = sample.get('functions', [])
139
+ category = sample.get('category', '')
140
+ ground_truth = sample.get('ground_truth')
141
+
142
+ prompt = create_prompt(query, functions)
143
+ response = call_model(model_id, prompt)
144
+ parsed = parse_response(response)
145
+
146
+ if category == 'irrelevance':
147
+ return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0
148
+
149
+ if not ground_truth or not parsed:
150
+ return 0.0
151
+
152
+ expected = ground_truth.get('calls', [ground_truth])[0] if isinstance(ground_truth, dict) else ground_truth
153
+
154
+ if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
155
+ return 0.0
156
+
157
+ pred_args = parsed.get('arguments', {})
158
+ exp_args = expected.get('arguments', {})
159
+ if not exp_args:
160
+ return 1.0
161
+
162
+ matched = sum(1 for k, v in exp_args.items() if str(pred_args.get(k, '')).lower() == str(v).lower())
163
+ return matched / len(exp_args)
164
+
165
+
166
+ def run_evaluation():
167
+ """Run full evaluation on all models."""
168
+ global LEADERBOARD_DATA, EVALUATION_STATUS
169
+
170
+ EVALUATION_STATUS = "Loading dataset..."
171
+ samples = load_evaluation_dataset()
172
+
173
+ if not samples:
174
+ EVALUATION_STATUS = "Failed to load dataset"
175
+ return
176
+
177
+ results = []
178
+ total_models = len(MODELS_TO_EVALUATE)
179
+
180
+ for idx, model_config in enumerate(MODELS_TO_EVALUATE):
181
+ model_name = model_config['model']
182
+ model_id = model_config['model_id']
183
+
184
+ EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."
185
+
186
+ category_scores = {}
187
+ category_counts = {}
188
+
189
+ for sample in samples:
190
+ cat = sample.get('category', 'simple')
191
+ if cat not in category_scores:
192
+ category_scores[cat] = 0.0
193
+ category_counts[cat] = 0
194
+
195
+ try:
196
+ score = evaluate_sample(model_id, sample)
197
+ category_scores[cat] += score
198
+ except:
199
+ pass
200
+ category_counts[cat] += 1
201
+ time.sleep(0.5) # Rate limiting
202
+
203
+ # Calculate scores
204
+ scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
205
+ for cat in category_scores if category_counts[cat] > 0}
206
+
207
+ # Weighted overall
208
+ weights = {"simple": 0.15, "multiple": 0.10, "parallel": 0.10,
209
+ "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
210
+ overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
211
+
212
+ results.append({
213
+ "model": model_name,
214
+ "model_id": model_id,
215
+ "organization": model_config['organization'],
216
+ "overall": round(overall, 1),
217
+ "simple": scores.get('simple', 0),
218
+ "multiple": scores.get('multiple', 0),
219
+ "parallel": scores.get('parallel', 0),
220
+ "parallel_multiple": scores.get('parallel_multiple', 0),
221
+ "irrelevance": scores.get('irrelevance', 0),
222
+ "dialect_handling": scores.get('dialect_handling', 0),
223
+ "status": "completed"
224
+ })
225
+
226
+ # Sort and rank
227
+ results = sorted(results, key=lambda x: x['overall'], reverse=True)
228
+ for i, r in enumerate(results, 1):
229
+ r['rank'] = i
230
+
231
+ LEADERBOARD_DATA = results
232
+ EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
233
+
234
+
235
+ def get_leaderboard_df():
236
+ """Get leaderboard as DataFrame."""
237
+ if not LEADERBOARD_DATA:
238
+ # Return empty with pending status
239
+ data = [{"rank": i+1, "model": m["model"], "organization": m["organization"],
240
+ "overall": "-", "status": "⏳ Pending"}
241
+ for i, m in enumerate(MODELS_TO_EVALUATE)]
242
+ return pd.DataFrame(data)
243
+
244
+ df = pd.DataFrame(LEADERBOARD_DATA)
245
+ cols = ["rank", "model", "organization", "overall", "simple", "multiple",
246
+ "parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
247
+ df = df[[c for c in cols if c in df.columns]]
248
+
249
+ # Format percentages
250
  for col in df.columns:
251
+ if df[col].dtype in ['float64', 'float32', 'int64']:
252
+ if col != 'rank':
253
+ df[col] = df[col].apply(lambda x: f"{x:.1f}%")
 
 
 
 
 
 
 
254
 
255
  return df
256
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def create_app():
259
+ """Create the Gradio app."""
260
+ with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:
261
+
 
 
 
 
 
 
 
 
 
 
 
262
  gr.Markdown(f"""
263
  <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
264
+ <h1>{TITLE_AR}</h1>
265
+ <h2>{TITLE}</h2>
266
+ <p>Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
267
  </div>
268
  """)
269
 
270
  gr.Markdown(DESCRIPTION)
271
 
 
 
 
 
 
272
  with gr.Row():
273
  gr.Markdown(f"""
274
  <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
275
+ <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
276
+ <div>Models | النماذج</div>
277
  </div>
278
  """)
279
+ gr.Markdown("""
280
+ <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
281
+ <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
282
+ <div>Test Samples | عينات</div>
283
  </div>
284
  """)
285
  gr.Markdown("""
286
  <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
287
+ <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
288
+ <div>Categories | الفئات</div>
289
  </div>
290
  """)
291
 
292
+ status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}")
 
 
 
 
 
 
 
 
293
 
 
294
  with gr.Tabs():
295
+ with gr.TabItem("🏆 Leaderboard"):
296
+ leaderboard_df = gr.DataFrame(
297
+ value=get_leaderboard_df(),
298
+ interactive=False
 
 
299
  )
300
 
301
+ def refresh_leaderboard():
302
+ return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"
303
+
304
+ refresh_btn = gr.Button("🔄 Refresh | تحديث")
305
+ refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])
306
+
307
+ with gr.TabItem("📊 About"):
308
+ gr.Markdown("""
309
+ ## Evaluation Categories
310
 
311
+ | Category | Samples | Description |
312
+ |----------|---------|-------------|
313
+ | Simple | ~20 | Single function call |
314
+ | Multiple | ~20 | Select from multiple functions |
315
+ | Parallel | ~20 | Multiple calls |
316
+ | Parallel Multiple | ~20 | Complex multi-call |
317
+ | Irrelevance | ~20 | Should not call |
318
+ | Dialect | ~15 | Egyptian/Gulf/Levantine |
319
 
320
+ ## Dataset
321
+ 📊 [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
322
+ """)
323
 
 
324
  gr.Markdown("""
325
  ---
326
+ <div style="text-align: center; color: #666;">
327
+ Built for the Arabic NLP community | بُني لمجتمع معالجة اللغة العربية
 
 
 
328
  </div>
329
  """)
330
 
331
+ # Start evaluation in background
332
+ if not LEADERBOARD_DATA:
333
+ Thread(target=run_evaluation, daemon=True).start()
334
+
335
  return app
336
 
337
 
338
+ app = create_app()
339
+
340
  if __name__ == "__main__":
 
341
  app.launch()
app.py CHANGED
@@ -4,7 +4,7 @@ Arabic Function Calling Leaderboard - HuggingFace Space Entry Point
4
  import sys
5
  sys.path.insert(0, ".")
6
 
7
- from afcl.app import create_app
8
 
9
- app = create_app()
10
- app.launch()
 
4
  import sys
5
  sys.path.insert(0, ".")
6
 
7
+ from afcl.app import app
8
 
9
+ if __name__ == "__main__":
10
+ app.launch()
requirements.txt CHANGED
@@ -2,4 +2,4 @@ gradio==4.44.0
2
  huggingface_hub==0.25.0
3
  datasets>=2.14.0
4
  pandas>=2.0.0
5
- plotly>=5.18.0
 
2
  huggingface_hub==0.25.0
3
  datasets>=2.14.0
4
  pandas>=2.0.0
5
+ requests>=2.28.0