Update: Auto-evaluation on Space startup
- README.md +1 -1
- afcl/app.py +104 -8
README.md CHANGED

@@ -38,7 +38,7 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
 
 📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
 
-- **
+- **1,470 total samples** across 10 categories
 - Simple, Multiple, Parallel, Parallel Multiple
 - Irrelevance Detection
 - Dialect Handling (Egyptian, Gulf, Levantine)
afcl/app.py CHANGED

@@ -68,6 +68,67 @@ MODELS_TO_EVALUATE = [
 # Global state
 LEADERBOARD_DATA = []
 EVALUATION_STATUS = {"current": "Initializing...", "progress": 0, "total": len(MODELS_TO_EVALUATE)}
+RESULTS_DATASET_REPO = "HeshamHaroon/AFCL-Results"  # HuggingFace dataset for persistent storage
+
+
+def load_cached_results() -> List[Dict]:
+    """Load cached evaluation results from HuggingFace dataset."""
+    try:
+        from huggingface_hub import hf_hub_download
+        # Try to download the results file from HF
+        file_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_REPO,
+            filename="results.json",
+            repo_type="dataset",
+            token=os.getenv("HF_TOKEN")
+        )
+        with open(file_path, 'r', encoding='utf-8') as f:
+            cached = json.load(f)
+        print(f"✅ Loaded {len(cached)} cached results from HuggingFace")
+        return cached
+    except Exception as e:
+        print(f"No cached results found (will evaluate all models): {e}")
+        return []
+
+
+def save_cached_results(results: List[Dict]):
+    """Save evaluation results to HuggingFace dataset for persistence."""
+    try:
+        from huggingface_hub import HfApi, create_repo
+        import tempfile
+
+        api = HfApi()
+        token = os.getenv("HF_TOKEN")
+
+        # Create the dataset repo if it doesn't exist
+        try:
+            create_repo(
+                repo_id=RESULTS_DATASET_REPO,
+                repo_type="dataset",
+                exist_ok=True,
+                token=token
+            )
+        except:
+            pass
+
+        # Save results to a temp file and upload
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            temp_path = f.name
+
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo="results.json",
+            repo_id=RESULTS_DATASET_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update results ({len(results)} models)"
+        )
+
+        os.unlink(temp_path)
+        print(f"✅ Saved {len(results)} results to HuggingFace dataset")
+    except Exception as e:
+        print(f"⚠️ Error saving to HuggingFace (results may not persist): {e}")
 
 # Custom CSS for professional look
 CUSTOM_CSS = """
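The two helpers above persist the leaderboard as a single `results.json` in the `HeshamHaroon/AFCL-Results` dataset repo. For context, a minimal sketch of reading that cache back outside the app, assuming the repo exists and `HF_TOKEN` (if needed) grants access; the field names are taken from the result dicts built later in `run_evaluation`:

```python
# Standalone sanity check of the persisted cache (not part of app.py):
# download results.json from the results dataset and print the stored rows.
import json
import os

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="HeshamHaroon/AFCL-Results",  # same repo the app writes to
    filename="results.json",
    repo_type="dataset",
    token=os.getenv("HF_TOKEN"),          # only needed for private repos
)
with open(path, encoding="utf-8") as f:
    rows = json.load(f)

for row in sorted(rows, key=lambda r: r["overall"], reverse=True):
    print(f"{row['model']} ({row['organization']}): {row['overall']:.3f}")
```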
@@ -459,25 +520,55 @@ def evaluate_sample(model_id: str, sample: Dict) -> float:
 
 
 def run_evaluation():
-    """Run
+    """Run evaluation only on new models (uses cache for existing results)."""
     global LEADERBOARD_DATA, EVALUATION_STATUS
 
-
+    # Step 1: Load cached results first
+    EVALUATION_STATUS["current"] = "Loading cached results..."
+    cached_results = load_cached_results()
+
+    # Build set of already evaluated model IDs
+    evaluated_models = {r['model_id'] for r in cached_results}
+    print(f"Already evaluated: {len(evaluated_models)} models")
+
+    # Step 2: Check which models need evaluation
+    models_to_run = [m for m in MODELS_TO_EVALUATE if m['model_id'] not in evaluated_models]
+
+    if not models_to_run:
+        # All models already evaluated - just use cache
+        EVALUATION_STATUS["current"] = "All models evaluated (from cache)"
+        EVALUATION_STATUS["progress"] = len(MODELS_TO_EVALUATE)
+        LEADERBOARD_DATA = sorted(cached_results, key=lambda x: x['overall'], reverse=True)
+        for i, r in enumerate(LEADERBOARD_DATA, 1):
+            r['rank'] = i
+        print("All models loaded from cache - no new evaluation needed")
+        return
+
+    # Step 3: Load dataset only if we need to evaluate new models
+    EVALUATION_STATUS["current"] = f"Loading dataset ({len(models_to_run)} new models to evaluate)..."
     samples = load_evaluation_dataset()
 
     if not samples:
         EVALUATION_STATUS["current"] = "Failed to load dataset"
+        # Still show cached results
+        if cached_results:
+            LEADERBOARD_DATA = sorted(cached_results, key=lambda x: x['overall'], reverse=True)
+            for i, r in enumerate(LEADERBOARD_DATA, 1):
+                r['rank'] = i
         return
 
-
+    # Start with cached results
+    results = list(cached_results)
     total_models = len(MODELS_TO_EVALUATE)
 
-
+    # Step 4: Evaluate only new models
+    for idx, model_config in enumerate(models_to_run):
         model_name = model_config['model']
         model_id = model_config['model_id']
 
-
-        EVALUATION_STATUS["
+        evaluated_count = len(evaluated_models) + idx + 1
+        EVALUATION_STATUS["current"] = f"Evaluating {model_name}... ({evaluated_count}/{total_models})"
+        EVALUATION_STATUS["progress"] = evaluated_count
 
         category_scores = {}
         category_counts = {}
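The Step 1–4 flow above boils down to a set difference on `model_id`. A toy illustration with made-up data (the variable names mirror the app's, the values are not real results):

```python
# Only models whose model_id is absent from the cached results get re-evaluated.
cached_results = [{"model_id": "org/model-a", "overall": 0.71}]          # pretend cache
MODELS_TO_EVALUATE = [{"model": "Model A", "model_id": "org/model-a"},
                      {"model": "Model B", "model_id": "org/model-b"}]   # pretend config

evaluated_models = {r["model_id"] for r in cached_results}
models_to_run = [m for m in MODELS_TO_EVALUATE if m["model_id"] not in evaluated_models]
print([m["model_id"] for m in models_to_run])  # ['org/model-b']
```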
@@ -505,7 +596,7 @@ def run_evaluation():
                    "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
         overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
 
-
+        new_result = {
             "model": model_name,
             "model_id": model_id,
             "organization": model_config['organization'],
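The `overall` line in the hunk above is a weighted average of the per-category accuracies, normalized by the total weight so that a missing category (`scores.get(c, 0)`) simply drags the average down. Only three weights are visible in this diff (parallel_multiple 0.10, irrelevance 0.15, dialect_handling 0.15); the worked example below fills in the remaining weights with placeholder values purely to show the arithmetic:

```python
# Illustrative only: the simple/multiple/parallel weights here are assumptions,
# not the values used by app.py.
weights = {"simple": 0.25, "multiple": 0.20, "parallel": 0.15,        # assumed
           "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
scores = {"simple": 0.80, "multiple": 0.70, "parallel": 0.60,
          "parallel_multiple": 0.50, "irrelevance": 0.90, "dialect_handling": 0.65}

# Same formula as in the diff.
overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
print(round(overall, 4))  # ≈ 0.7125 with these example numbers
```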
@@ -519,7 +610,12 @@ def run_evaluation():
             "irrelevance": scores.get('irrelevance', 0),
             "dialect_handling": scores.get('dialect_handling', 0),
             "status": "completed"
-        }
+        }
+
+        results.append(new_result)
+
+        # Save cache after each model (in case of crash)
+        save_cached_results(results)
 
         # Update global data after each model
         temp_results = sorted(results, key=lambda x: x['overall'], reverse=True)
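None of the hunks above show how `run_evaluation` is actually triggered when the Space boots, so the sketch below is hypothetical wiring rather than code from this commit: a common pattern is to run the evaluation in a daemon thread so the Gradio UI launches immediately and the leaderboard fills in as cached and newly evaluated results arrive.

```python
# Hypothetical startup hook (assumed, not shown in this diff): run_evaluation()
# runs in the background while the UI reads LEADERBOARD_DATA / EVALUATION_STATUS.
import threading

import gradio as gr

def start_background_evaluation() -> threading.Thread:
    t = threading.Thread(target=run_evaluation, daemon=True)
    t.start()
    return t

if __name__ == "__main__":
    start_background_evaluation()
    with gr.Blocks(css=CUSTOM_CSS) as demo:
        ...  # leaderboard tables and a status box that poll the globals
    demo.launch()
```

Because `save_cached_results` runs after every model, a restart with this kind of hook resumes from the cache instead of re-evaluating everything.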