HeshamHaroon committed
Commit de63c9e · verified · 1 Parent(s): 8c455b2

Update: Auto-evaluation on Space startup

Files changed (2)
  1. README.md +1 -1
  2. afcl/app.py +104 -8
README.md CHANGED
@@ -38,7 +38,7 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
 
 📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
 
- - **147 test samples** across 10 categories
+ - **1,470 total samples** across 10 categories
   - Simple, Multiple, Parallel, Parallel Multiple
   - Irrelevance Detection
   - Dialect Handling (Egyptian, Gulf, Levantine)
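
Note: the README now states 1,470 total samples across 10 categories. A minimal sketch like the one below could recount the samples per category as a sanity check; the "train" split name and the "category" column are assumptions and may not match the dataset's actual schema.

# Hypothetical recount of the README numbers; adjust split/column names to the real schema.
from collections import Counter
from datasets import load_dataset

ds = load_dataset("HeshamHaroon/Arabic_Function_Calling", split="train")
counts = Counter(row["category"] for row in ds)

print(f"Total samples: {sum(counts.values())}")  # expected: 1,470 per the updated README
for category, n in sorted(counts.items()):
    print(f"{category}: {n}")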
afcl/app.py CHANGED
@@ -68,6 +68,67 @@ MODELS_TO_EVALUATE = [
 # Global state
 LEADERBOARD_DATA = []
 EVALUATION_STATUS = {"current": "Initializing...", "progress": 0, "total": len(MODELS_TO_EVALUATE)}
+RESULTS_DATASET_REPO = "HeshamHaroon/AFCL-Results"  # HuggingFace dataset for persistent storage
+
+
+def load_cached_results() -> List[Dict]:
+    """Load cached evaluation results from HuggingFace dataset."""
+    try:
+        from huggingface_hub import hf_hub_download
+        # Try to download the results file from HF
+        file_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_REPO,
+            filename="results.json",
+            repo_type="dataset",
+            token=os.getenv("HF_TOKEN")
+        )
+        with open(file_path, 'r', encoding='utf-8') as f:
+            cached = json.load(f)
+        print(f"✅ Loaded {len(cached)} cached results from HuggingFace")
+        return cached
+    except Exception as e:
+        print(f"No cached results found (will evaluate all models): {e}")
+        return []
+
+
+def save_cached_results(results: List[Dict]):
+    """Save evaluation results to HuggingFace dataset for persistence."""
+    try:
+        from huggingface_hub import HfApi, create_repo
+        import tempfile
+
+        api = HfApi()
+        token = os.getenv("HF_TOKEN")
+
+        # Create the dataset repo if it doesn't exist
+        try:
+            create_repo(
+                repo_id=RESULTS_DATASET_REPO,
+                repo_type="dataset",
+                exist_ok=True,
+                token=token
+            )
+        except:
+            pass
+
+        # Save results to a temp file and upload
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            temp_path = f.name
+
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo="results.json",
+            repo_id=RESULTS_DATASET_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update results ({len(results)} models)"
+        )
+
+        os.unlink(temp_path)
+        print(f"✅ Saved {len(results)} results to HuggingFace dataset")
+    except Exception as e:
+        print(f"⚠️ Error saving to HuggingFace (results may not persist): {e}")
 
 # Custom CSS for professional look
 CUSTOM_CSS = """
@@ -459,25 +520,55 @@ def evaluate_sample(model_id: str, sample: Dict) -> float:
 
 
 def run_evaluation():
-    """Run full evaluation on all models."""
+    """Run evaluation only on new models (uses cache for existing results)."""
     global LEADERBOARD_DATA, EVALUATION_STATUS
 
-    EVALUATION_STATUS["current"] = "Loading dataset..."
+    # Step 1: Load cached results first
+    EVALUATION_STATUS["current"] = "Loading cached results..."
+    cached_results = load_cached_results()
+
+    # Build set of already evaluated model IDs
+    evaluated_models = {r['model_id'] for r in cached_results}
+    print(f"Already evaluated: {len(evaluated_models)} models")
+
+    # Step 2: Check which models need evaluation
+    models_to_run = [m for m in MODELS_TO_EVALUATE if m['model_id'] not in evaluated_models]
+
+    if not models_to_run:
+        # All models already evaluated - just use cache
+        EVALUATION_STATUS["current"] = "All models evaluated (from cache)"
+        EVALUATION_STATUS["progress"] = len(MODELS_TO_EVALUATE)
+        LEADERBOARD_DATA = sorted(cached_results, key=lambda x: x['overall'], reverse=True)
+        for i, r in enumerate(LEADERBOARD_DATA, 1):
+            r['rank'] = i
+        print("All models loaded from cache - no new evaluation needed")
+        return
+
+    # Step 3: Load dataset only if we need to evaluate new models
+    EVALUATION_STATUS["current"] = f"Loading dataset ({len(models_to_run)} new models to evaluate)..."
     samples = load_evaluation_dataset()
 
     if not samples:
         EVALUATION_STATUS["current"] = "Failed to load dataset"
+        # Still show cached results
+        if cached_results:
+            LEADERBOARD_DATA = sorted(cached_results, key=lambda x: x['overall'], reverse=True)
+            for i, r in enumerate(LEADERBOARD_DATA, 1):
+                r['rank'] = i
        return
 
-    results = []
+    # Start with cached results
+    results = list(cached_results)
     total_models = len(MODELS_TO_EVALUATE)
 
-    for idx, model_config in enumerate(MODELS_TO_EVALUATE):
+    # Step 4: Evaluate only new models
+    for idx, model_config in enumerate(models_to_run):
         model_name = model_config['model']
         model_id = model_config['model_id']
 
-        EVALUATION_STATUS["current"] = f"Evaluating {model_name}..."
-        EVALUATION_STATUS["progress"] = idx + 1
+        evaluated_count = len(evaluated_models) + idx + 1
+        EVALUATION_STATUS["current"] = f"Evaluating {model_name}... ({evaluated_count}/{total_models})"
+        EVALUATION_STATUS["progress"] = evaluated_count
 
         category_scores = {}
         category_counts = {}
@@ -505,7 +596,7 @@ def run_evaluation():
                    "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
         overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
 
-        results.append({
+        new_result = {
             "model": model_name,
             "model_id": model_id,
             "organization": model_config['organization'],
@@ -519,7 +610,12 @@ def run_evaluation():
             "irrelevance": scores.get('irrelevance', 0),
             "dialect_handling": scores.get('dialect_handling', 0),
             "status": "completed"
-        })
+        }
+
+        results.append(new_result)
+
+        # Save cache after each model (in case of crash)
+        save_cached_results(results)
 
         # Update global data after each model
         temp_results = sorted(results, key=lambda x: x['overall'], reverse=True)
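
Note: the overall score in the context above is a weighted average normalized by the sum of the weights, so categories missing from scores simply contribute zero. A small worked sketch; only the parallel_multiple (0.10), irrelevance (0.15), and dialect_handling (0.15) weights are visible in this diff, and all scores below are made-up placeholders.

# Illustration of the normalized weighted average used for "overall".
# Weights beyond the three visible in the diff are omitted; scores are placeholders.
weights = {"parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
scores = {"parallel_multiple": 80.0, "irrelevance": 60.0}  # dialect_handling missing -> counts as 0

overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
# (80*0.10 + 60*0.15 + 0*0.15) / 0.40 = 17.0 / 0.40 = 42.5
print(overall)  # 42.5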
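
Note: the startup wiring itself is not part of this diff; per the commit message, run_evaluation is presumably kicked off when the Space boots. A minimal sketch of one way to do that is below; the background-thread wiring and the demo object are assumptions, not the app's actual code.

# Hypothetical startup hook: run the cache-aware evaluation in the background
# so the Gradio UI can come up immediately while EVALUATION_STATUS updates.
import threading
import gradio as gr

def start_background_evaluation():
    threading.Thread(target=run_evaluation, daemon=True).start()

with gr.Blocks() as demo:
    gr.Markdown("AFCL leaderboard loading… results appear as models finish.")

if __name__ == "__main__":
    start_background_evaluation()
    demo.launch()

For save_cached_results to actually persist anything, the Space would also need an HF_TOKEN secret with write access to the HeshamHaroon/AFCL-Results dataset repo.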