Update: Auto-evaluation on Space startup
- README.md +1 -1
- afcl/app.py +104 -8
README.md CHANGED

@@ -38,7 +38,7 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
 
 📊 **Dataset**: [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
 
-- **
+- **1,470 total samples** across 10 categories
 - Simple, Multiple, Parallel, Parallel Multiple
 - Irrelevance Detection
 - Dialect Handling (Egyptian, Gulf, Levantine)
afcl/app.py CHANGED

@@ -68,6 +68,67 @@ MODELS_TO_EVALUATE = [
 # Global state
 LEADERBOARD_DATA = []
 EVALUATION_STATUS = {"current": "Initializing...", "progress": 0, "total": len(MODELS_TO_EVALUATE)}
+RESULTS_DATASET_REPO = "HeshamHaroon/AFCL-Results"  # HuggingFace dataset for persistent storage
+
+
+def load_cached_results() -> List[Dict]:
+    """Load cached evaluation results from HuggingFace dataset."""
+    try:
+        from huggingface_hub import hf_hub_download
+        # Try to download the results file from HF
+        file_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_REPO,
+            filename="results.json",
+            repo_type="dataset",
+            token=os.getenv("HF_TOKEN")
+        )
+        with open(file_path, 'r', encoding='utf-8') as f:
+            cached = json.load(f)
+        print(f"✅ Loaded {len(cached)} cached results from HuggingFace")
+        return cached
+    except Exception as e:
+        print(f"No cached results found (will evaluate all models): {e}")
+        return []
+
+
+def save_cached_results(results: List[Dict]):
+    """Save evaluation results to HuggingFace dataset for persistence."""
+    try:
+        from huggingface_hub import HfApi, create_repo
+        import tempfile
+
+        api = HfApi()
+        token = os.getenv("HF_TOKEN")
+
+        # Create the dataset repo if it doesn't exist
+        try:
+            create_repo(
+                repo_id=RESULTS_DATASET_REPO,
+                repo_type="dataset",
+                exist_ok=True,
+                token=token
+            )
+        except:
+            pass
+
+        # Save results to a temp file and upload
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            temp_path = f.name
+
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo="results.json",
+            repo_id=RESULTS_DATASET_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update results ({len(results)} models)"
+        )
+
+        os.unlink(temp_path)
+        print(f"✅ Saved {len(results)} results to HuggingFace dataset")
+    except Exception as e:
+        print(f"⚠️ Error saving to HuggingFace (results may not persist): {e}")
 
 # Custom CSS for professional look
 CUSTOM_CSS = """
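The two helpers above persist the leaderboard as a single `results.json` in the `HeshamHaroon/AFCL-Results` dataset repo. For context, a minimal sketch of reading that cache back outside the app, assuming the repo exists and `HF_TOKEN` (if needed) grants access; the field names are taken from the result dicts built later in `run_evaluation`:

```python
# Standalone sanity check of the persisted cache (not part of app.py):
# download results.json from the results dataset and print the stored rows.
import json
import os

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="HeshamHaroon/AFCL-Results",  # same repo the app writes to
    filename="results.json",
    repo_type="dataset",
    token=os.getenv("HF_TOKEN"),          # only needed for private repos
)
with open(path, encoding="utf-8") as f:
    rows = json.load(f)

for row in sorted(rows, key=lambda r: r["overall"], reverse=True):
    print(f"{row['model']} ({row['organization']}): {row['overall']:.3f}")
```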
@@ -459,25 +520,55 @@ def evaluate_sample(model_id: str, sample: Dict) -> float:
 
 
 def run_evaluation():
-    """Run
+    """Run evaluation only on new models (uses cache for existing results)."""
     global LEADERBOARD_DATA, EVALUATION_STATUS
 
-
+    # Step 1: Load cached results first
+    EVALUATION_STATUS["current"] = "Loading cached results..."
+    cached_results = load_cached_results()
+
+    # Build set of already evaluated model IDs
+    evaluated_models = {r['model_id'] for r in cached_results}
+    print(f"Already evaluated: {len(evaluated_models)} models")
+
+    # Step 2: Check which models need evaluation
+    models_to_run = [m for m in MODELS_TO_EVALUATE if m['model_id'] not in evaluated_models]
+
+    if not models_to_run:
+        # All models already evaluated - just use cache
+        EVALUATION_STATUS["current"] = "All models evaluated (from cache)"
+        EVALUATION_STATUS["progress"] = len(MODELS_TO_EVALUATE)
+        LEADERBOARD_DATA = sorted(cached_results, key=lambda x: x['overall'], reverse=True)
+        for i, r in enumerate(LEADERBOARD_DATA, 1):
+            r['rank'] = i
+        print("All models loaded from cache - no new evaluation needed")
+        return
+
+    # Step 3: Load dataset only if we need to evaluate new models
+    EVALUATION_STATUS["current"] = f"Loading dataset ({len(models_to_run)} new models to evaluate)..."
     samples = load_evaluation_dataset()
 
     if not samples:
         EVALUATION_STATUS["current"] = "Failed to load dataset"
+        # Still show cached results
+        if cached_results:
+            LEADERBOARD_DATA = sorted(cached_results, key=lambda x: x['overall'], reverse=True)
+            for i, r in enumerate(LEADERBOARD_DATA, 1):
+                r['rank'] = i
         return
 
-
+    # Start with cached results
+    results = list(cached_results)
     total_models = len(MODELS_TO_EVALUATE)
 
-
+    # Step 4: Evaluate only new models
+    for idx, model_config in enumerate(models_to_run):
         model_name = model_config['model']
         model_id = model_config['model_id']
 
-
-        EVALUATION_STATUS["
+        evaluated_count = len(evaluated_models) + idx + 1
+        EVALUATION_STATUS["current"] = f"Evaluating {model_name}... ({evaluated_count}/{total_models})"
+        EVALUATION_STATUS["progress"] = evaluated_count
 
         category_scores = {}
         category_counts = {}
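The Step 1–4 flow above boils down to a set difference on `model_id`. A toy illustration with made-up data (the variable names mirror the app's, the values are not real results):

```python
# Only models whose model_id is absent from the cached results get re-evaluated.
cached_results = [{"model_id": "org/model-a", "overall": 0.71}]          # pretend cache
MODELS_TO_EVALUATE = [{"model": "Model A", "model_id": "org/model-a"},
                      {"model": "Model B", "model_id": "org/model-b"}]   # pretend config

evaluated_models = {r["model_id"] for r in cached_results}
models_to_run = [m for m in MODELS_TO_EVALUATE if m["model_id"] not in evaluated_models]
print([m["model_id"] for m in models_to_run])  # ['org/model-b']
```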
@@ -505,7 +596,7 @@ def run_evaluation():
                    "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
         overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
 
-
+        new_result = {
             "model": model_name,
             "model_id": model_id,
             "organization": model_config['organization'],
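The `overall` line in the hunk above is a weighted average of the per-category accuracies, normalized by the total weight so that a missing category (`scores.get(c, 0)`) simply drags the average down. Only three weights are visible in this diff (parallel_multiple 0.10, irrelevance 0.15, dialect_handling 0.15); the worked example below fills in the remaining weights with placeholder values purely to show the arithmetic:

```python
# Illustrative only: the simple/multiple/parallel weights here are assumptions,
# not the values used by app.py.
weights = {"simple": 0.25, "multiple": 0.20, "parallel": 0.15,        # assumed
           "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
scores = {"simple": 0.80, "multiple": 0.70, "parallel": 0.60,
          "parallel_multiple": 0.50, "irrelevance": 0.90, "dialect_handling": 0.65}

# Same formula as in the diff.
overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
print(round(overall, 4))  # ≈ 0.7125 with these example numbers
```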
@@ -519,7 +610,12 @@ def run_evaluation():
             "irrelevance": scores.get('irrelevance', 0),
             "dialect_handling": scores.get('dialect_handling', 0),
             "status": "completed"
-        }
+        }
+
+        results.append(new_result)
+
+        # Save cache after each model (in case of crash)
+        save_cached_results(results)
 
         # Update global data after each model
         temp_results = sorted(results, key=lambda x: x['overall'], reverse=True)
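None of the hunks above show how `run_evaluation` is actually triggered when the Space boots, so the sketch below is hypothetical wiring rather than code from this commit: a common pattern is to run the evaluation in a daemon thread so the Gradio UI launches immediately and the leaderboard fills in as cached and newly evaluated results arrive.

```python
# Hypothetical startup hook (assumed, not shown in this diff): run_evaluation()
# runs in the background while the UI reads LEADERBOARD_DATA / EVALUATION_STATUS.
import threading

import gradio as gr

def start_background_evaluation() -> threading.Thread:
    t = threading.Thread(target=run_evaluation, daemon=True)
    t.start()
    return t

if __name__ == "__main__":
    start_background_evaluation()
    with gr.Blocks(css=CUSTOM_CSS) as demo:
        ...  # leaderboard tables and a status box that poll the globals
    demo.launch()
```

Because `save_cached_results` runs after every model, a restart with this kind of hook resumes from the cache instead of re-evaluating everything.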