Spaces:

optiviseapp
/

fnmodel

Paused

App Files Files Community

aeb56 committed 29 days ago

Commit

0b25a32

1 Parent(s): d7f07c2

Add live status table and improved logging with attn_implementation=eager fix

Browse files

Files changed (1) hide show

app.py +92 -28

app.py CHANGED Viewed

@@ -20,6 +20,34 @@ class ChatBot:
         self.tokenizer = None
         self.loaded = False
     def load_model(self):
         if self.loaded:
             return "✅ Model already loaded!"
@@ -140,11 +168,15 @@ class ChatBot:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             output_dir = f"/tmp/eval_results_{timestamp}"
-            yield f"🔄 **Preparing for evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\n"
             # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
             if self.loaded and self.model is not None:
-                yield f"🔄 **Unloading model to free VRAM...**\n\nThis is necessary because lm_eval will load its own instance.\n\n"
                 if self.model is not None:
                     del self.model
@@ -155,7 +187,8 @@ class ChatBot:
                 self.loaded = False
             else:
-                yield f"🔄 **Cleaning up memory...**\n\nPreparing environment for evaluation.\n\n"
             # Aggressive memory cleanup
             import gc
@@ -170,27 +203,35 @@ class ChatBot:
                     torch.cuda.reset_accumulated_memory_stats(device=i)
             # Wait for memory to be fully released
-            yield f"🔄 **Waiting for memory cleanup...**\n\nGiving the system time to fully release VRAM.\n\n"
             time.sleep(5)
             # Final garbage collection
             gc.collect()
-            yield f"✅ **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"
             # Run lm_eval with optimized memory settings
             # Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
             cmd = [
                 "lm_eval",
                 "--model", "hf",
-                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
                 "--tasks", task_string,
                 "--batch_size", "1",  # Reduced to minimize memory usage
                 "--output_path", output_dir,
                 "--log_samples"
             ]
-            yield f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"
             # Run evaluation
             process = subprocess.Popen(
@@ -202,16 +243,24 @@ class ChatBot:
             )
             output_lines = []
             for line in process.stdout:
                 output_lines.append(line)
-                # Show last 20 lines
-                recent = ''.join(output_lines[-20:])
-                yield f"🔄 **Running evaluation...**\n\n```\n{recent}\n```"
             process.wait()
             if process.returncode != 0:
-                yield f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\nLogs:\n```\n{''.join(output_lines[-50:])}\n```"
                 return
             # Read results
@@ -220,39 +269,53 @@ class ChatBot:
                 with open(results_file, 'r') as f:
                     results = json.load(f)
-                # Format results
-                result_text = "✅ **Evaluation Complete!**\n\n"
-                result_text += f"**Timestamp:** {timestamp}\n\n"
-                result_text += "## 📊 Results:\n\n"
                 for task in selected_tasks:
                     if task in results['results']:
                         task_results = results['results'][task]
-                        result_text += f"### {task}\n"
                         for metric, value in task_results.items():
                             if isinstance(value, float):
-                                result_text += f"- **{metric}:** {value:.4f}\n"
                             else:
-                                result_text += f"- **{metric}:** {value}\n"
-                        result_text += "\n"
                 # Add summary if available
                 if 'summary' in results:
-                    result_text += "## 📈 Summary:\n\n"
                     for metric, value in results['summary'].items():
                         if isinstance(value, float):
-                            result_text += f"- **{metric}:** {value:.4f}\n"
                         else:
-                            result_text += f"- **{metric}:** {value}\n"
-                result_text += f"\n\n**Full results saved to:** `{output_dir}`"
-                yield result_text
             else:
-                yield f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
         except Exception as e:
-            yield f"❌ **Evaluation error:**\n\n{str(e)}"
 # Initialize
 bot = ChatBot()
@@ -351,7 +414,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation")
                     """)
                 with gr.Column(scale=2):
-                    eval_results = gr.Markdown("Results will appear here after evaluation completes.")
             gr.Markdown("""
             ---
@@ -384,7 +448,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation")
     """
     # Evaluation event handler
-    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

         self.tokenizer = None
         self.loaded = False
+    def _create_status_table(self, tasks, status="⏳ Waiting", results=None):
+        """Create a markdown table showing evaluation status.
+
+        Args:
+            tasks: Iterable of benchmark display names; one table row is
+                emitted per entry, in order.
+            status: Status text shown for every task that has no entry in
+                ``results``.
+            results: Optional mapping of display name to
+                ``{lm_eval task key: metrics dict}``. A task found in this
+                mapping is marked "✅ Complete" and, for the known
+                benchmarks, gets its score filled in.
+
+        Returns:
+            The heading plus table as a single markdown string.
+        """
+        # Display name -> (lm_eval task key, headline metric, detail metric).
+        known_benchmarks = {
+            "ARC-Challenge": ("arc_challenge", "acc_norm", "acc"),
+            "TruthfulQA": ("truthfulqa_mc2", "acc", None),
+            "Winogrande": ("winogrande", "acc", None),
+        }
+
+        def metric_value(metrics, name):
+            # Recent lm_eval releases store metrics as "<metric>,<filter>"
+            # (e.g. "acc,none"); older ones use the bare metric name. Try
+            # both so the table does not silently report 0.00%.
+            for key in (name, f"{name},none"):
+                value = metrics.get(key)
+                if isinstance(value, (int, float)):
+                    return value
+            return None
+
+        lines = [
+            "## 📊 Evaluation Progress\n\n",
+            "| Benchmark | Status | Score | Details |\n",
+            "|-----------|--------|-------|----------|\n",
+        ]
+        for task in tasks:
+            task_status = status
+            task_score = "-"
+            task_details = ""
+            if results and task in results:
+                task_status = "✅ Complete"
+                task_key, score_metric, detail_metric = known_benchmarks.get(
+                    task, (None, None, None)
+                )
+                if task_key is not None and task_key in results[task]:
+                    score_data = results[task][task_key]
+                    score = metric_value(score_data, score_metric)
+                    # Keep the "-" placeholder when the metric is absent
+                    # rather than showing a fabricated 0.00%.
+                    if score is not None:
+                        task_score = f"{score:.2%}"
+                    if detail_metric is not None:
+                        detail = metric_value(score_data, detail_metric)
+                        if detail is not None:
+                            task_details = f"{detail_metric}: {detail:.2%}"
+            lines.append(
+                f"| {task} | {task_status} | {task_score} | {task_details} |\n"
+            )
+        return "".join(lines)
     def load_model(self):
         if self.loaded:
             return "✅ Model already loaded!"
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             output_dir = f"/tmp/eval_results_{timestamp}"
+            # Initial status table
+            status_table = self._create_status_table(tasks_to_run, "⏳ Preparing")
+            logs = "🔄 **Preparing for evaluation...**\n\nTasks: " + ", ".join(tasks_to_run) + "\n\n"
+            yield status_table, logs
             # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
             if self.loaded and self.model is not None:
+                logs += "🔄 **Unloading model to free VRAM...**\n\n"
+                yield status_table, logs
                 if self.model is not None:
                     del self.model
                 self.loaded = False
             else:
+                logs += "🔄 **Cleaning up memory...**\n\n"
+                yield status_table, logs
             # Aggressive memory cleanup
             import gc
                     torch.cuda.reset_accumulated_memory_stats(device=i)
             # Wait for memory to be fully released
+            logs += "🔄 **Waiting for memory cleanup (5s)...**\n\n"
+            yield status_table, logs
             time.sleep(5)
             # Final garbage collection
             gc.collect()
+            status_table = self._create_status_table(tasks_to_run, "🔄 Loading Model")
+            logs += "✅ **Memory cleared! Starting evaluation...**\n\n"
+            logs += f"⏱️ Estimated time: 30-60 minutes\n\n"
+            yield status_table, logs
             # Run lm_eval with optimized memory settings
             # Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
+            # attn_implementation=eager is required because flash attention isn't properly installed
             cmd = [
                 "lm_eval",
                 "--model", "hf",
+                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True,attn_implementation=eager",
                 "--tasks", task_string,
                 "--batch_size", "1",  # Reduced to minimize memory usage
                 "--output_path", output_dir,
                 "--log_samples"
             ]
+            status_table = self._create_status_table(tasks_to_run, "🔄 Running")
+            logs += f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\n"
+            logs += "---\n\n### 📜 Live Logs (last 15 lines):\n\n```\n"
+            yield status_table, logs
             # Run evaluation
             process = subprocess.Popen(
             )
             output_lines = []
+            log_update_counter = 0
             for line in process.stdout:
                 output_lines.append(line)
+                log_update_counter += 1
+                # Update every 5 lines to reduce UI flickering
+                if log_update_counter % 5 == 0:
+                    recent = ''.join(output_lines[-15:])
+                    current_logs = logs + recent + "\n```"
+                    yield status_table, current_logs
             process.wait()
             if process.returncode != 0:
+                status_table = self._create_status_table(tasks_to_run, "❌ Failed")
+                error_logs = logs + ''.join(output_lines[-50:]) + "\n```\n\n"
+                error_logs += f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n"
+                yield status_table, error_logs
                 return
             # Read results
                 with open(results_file, 'r') as f:
                     results = json.load(f)
+                # Parse results for status table
+                parsed_results = {}
+                for task in tasks_to_run:
+                    task_key = task_map[task]
+                    if task_key in results['results']:
+                        parsed_results[task] = {task_key: results['results'][task_key]}
+                # Update status table with results
+                status_table = self._create_status_table(tasks_to_run, "✅ Complete", parsed_results)
+                # Format detailed results
+                result_logs = "✅ **Evaluation Complete!**\n\n"
+                result_logs += f"**Timestamp:** {timestamp}\n\n"
+                result_logs += "## 📊 Detailed Results:\n\n"
                 for task in selected_tasks:
                     if task in results['results']:
                         task_results = results['results'][task]
+                        result_logs += f"### {task}\n"
                         for metric, value in task_results.items():
                             if isinstance(value, float):
+                                result_logs += f"- **{metric}:** {value:.4f}\n"
                             else:
+                                result_logs += f"- **{metric}:** {value}\n"
+                        result_logs += "\n"
                 # Add summary if available
                 if 'summary' in results:
+                    result_logs += "## 📈 Summary:\n\n"
                     for metric, value in results['summary'].items():
                         if isinstance(value, float):
+                            result_logs += f"- **{metric}:** {value:.4f}\n"
                         else:
+                            result_logs += f"- **{metric}:** {value}\n"
+                result_logs += f"\n\n**Full results saved to:** `{output_dir}`"
+                yield status_table, result_logs
             else:
+                status_table = self._create_status_table(tasks_to_run, "⚠️ Unknown")
+                warning_logs = f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
+                yield status_table, warning_logs
         except Exception as e:
+            status_table = self._create_status_table(tasks_to_run if 'tasks_to_run' in locals() else [], "❌ Error")
+            error_logs = f"❌ **Evaluation error:**\n\n{str(e)}"
+            yield status_table, error_logs
 # Initialize
 bot = ChatBot()
                     """)
                 with gr.Column(scale=2):
+                    eval_status = gr.Markdown("## 📊 Evaluation Progress\n\nClick '🚀 Start Evaluation' to begin.", height=200)
+                    eval_logs = gr.Markdown("### 📜 Logs\n\nLogs will appear here during evaluation.", height=500)
             gr.Markdown("""
             ---
     """
     # Evaluation event handler
+    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=[eval_status, eval_logs])
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)