import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import subprocess
import json
from datetime import datetime
import time

# Set environment variables for flash-linear-attention and memory management
os.environ["FLA_USE_TRITON"] = "1"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"  # Updated from PYTORCH_CUDA_ALLOC_CONF

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"


class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loaded = False

    def _create_status_table(self, tasks, status="⏳ Waiting", results=None):
        """Create a markdown table showing evaluation status"""
        table = "## 📊 Evaluation Progress\n\n"
        table += "| Benchmark | Status | Score | Details |\n"
        table += "|-----------|--------|-------|----------|\n"

        for task in tasks:
            task_status = status
            task_score = "-"
            task_details = ""

            if results and task in results:
                task_status = "✅ Complete"
                if task == "ARC-Challenge" and "arc_challenge" in results[task]:
                    score_data = results[task]["arc_challenge"]
                    task_score = f"{score_data.get('acc_norm', 0):.2%}"
                    task_details = f"acc: {score_data.get('acc', 0):.2%}"
                elif task == "TruthfulQA" and "truthfulqa_mc2" in results[task]:
                    score_data = results[task]["truthfulqa_mc2"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
                elif task == "Winogrande" and "winogrande" in results[task]:
                    score_data = results[task]["winogrande"]
                    task_score = f"{score_data.get('acc', 0):.2%}"

            table += f"| {task} | {task_status} | {task_score} | {task_details} |\n"

        return table

    def load_model(self):
        # This method is a generator, so the early-exit message must be yielded
        # (not returned) for Gradio to display it.
        if self.loaded:
            yield "✅ Model already loaded!"
            return

        try:
            yield "🔄 Loading tokenizer..."
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

            yield "🔄 Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."

            # Configure memory for 4 GPUs
            num_gpus = torch.cuda.device_count()
            max_memory = {i: "23GB" for i in range(num_gpus)}  # L4 has 24GB, leave 1GB headroom

            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="balanced",
                max_memory=max_memory,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="eager",
            )
            self.model.eval()

            # Patch model config to avoid flash attention issues
            if hasattr(self.model.config, '_attn_implementation'):
                self.model.config._attn_implementation = "eager"
            if hasattr(self.model.config, 'attn_implementation'):
                self.model.config.attn_implementation = "eager"

            self.loaded = True

            # Get GPU distribution info
            if hasattr(self.model, 'hf_device_map'):
                device_info = "\n\n**GPU Distribution:**\n"
                devices = {}
                for name, device in self.model.hf_device_map.items():
                    if device not in devices:
                        devices[device] = 0
                    devices[device] += 1
                for device, count in devices.items():
                    device_info += f"- {device}: {count} layers\n"
            else:
                device_info = ""

            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use the Evaluation tab."

        except Exception as e:
            self.loaded = False
            yield f"❌ **Error loading model:**\n\n{str(e)}"

    def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
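        # NOTE: the prompt below is assembled by hand with "User:" / "Assistant:"
        # prefixes. If the tokenizer defines a chat template, a possible alternative
        # sketch (this assumes the checkpoint actually ships such a template) would be:
        #     messages = [{"role": "user", "content": message}]
        #     prompt = self.tokenizer.apply_chat_template(
        #         messages, tokenize=False, add_generation_prompt=True
        #     )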
        try:
            # Build prompt from history
            conversation = []
            if system_prompt.strip():
                conversation.append(f"System: {system_prompt}")

            for user_msg, bot_msg in history:
                conversation.append(f"User: {user_msg}")
                if bot_msg:
                    conversation.append(f"Assistant: {bot_msg}")

            conversation.append(f"User: {message}")
            conversation.append("Assistant:")
            prompt = "\n".join(conversation)

            # Tokenize
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
                    use_cache=True,
                )

            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract assistant response
            if "Assistant:" in response:
                response = response.split("Assistant:")[-1].strip()

            return response

        except Exception as e:
            return f"❌ Error: {str(e)}"

    def run_evaluation(self, tasks_to_run):
        """Run lm_eval on selected tasks"""
        # Note: We don't strictly require the model to be loaded first
        # since we'll be unloading it anyway. The load step is just for verification.
        try:
            # Map friendly names to lm_eval task names
            task_map = {
                "ARC-Challenge": "arc_challenge",
                "TruthfulQA": "truthfulqa_mc2",
                "Winogrande": "winogrande",
            }
            selected_tasks = [task_map[t] for t in tasks_to_run]
            task_string = ",".join(selected_tasks)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_dir = f"/tmp/eval_results_{timestamp}"

            # Initial status table
            status_table = self._create_status_table(tasks_to_run, "⏳ Preparing")
            logs = "🔄 **Preparing for evaluation...**\n\nTasks: " + ", ".join(tasks_to_run) + "\n\n"
            yield status_table, logs

            # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
            if self.loaded and self.model is not None:
                logs += "🔄 **Unloading model to free VRAM...**\n\n"
                yield status_table, logs

                del self.model
                self.model = None
                if self.tokenizer is not None:
                    del self.tokenizer
                    self.tokenizer = None
                self.loaded = False
            else:
                logs += "🔄 **Cleaning up memory...**\n\n"
                yield status_table, logs

            # Aggressive memory cleanup
            import gc
            for _ in range(3):
                gc.collect()

            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize(device=i)
                    torch.cuda.reset_peak_memory_stats(device=i)
                    torch.cuda.reset_accumulated_memory_stats(device=i)

            # Wait for memory to be fully released
            logs += "🔄 **Waiting for memory cleanup (5s)...**\n\n"
            yield status_table, logs
            time.sleep(5)

            # Final garbage collection
            gc.collect()

            status_table = self._create_status_table(tasks_to_run, "🔄 Loading Model")
            logs += "✅ **Memory cleared! Starting evaluation...**\n\n"
            logs += "⏱️ Estimated time: 30-60 minutes\n\n"
            yield status_table, logs
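            # lm_eval runs in a separate Python process (the wrapper script written
            # below), so it loads its own fresh copy of the model and the VRAM freed
            # above remains available to that subprocess.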
            # Create a wrapper script that disables flash attention before running lm_eval
            wrapper_script = f"/tmp/run_eval_{timestamp}.py"
            with open(wrapper_script, 'w') as f:
                f.write(f"""
import sys
import os

# Monkey-patch transformers to disable flash attention
import transformers.modeling_flash_attention_utils as flash_utils

def disabled_lazy_import(*args, **kwargs):
    raise ImportError("Flash attention disabled - using eager attention")

flash_utils.lazy_import_flash_attention = disabled_lazy_import

# Now run lm_eval
sys.argv = [
    'lm_eval',
    '--model', 'hf',
    '--model_args', 'pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True,attn_implementation=eager',
    '--tasks', '{task_string}',
    '--batch_size', '1',
    '--output_path', '{output_dir}',
    '--log_samples'
]

from lm_eval.__main__ import cli_evaluate
cli_evaluate()
""")

            logs += "⚠️ **Note:** Flash attention disabled, using eager attention (slower but compatible)\n\n"
            yield status_table, logs

            # Run lm_eval via wrapper script
            cmd = ["python3", wrapper_script]

            status_table = self._create_status_table(tasks_to_run, "🔄 Running")
            logs += f"🔄 **Running lm_eval...**\n\nTasks: {task_string}\n\n"
            logs += "---\n\n### 📜 Live Logs (last 15 lines):\n\n```\n"
            yield status_table, logs

            # Run evaluation
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
            )

            output_lines = []
            log_update_counter = 0
            for line in process.stdout:
                output_lines.append(line)
                log_update_counter += 1
                # Update every 5 lines to reduce UI flickering
                if log_update_counter % 5 == 0:
                    recent = ''.join(output_lines[-15:])
                    current_logs = logs + recent + "\n```"
                    yield status_table, current_logs

            process.wait()

            if process.returncode != 0:
                status_table = self._create_status_table(tasks_to_run, "❌ Failed")
                error_logs = logs + ''.join(output_lines[-50:]) + "\n```\n\n"
                error_logs += f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n"
                yield status_table, error_logs
                return

            # Read results
            results_file = os.path.join(output_dir, "results.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results = json.load(f)

                # Parse results for status table
                parsed_results = {}
                for task in tasks_to_run:
                    task_key = task_map[task]
                    if task_key in results['results']:
                        parsed_results[task] = {task_key: results['results'][task_key]}

                # Update status table with results
                status_table = self._create_status_table(tasks_to_run, "✅ Complete", parsed_results)

                # Format detailed results
                result_logs = "✅ **Evaluation Complete!**\n\n"
                result_logs += f"**Timestamp:** {timestamp}\n\n"
                result_logs += "## 📊 Detailed Results:\n\n"

                for task in selected_tasks:
                    if task in results['results']:
                        task_results = results['results'][task]
                        result_logs += f"### {task}\n"
                        for metric, value in task_results.items():
                            if isinstance(value, float):
                                result_logs += f"- **{metric}:** {value:.4f}\n"
                            else:
                                result_logs += f"- **{metric}:** {value}\n"
                        result_logs += "\n"

                # Add summary if available
                if 'summary' in results:
                    result_logs += "## 📈 Summary:\n\n"
                    for metric, value in results['summary'].items():
                        if isinstance(value, float):
                            result_logs += f"- **{metric}:** {value:.4f}\n"
                        else:
                            result_logs += f"- **{metric}:** {value}\n"

                result_logs += f"\n\n**Full results saved to:** `{output_dir}`"
                yield status_table, result_logs
            else:
                status_table = self._create_status_table(tasks_to_run, "⚠️ Unknown")
                warning_logs = (
                    "⚠️ **Evaluation completed but results file not found.**\n\n"
                    f"Output:\n```\n{''.join(output_lines[-30:])}\n```"
                )
                yield status_table, warning_logs
        except Exception as e:
            status_table = self._create_status_table(tasks_to_run if 'tasks_to_run' in locals() else [], "❌ Error")
            error_logs = f"❌ **Evaluation error:**\n\n{str(e)}"
            yield status_table, error_logs


# Initialize
bot = ChatBot()

# UI with Tabs
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation") as demo:
    gr.Markdown("""
    # 📊 Kimi Linear 48B A3B - Evaluation

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`

    **This Space is configured for model evaluation only. Chat/inference is disabled.**
    """)

    # Show GPU info
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
        gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")

    with gr.Tabs():
        # Tab 1: Controls (always visible)
        with gr.Tab("🎛️ Controls"):
            gr.Markdown("### Load Model (Optional)")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            status = gr.Markdown("**Status:** Model not loaded")

            gr.Markdown("""
            ### ℹ️ Instructions
            1. **(Optional)** Click "Load Model" to verify setup (takes 5-10 minutes)
            2. **Go directly to the Evaluation tab** to run benchmarks

            **Note:**
            - Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
            - Loading the model first is optional - you can go straight to the Evaluation tab.
            - Any loaded model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
            """)

        # Tab 2: Chat - DISABLED
        # Uncomment this section to re-enable chat functionality
        """
        with gr.Tab("💬 Chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Settings")
                    system_prompt = gr.Textbox(
                        label="System Prompt",
                        placeholder="You are a helpful assistant...",
                        lines=2
                    )
                    max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
                    temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
                    top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)

                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(height=500, show_copy_button=True)
                    with gr.Row():
                        msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
                        send = gr.Button("Send", variant="primary", scale=1)
                    clear = gr.Button("Clear Chat")
        """

        # Tab 3: Evaluation
        with gr.Tab("📊 Evaluation"):
            gr.Markdown("""
            ### Run LM Evaluation Harness

            Select benchmarks to evaluate your fine-tuned model.
            **Estimated time: 30-60 minutes total.**
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Select Benchmarks")
                    tasks = gr.CheckboxGroup(
                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        label="Tasks to Run",
                        info="Select one or more tasks"
                    )
                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")

                    gr.Markdown("""
                    ### ⏱️ Estimated Time:
                    - **ARC-Challenge:** 15-30 min
                    - **TruthfulQA:** 10-20 min
                    - **Winogrande:** 15-30 min

                    **Total:** ~40-80 minutes for all 3
                    """)

                with gr.Column(scale=2):
                    eval_status = gr.Markdown("## 📊 Evaluation Progress\n\nClick '🚀 Start Evaluation' to begin.")
                    eval_logs = gr.Markdown("### 📜 Logs\n\nLogs will appear here during evaluation.")

            gr.Markdown("""
            ---
            **Note:**
            - You can start evaluation immediately - no need to load the model first.
            - If you did load the model, it will be automatically unloaded before evaluation to free VRAM.
            - lm_eval will load its own fresh instance of the model for evaluation.
            - Results will be saved to `/tmp/eval_results_[timestamp]/`
            """)

    gr.Markdown("""
    ---
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

    # Events
    load_btn.click(bot.load_model, outputs=status)

    # Chat event handlers - DISABLED
    # Uncomment these lines to re-enable chat functionality
    """
    def respond(message, history, system, max_tok, temp, top):
        bot_message = bot.chat(message, history, system, max_tok, temp, top)
        history.append((message, bot_message))
        return history, ""

    msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    clear.click(lambda: None, None, chatbot)
    """

    # Evaluation event handler
    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=[eval_status, eval_logs])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)