Spaces · aeb56 committed · Commit 29f5263 · Parent(s): 2f60fd7

Add Evaluation tab with ARC-Challenge, TruthfulQA, and Winogrande benchmarks

Browse files:
- app.py +182 -35
- requirements.txt +3 -0
app.py CHANGED

@@ -2,6 +2,9 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import os
+import subprocess
+import json
+from datetime import datetime
 
 # Set environment variable for flash-linear-attention
 os.environ["FLA_USE_TRITON"] = "1"
@@ -32,11 +35,11 @@ class ChatBot:
         self.model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             torch_dtype=torch.bfloat16,
+            device_map="balanced",
             max_memory=max_memory,
             trust_remote_code=True,
             low_cpu_mem_usage=True,
+            attn_implementation="eager",
         )
 
         self.model.eval()
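Context for the loader hunk above: `device_map="balanced"` asks Accelerate to spread the 48B model's layers evenly across the visible GPUs, while `max_memory` (presumably built earlier in `load_model`) caps what each device may receive. A minimal sketch of how such a cap is commonly constructed; the helper name and the 0.9 headroom factor are illustrative assumptions, not taken from this Space:

```python
import torch

def build_max_memory(headroom: float = 0.9) -> dict:
    # Cap each GPU a bit below its capacity so activations and the CUDA
    # context still fit; Accelerate accepts {device_id: "NGiB"} mappings.
    max_memory = {}
    for i in range(torch.cuda.device_count()):
        total_gib = torch.cuda.get_device_properties(i).total_memory / 1024**3
        max_memory[i] = f"{int(total_gib * headroom)}GiB"
    return max_memory

# e.g. {0: "72GiB", 1: "72GiB"} on two 80GB cards
```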
@@ -62,7 +65,7 @@ class ChatBot:
             else:
                 device_info = ""
 
+            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use Chat or Evaluation tabs."
 
         except Exception as e:
             self.loaded = False
@@ -70,7 +73,7 @@ class ChatBot:
 
     def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
         if not self.loaded:
-            return "❌ Please load the model first by clicking the 'Load Model' button."
+            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
 
         try:
             # Build prompt from history
@@ -92,7 +95,7 @@ class ChatBot:
             inputs = self.tokenizer(prompt, return_tensors="pt")
             inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
+            # Generate
            with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
@@ -101,7 +104,7 @@ class ChatBot:
                     top_p=top_p,
                     do_sample=temperature > 0,
                     pad_token_id=self.tokenizer.eos_token_id,
+                    use_cache=True,
                 )
 
             # Decode
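The `# Decode` block itself falls outside this hunk's context lines. As a hedged sketch only (the Space's exact decode code is not shown in the diff), the usual pattern is to slice off the echoed prompt tokens before decoding:

```python
def decode_new_tokens(tokenizer, inputs, outputs) -> str:
    # Decode only the freshly generated tokens, skipping the echoed prompt
    # and special tokens such as <eos>/<pad>.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
```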
@@ -115,17 +118,112 @@ class ChatBot:
 
         except Exception as e:
             return f"❌ Error: {str(e)}"
+
+    def run_evaluation(self, tasks_to_run):
+        """Run lm_eval on selected tasks"""
+        if not self.loaded:
+            yield "❌ Please load the model first!"
+            return
+
+        try:
+            # Map friendly names to lm_eval task names
+            task_map = {
+                "ARC-Challenge": "arc_challenge",
+                "TruthfulQA": "truthfulqa_mc2",
+                "Winogrande": "winogrande"
+            }
+
+            selected_tasks = [task_map[t] for t in tasks_to_run]
+            task_string = ",".join(selected_tasks)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_dir = f"/tmp/eval_results_{timestamp}"
+
+            yield f"🚀 **Starting evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\nThis will take 30-60 minutes total.\n\n"
+
+            # Run lm_eval
+            cmd = [
+                "lm_eval",
+                "--model", "hf",
+                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16",
+                "--tasks", task_string,
+                "--batch_size", "auto:4",
+                "--output_path", output_dir,
+                "--log_samples"
+            ]
+
+            yield f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"
+
+            # Run evaluation
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                bufsize=1
+            )
+
+            output_lines = []
+            for line in process.stdout:
+                output_lines.append(line)
+                # Show last 20 lines
+                recent = ''.join(output_lines[-20:])
+                yield f"🔄 **Running evaluation...**\n\n```\n{recent}\n```"
+
+            process.wait()
+
+            if process.returncode != 0:
+                yield f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\nLogs:\n```\n{''.join(output_lines[-50:])}\n```"
+                return
+
+            # Read results
+            results_file = os.path.join(output_dir, "results.json")
+            if os.path.exists(results_file):
+                with open(results_file, 'r') as f:
+                    results = json.load(f)
+
+                # Format results
+                result_text = "✅ **Evaluation Complete!**\n\n"
+                result_text += f"**Timestamp:** {timestamp}\n\n"
+                result_text += "## 📊 Results:\n\n"
+
+                for task in selected_tasks:
+                    if task in results['results']:
+                        task_results = results['results'][task]
+                        result_text += f"### {task}\n"
+                        for metric, value in task_results.items():
+                            if isinstance(value, float):
+                                result_text += f"- **{metric}:** {value:.4f}\n"
+                            else:
+                                result_text += f"- **{metric}:** {value}\n"
+                        result_text += "\n"
+
+                # Add summary if available
+                if 'summary' in results:
+                    result_text += "## 📈 Summary:\n\n"
+                    for metric, value in results['summary'].items():
+                        if isinstance(value, float):
+                            result_text += f"- **{metric}:** {value:.4f}\n"
+                        else:
+                            result_text += f"- **{metric}:** {value}\n"
+
+                result_text += f"\n\n**Full results saved to:** `{output_dir}`"
+
+                yield result_text
+            else:
+                yield f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
+
+        except Exception as e:
+            yield f"❌ **Evaluation error:**\n\n{str(e)}"
 
 # Initialize
 bot = ChatBot()
 
-# UI
+# UI with Tabs
 with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     gr.Markdown("""
     # 🚀 Kimi Linear 48B A3B - Fine-tuned
 
-    Chat interface for the fine-tuned Kimi model.
-
     **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
     """)
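A note on the parsing in `run_evaluation` above: `lm_eval` writes a `results.json` whose top-level `results` key maps each task name to metric/value pairs, which is exactly what the loop reads. A sketch of that shape and an equivalent standalone read; the file path and the metric keys (which vary across harness versions, e.g. `acc,none`) are illustrative:

```python
import json

# Illustrative lm_eval output shape (metric names differ by version):
# {"results": {"arc_challenge": {"acc,none": 0.43, "acc_norm,none": 0.46}, ...}}
with open("/tmp/eval_results_20250101_000000/results.json") as f:  # hypothetical path
    results = json.load(f)

for task, metrics in results["results"].items():
    for metric, value in metrics.items():
        if isinstance(value, float):
            print(f"{task} {metric}: {value:.4f}")
```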
@@ -136,35 +234,87 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
|
|
| 136 |
total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
|
| 137 |
gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
|
| 138 |
|
| 139 |
-
with gr.
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
load_btn = gr.Button("π Load Model", variant="primary", size="lg")
|
| 144 |
status = gr.Markdown("**Status:** Model not loaded")
|
| 145 |
|
| 146 |
-
gr.Markdown("
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
lines=2
|
| 153 |
-
)
|
| 154 |
-
|
| 155 |
-
max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
|
| 156 |
-
temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
|
| 157 |
-
top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
with gr.Row():
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
# Events
|
| 170 |
load_btn.click(bot.load_model, outputs=status)
|
|
@@ -178,10 +328,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
|
|
| 178 |
send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
|
| 179 |
clear.click(lambda: None, None, chatbot)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
---
|
| 183 |
-
**Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
|
| 184 |
-
""")
|
| 185 |
|
| 186 |
if __name__ == "__main__":
|
| 187 |
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
|
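One detail worth noting about the events wired above: `run_evaluation` (and `load_model`) are generators, and Gradio treats a generator event handler as a streaming update, re-rendering the output component on every `yield`. A self-contained sketch of the same pattern, with illustrative names:

```python
import time
import gradio as gr

def stream_progress():
    # Each yield replaces the Markdown content, giving live progress updates.
    for step in range(1, 4):
        yield f"Running step {step}/3..."
        time.sleep(1)
    yield "Done."

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    out = gr.Markdown()
    btn.click(stream_progress, outputs=out)
```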
requirements.txt CHANGED

@@ -10,6 +10,9 @@ triton>=3.0.0
 # Flash Linear Attention (required by Kimi model)
 git+https://github.com/sustcsonglin/flash-linear-attention.git@main
 
+# Evaluation
+lm-eval>=0.4.0
+
 # UI
 gradio==4.19.2
 
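The new `lm-eval>=0.4.0` pin is what provides the `lm_eval` console script that `run_evaluation` spawns via `subprocess.Popen`. A quick sanity check that the entry point is actually on `PATH` before the Space shells out to it; a hedged convenience snippet, not part of the commit:

```python
import shutil

# lm-eval >= 0.4.0 installs an `lm_eval` console script; fail early if missing.
if shutil.which("lm_eval") is None:
    raise RuntimeError("lm_eval CLI not found - is lm-eval>=0.4.0 installed?")
```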