aeb56 committed
Commit 0b25a32 · 1 Parent(s): d7f07c2

Add live status table and improved logging with attn_implementation=eager fix

Files changed (1)
app.py +92 -28
app.py CHANGED
@@ -20,6 +20,34 @@ class ChatBot:
         self.tokenizer = None
         self.loaded = False
 
+    def _create_status_table(self, tasks, status="⏳ Waiting", results=None):
+        """Create a markdown table showing evaluation status"""
+        table = "## 📊 Evaluation Progress\n\n"
+        table += "| Benchmark | Status | Score | Details |\n"
+        table += "|-----------|--------|-------|----------|\n"
+
+        for task in tasks:
+            task_status = status
+            task_score = "-"
+            task_details = ""
+
+            if results and task in results:
+                task_status = "✅ Complete"
+                if task == "ARC-Challenge" and "arc_challenge" in results[task]:
+                    score_data = results[task]["arc_challenge"]
+                    task_score = f"{score_data.get('acc_norm', 0):.2%}"
+                    task_details = f"acc: {score_data.get('acc', 0):.2%}"
+                elif task == "TruthfulQA" and "truthfulqa_mc2" in results[task]:
+                    score_data = results[task]["truthfulqa_mc2"]
+                    task_score = f"{score_data.get('acc', 0):.2%}"
+                elif task == "Winogrande" and "winogrande" in results[task]:
+                    score_data = results[task]["winogrande"]
+                    task_score = f"{score_data.get('acc', 0):.2%}"
+
+            table += f"| {task} | {task_status} | {task_score} | {task_details} |\n"
+
+        return table
+
     def load_model(self):
         if self.loaded:
             return "✅ Model already loaded!"
@@ -140,11 +168,15 @@ class ChatBot:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             output_dir = f"/tmp/eval_results_{timestamp}"
 
-            yield f"🔄 **Preparing for evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\n"
+            # Initial status table
+            status_table = self._create_status_table(tasks_to_run, "⏳ Preparing")
+            logs = "🔄 **Preparing for evaluation...**\n\nTasks: " + ", ".join(tasks_to_run) + "\n\n"
+            yield status_table, logs
 
             # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
             if self.loaded and self.model is not None:
-                yield f"🔄 **Unloading model to free VRAM...**\n\nThis is necessary because lm_eval will load its own instance.\n\n"
+                logs += "🔄 **Unloading model to free VRAM...**\n\n"
+                yield status_table, logs
 
                 if self.model is not None:
                     del self.model
@@ -155,7 +187,8 @@ class ChatBot:
 
                 self.loaded = False
             else:
-                yield f"🔄 **Cleaning up memory...**\n\nPreparing environment for evaluation.\n\n"
+                logs += "🔄 **Cleaning up memory...**\n\n"
+                yield status_table, logs
 
             # Aggressive memory cleanup
             import gc
@@ -170,27 +203,35 @@ class ChatBot:
                     torch.cuda.reset_accumulated_memory_stats(device=i)
 
             # Wait for memory to be fully released
-            yield f"🔄 **Waiting for memory cleanup...**\n\nGiving the system time to fully release VRAM.\n\n"
+            logs += "🔄 **Waiting for memory cleanup (5s)...**\n\n"
+            yield status_table, logs
             time.sleep(5)
 
             # Final garbage collection
             gc.collect()
 
-            yield f"✅ **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"
+            status_table = self._create_status_table(tasks_to_run, "🔄 Loading Model")
+            logs += "✅ **Memory cleared! Starting evaluation...**\n\n"
+            logs += f"⏱️ Estimated time: 30-60 minutes\n\n"
+            yield status_table, logs
 
             # Run lm_eval with optimized memory settings
             # Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
+            # attn_implementation=eager is required because flash attention isn't properly installed
             cmd = [
                 "lm_eval",
                 "--model", "hf",
-                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
+                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True,attn_implementation=eager",
                 "--tasks", task_string,
                 "--batch_size", "1", # Reduced to minimize memory usage
                 "--output_path", output_dir,
                 "--log_samples"
             ]
 
-            yield f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"
+            status_table = self._create_status_table(tasks_to_run, "🔄 Running")
+            logs += f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\n"
+            logs += "---\n\n### 📜 Live Logs (last 15 lines):\n\n```\n"
+            yield status_table, logs
 
             # Run evaluation
             process = subprocess.Popen(
@@ -202,16 +243,24 @@ class ChatBot:
             )
 
             output_lines = []
+            log_update_counter = 0
             for line in process.stdout:
                 output_lines.append(line)
-                # Show last 20 lines
-                recent = ''.join(output_lines[-20:])
-                yield f"🔄 **Running evaluation...**\n\n```\n{recent}\n```"
+                log_update_counter += 1
+
+                # Update every 5 lines to reduce UI flickering
+                if log_update_counter % 5 == 0:
+                    recent = ''.join(output_lines[-15:])
+                    current_logs = logs + recent + "\n```"
+                    yield status_table, current_logs
 
             process.wait()
 
             if process.returncode != 0:
-                yield f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\nLogs:\n```\n{''.join(output_lines[-50:])}\n```"
+                status_table = self._create_status_table(tasks_to_run, "❌ Failed")
+                error_logs = logs + ''.join(output_lines[-50:]) + "\n```\n\n"
+                error_logs += f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n"
+                yield status_table, error_logs
                 return
 
             # Read results
@@ -220,39 +269,53 @@ class ChatBot:
                 with open(results_file, 'r') as f:
                     results = json.load(f)
 
-                # Format results
-                result_text = "✅ **Evaluation Complete!**\n\n"
-                result_text += f"**Timestamp:** {timestamp}\n\n"
-                result_text += "## 📊 Results:\n\n"
+                # Parse results for status table
+                parsed_results = {}
+                for task in tasks_to_run:
+                    task_key = task_map[task]
+                    if task_key in results['results']:
+                        parsed_results[task] = {task_key: results['results'][task_key]}
+
+                # Update status table with results
+                status_table = self._create_status_table(tasks_to_run, "✅ Complete", parsed_results)
+
+                # Format detailed results
+                result_logs = "✅ **Evaluation Complete!**\n\n"
+                result_logs += f"**Timestamp:** {timestamp}\n\n"
+                result_logs += "## 📊 Detailed Results:\n\n"
 
                 for task in selected_tasks:
                     if task in results['results']:
                         task_results = results['results'][task]
-                        result_text += f"### {task}\n"
+                        result_logs += f"### {task}\n"
                         for metric, value in task_results.items():
                             if isinstance(value, float):
-                                result_text += f"- **{metric}:** {value:.4f}\n"
+                                result_logs += f"- **{metric}:** {value:.4f}\n"
                             else:
-                                result_text += f"- **{metric}:** {value}\n"
-                        result_text += "\n"
+                                result_logs += f"- **{metric}:** {value}\n"
+                        result_logs += "\n"
 
                 # Add summary if available
                 if 'summary' in results:
-                    result_text += "## 📈 Summary:\n\n"
+                    result_logs += "## 📈 Summary:\n\n"
                     for metric, value in results['summary'].items():
                         if isinstance(value, float):
-                            result_text += f"- **{metric}:** {value:.4f}\n"
+                            result_logs += f"- **{metric}:** {value:.4f}\n"
                         else:
-                            result_text += f"- **{metric}:** {value}\n"
+                            result_logs += f"- **{metric}:** {value}\n"
 
-                result_text += f"\n\n**Full results saved to:** `{output_dir}`"
+                result_logs += f"\n\n**Full results saved to:** `{output_dir}`"
 
-                yield result_text
+                yield status_table, result_logs
             else:
-                yield f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
+                status_table = self._create_status_table(tasks_to_run, "⚠️ Unknown")
+                warning_logs = f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
+                yield status_table, warning_logs
 
         except Exception as e:
-            yield f"❌ **Evaluation error:**\n\n{str(e)}"
+            status_table = self._create_status_table(tasks_to_run if 'tasks_to_run' in locals() else [], "❌ Error")
+            error_logs = f"❌ **Evaluation error:**\n\n{str(e)}"
+            yield status_table, error_logs
 
 # Initialize
 bot = ChatBot()
@@ -351,7 +414,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation")
             """)
 
         with gr.Column(scale=2):
-            eval_results = gr.Markdown("Results will appear here after evaluation completes.")
+            eval_status = gr.Markdown("## 📊 Evaluation Progress\n\nClick '🚀 Start Evaluation' to begin.", height=200)
+            eval_logs = gr.Markdown("### 📜 Logs\n\nLogs will appear here during evaluation.", height=500)
 
     gr.Markdown("""
     ---
@@ -384,7 +448,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation")
     """
 
     # Evaluation event handler
-    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)
+    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=[eval_status, eval_logs])
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
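
Below is a minimal, self-contained sketch (an illustration, not part of this commit) of the Gradio pattern the change relies on: a generator callback that yields a `(status_markdown, logs_markdown)` tuple so a status table and a log pane refresh together while a long job runs. The names here (`status_box`, `log_box`, `run_fake_eval`, `status_table`) are placeholders; in the real app, `bot.run_evaluation` is wired to `eval_status` and `eval_logs` as shown in the diff.

```python
# Sketch of streaming two Markdown outputs from one generator callback.
# Assumes Gradio >= 4; all names below are illustrative, not from app.py.
import time
import gradio as gr

TASKS = ["ARC-Challenge", "TruthfulQA", "Winogrande"]

def status_table(status_by_task):
    # Same idea as _create_status_table in the commit: render progress as a table.
    rows = "\n".join(f"| {t} | {s} |" for t, s in status_by_task.items())
    return "## Evaluation Progress\n\n| Benchmark | Status |\n|---|---|\n" + rows

def run_fake_eval():
    statuses = {t: "⏳ Waiting" for t in TASKS}
    logs = "Starting...\n"
    yield status_table(statuses), logs      # first yield paints the initial state
    for task in TASKS:
        statuses[task] = "🔄 Running"
        logs += f"running {task}\n"
        yield status_table(statuses), logs  # each yield refreshes both outputs
        time.sleep(1)                       # stand-in for the real lm_eval run
        statuses[task] = "✅ Complete"
    yield status_table(statuses), logs + "done\n"

with gr.Blocks() as demo:
    status_box = gr.Markdown("Click start.")
    log_box = gr.Markdown("Logs appear here.")
    start = gr.Button("🚀 Start Evaluation")
    # Two outputs, so every yielded tuple updates the table and the log pane together.
    start.click(run_fake_eval, inputs=None, outputs=[status_box, log_box])

if __name__ == "__main__":
    demo.launch()
```

Each `yield` repaints both `gr.Markdown` components, which is why the commit switches `run_evaluation` from yielding a single string to yielding two values and registers `outputs=[eval_status, eval_logs]` on the button click.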