aeb56 committed
Commit · b705945
1 Parent(s): ef25cbe
Workaround flash-attn: create fake module with PyTorch fallback attention

- app.py +55 -6
- requirements.txt +0 -3
app.py
CHANGED
@@ -215,15 +215,63 @@ class ChatBot:
         logs += f"⏱️ Estimated time: 30-60 minutes\n\n"
         yield status_table, logs
 
-        #
-        #
-
+        # Create a fake flash_attn package to avoid import errors
+        # This will fallback to standard PyTorch attention
+        fake_flash_dir = f"/tmp/flash_attn_{timestamp}"
+        os.makedirs(fake_flash_dir, exist_ok=True)
+
+        with open(os.path.join(fake_flash_dir, "__init__.py"), 'w') as f:
+            f.write("""
+# Fake flash_attn module that falls back to standard PyTorch attention
+import torch
+
+def flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False, **kwargs):
+    '''Fallback to standard PyTorch attention (slower but works without flash-attn)'''
+    if softmax_scale is None:
+        softmax_scale = 1.0 / (q.size(-1) ** 0.5)
+
+    # Standard attention: softmax(Q @ K.T) @ V
+    attn_weights = torch.matmul(q, k.transpose(-2, -1)) * softmax_scale
+
+    if causal:
+        seq_len = attn_weights.size(-1)
+        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=attn_weights.device), diagonal=1).bool()
+        attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))
+
+    attn_weights = torch.softmax(attn_weights, dim=-1)
+
+    if dropout_p > 0:
+        attn_weights = torch.nn.functional.dropout(attn_weights, p=dropout_p)
+
+    output = torch.matmul(attn_weights, v)
+    return output, None  # Return None for attention weights
+
+def flash_attn_varlen_func(*args, **kwargs):
+    return flash_attn_func(*args, **kwargs)
+
+__version__ = "2.5.0"
+""")
+
+        # Add fake package to Python path for subprocess
+        import sys
+        if f"/tmp" not in sys.path:
+            sys.path.insert(0, "/tmp")
+
+        # Set PYTHONPATH environment variable so subprocess can find fake flash_attn
+        env = os.environ.copy()
+        pythonpath = env.get('PYTHONPATH', '')
+        env['PYTHONPATH'] = f"/tmp:{pythonpath}" if pythonpath else "/tmp"
+
+        logs += "⚠️ **Note:** Using fallback PyTorch attention (slower than flash-attn)\n\n"
+        yield status_table, logs
+
+        # Run lm_eval
         cmd = [
             "lm_eval",
             "--model", "hf",
             "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
             "--tasks", task_string,
-            "--batch_size", "1",
+            "--batch_size", "1",
             "--output_path", output_dir,
             "--log_samples"
         ]

@@ -233,13 +281,14 @@ class ChatBot:
         logs += "---\n\n### 📜 Live Logs (last 15 lines):\n\n```\n"
         yield status_table, logs
 
-        # Run evaluation
+        # Run evaluation with custom environment
         process = subprocess.Popen(
             cmd,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
-            bufsize=1
+            bufsize=1,
+            env=env  # Pass custom environment with PYTHONPATH
         )
 
         output_lines = []
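The stub above reimplements attention with plain matmul/softmax. A minimal sanity check (not part of this commit; shapes and names are illustrative) is to compare that fallback against PyTorch's built-in torch.nn.functional.scaled_dot_product_attention, assuming tensors laid out as (batch, heads, seq_len, head_dim). Note that the real flash_attn_func expects (batch, seq_len, heads, head_dim), so this only exercises the stub's math, not flash-attn's calling convention.

# Illustrative check, not part of the commit: the fallback attention written
# into the fake __init__.py should match PyTorch's built-in SDPA when tensors
# are shaped (batch, heads, seq_len, head_dim).
import torch

def fallback_attn(q, k, v, causal=False):
    # Mirrors the stub: scaled QK^T, optional causal mask, softmax, then @ V
    scale = 1.0 / (q.size(-1) ** 0.5)
    attn = torch.matmul(q, k.transpose(-2, -1)) * scale
    if causal:
        seq_len = attn.size(-1)
        mask = torch.triu(torch.ones(seq_len, seq_len, device=attn.device), diagonal=1).bool()
        attn = attn.masked_fill(mask, float('-inf'))
    attn = torch.softmax(attn, dim=-1)
    return torch.matmul(attn, v)

q, k, v = (torch.randn(2, 4, 16, 32) for _ in range(3))
out = fallback_attn(q, k, v, causal=True)
ref = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(out, ref, atol=1e-5))  # expected: True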
requirements.txt
CHANGED
@@ -10,9 +10,6 @@ triton>=3.0.0
 # Flash Linear Attention (required by Kimi model)
 git+https://github.com/sustcsonglin/flash-linear-attention.git@main
 
-# Flash Attention (required for attention layers)
-flash-attn>=2.5.0
-
 # Evaluation
 lm-eval>=0.4.0
 
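With flash-attn dropped from requirements.txt, the lm_eval subprocess depends entirely on the PYTHONPATH stub. A quick, hedged way to confirm a fresh interpreter resolves import flash_attn from the stub is sketched below (illustrative only; the commit writes the package to a timestamped directory such as /tmp/flash_attn_<timestamp>, so the import only succeeds if a directory literally named flash_attn sits on PYTHONPATH).

# Illustrative check, not part of the commit: confirm that `import flash_attn`
# in a child process resolves to the stub instead of raising ImportError.
import os
import subprocess
import sys

env = os.environ.copy()
env["PYTHONPATH"] = "/tmp"  # directory expected to contain flash_attn/__init__.py

result = subprocess.run(
    [sys.executable, "-c",
     "import flash_attn; print(flash_attn.__version__, flash_attn.__file__)"],
    capture_output=True, text=True, env=env,
)
# Expect something like "2.5.0 /tmp/flash_attn/__init__.py" if the stub is found
print(result.stdout or result.stderr)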