johnbridges committed on
Commit
1d79762
·
1 Parent(s): aa096cd
Files changed (1)
  1. hf_backend.py +27 -4
hf_backend.py CHANGED
@@ -3,7 +3,7 @@ import time, logging
 from typing import Any, Dict, AsyncIterable
 
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
 from backends_base import ChatBackend, ImagesBackend
 from config import settings
 
@@ -24,13 +24,27 @@ try:
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_ID,
         trust_remote_code=True,
-        use_fast=False
+        use_fast=False,
     )
 except Exception as e:
     load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
 
 
+# ---------------- helpers ----------------
+def _pick_cpu_dtype() -> torch.dtype:
+    # Prefer BF16 if CPU supports it
+    if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
+        try:
+            if torch.cpu.is_bf16_supported():
+                logger.info("CPU BF16 supported, using torch.bfloat16")
+                return torch.bfloat16
+        except Exception:
+            pass
+    logger.info("Falling back to torch.float32 on CPU")
+    return torch.float32
+
+
 # ---------------- Chat Backend ----------------
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
@@ -52,13 +66,21 @@ class HFChatBackend(ChatBackend):
             logger.debug("Injected X-IP-Token into ZeroGPU headers")
 
         def _run_once(prompt: str, device: str, dtype: torch.dtype) -> str:
+            # Load config and strip any quantization settings (fix FP8 issue)
+            cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
+            if hasattr(cfg, "quantization_config"):
+                logger.warning("Removing quantization_config from model config")
+                cfg.quantization_config = None
+
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
+                config=cfg,
                 torch_dtype=dtype,
                 trust_remote_code=True,
                 device_map="auto" if device != "cpu" else {"": "cpu"},
             )
             model.eval()
+
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
             with torch.inference_mode():
@@ -85,8 +107,9 @@ class HFChatBackend(ChatBackend):
 
             text = run_once(prompt)
         else:
-            # --- CPU-only fallback ---
-            text = _run_once(prompt, device="cpu", dtype=torch.float32)
+            # --- CPU-only fallback with auto dtype detection ---
+            dtype = _pick_cpu_dtype()
+            text = _run_once(prompt, device="cpu", dtype=dtype)
 
         yield {
             "id": rid,