Commit 11cacc3 · Parent(s): d76b941

hf_backend.py  (+9 −10)  CHANGED
@@ -34,7 +34,6 @@ except Exception as e:
 
 # ---------------- helpers ----------------
 def _pick_cpu_dtype() -> torch.dtype:
-    # Prefer BF16 if CPU supports it
     if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
         try:
             if torch.cpu.is_bf16_supported():
@@ -51,16 +50,14 @@ _MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
 
 
 def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
-
-
-
-    if effective_key in _MODEL_CACHE:
-        return _MODEL_CACHE[effective_key], dtype
+    key = (device, dtype)
+    if key in _MODEL_CACHE:
+        return _MODEL_CACHE[key], dtype
 
     cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
     if hasattr(cfg, "quantization_config"):
         logger.warning("Removing quantization_config from model config")
-        delattr(cfg, "quantization_config")
+        delattr(cfg, "quantization_config")
 
     eff_dtype = dtype
     try:
@@ -70,6 +67,7 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
             torch_dtype=dtype,
             trust_remote_code=True,
             device_map="auto" if device != "cpu" else {"": "cpu"},
+            low_cpu_mem_usage=False,  # ensure full load before casting
         )
     except Exception as e:
         if device == "cpu" and dtype == torch.bfloat16:
@@ -81,10 +79,14 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
                 torch_dtype=eff_dtype,
                 trust_remote_code=True,
                 device_map={"": "cpu"},
+                low_cpu_mem_usage=False,
             )
         else:
             raise
 
+    # --- Force recast to target dtype/device (fixes FP8 leftovers) ---
+    model = model.to(device=device, dtype=eff_dtype)
+
     model.eval()
     _MODEL_CACHE[(device, eff_dtype)] = model
     return model, eff_dtype
@@ -151,17 +153,14 @@ class HFChatBackend(ChatBackend):
             return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
         if spaces:
-            # Always dispatch via ZeroGPU decorator if available.
             @spaces.GPU(duration=120)
             def run_once(prompt: str) -> str:
                 if torch.cuda.is_available():
                     return _run_once(prompt, device="cuda", req_dtype=torch.float16)
-                # Fallback to CPU inside the GPU context if CUDA is unavailable
                 return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
 
             text = run_once(prompt)
         else:
-            # CPU-only runtime
             text = _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
 
         yield {
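For context, the substantive change in _get_model is that the cache is now keyed by the requested (device, dtype) pair, the model is loaded with low_cpu_mem_usage=False, and everything is explicitly recast with model.to(...) before caching, so parameters left in a quantized (e.g. FP8) dtype do not survive. Below is a minimal self-contained sketch of that pattern, not the backend's actual code: MODEL_ID is a placeholder, passing the patched config via config=cfg is an assumption (the visible hunks do not show that line), and the BF16-to-FP32 CPU fallback from the real function is omitted for brevity.

from typing import Dict, Tuple

import torch
from transformers import AutoConfig, AutoModelForCausalLM

MODEL_ID = "org/model"  # placeholder; the real id lives elsewhere in hf_backend.py

_MODEL_CACHE: Dict[Tuple[str, torch.dtype], AutoModelForCausalLM] = {}


def get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
    key = (device, dtype)                        # cache key now includes the requested dtype
    if key in _MODEL_CACHE:
        return _MODEL_CACHE[key], dtype

    cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
    if hasattr(cfg, "quantization_config"):      # strip any baked-in (e.g. FP8) quantization
        delattr(cfg, "quantization_config")

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        config=cfg,                              # assumed: the patched config is passed through
        torch_dtype=dtype,
        trust_remote_code=True,
        device_map="auto" if device != "cpu" else {"": "cpu"},  # "auto" needs accelerate
        low_cpu_mem_usage=False,                 # load fully before casting
    )

    # Force-recast everything to the target device/dtype so no quantized leftovers survive.
    model = model.to(device=device, dtype=dtype)
    model.eval()
    _MODEL_CACHE[key] = model
    return model, dtype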
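The last hunk only drops comments, but the dispatch path it sits in is worth spelling out: when the spaces package imports, generation runs inside a ZeroGPU-decorated closure and falls back to CPU if CUDA is still unavailable; without spaces it runs directly on CPU with whatever dtype _pick_cpu_dtype selects. A rough standalone sketch of that flow, with _run_once reduced to a stub standing in for the backend's real generation helper:

import torch

try:
    import spaces                      # present on Hugging Face ZeroGPU Spaces
except Exception:
    spaces = None


def _pick_cpu_dtype() -> torch.dtype:
    # Prefer BF16 when the CPU build exposes and confirms support, else FP32.
    if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
        try:
            if torch.cpu.is_bf16_supported():
                return torch.bfloat16
        except Exception:
            pass
    return torch.float32


def _run_once(prompt: str, device: str, req_dtype: torch.dtype) -> str:
    # Stub: stands in for the real generation helper in hf_backend.py.
    return f"[{device}/{req_dtype}] {prompt}"


def generate(prompt: str) -> str:
    if spaces:
        @spaces.GPU(duration=120)      # request a ZeroGPU slot for up to 120 s
        def run_once(p: str) -> str:
            if torch.cuda.is_available():
                return _run_once(p, device="cuda", req_dtype=torch.float16)
            # CUDA can still be unavailable inside the decorated call; fall back to CPU.
            return _run_once(p, device="cpu", req_dtype=_pick_cpu_dtype())
        return run_once(prompt)
    return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())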