johnbridges committed on
Commit 213e916 · 1 Parent(s): 11cacc3
Files changed (1)
  1. hf_backend.py +11 -11
hf_backend.py CHANGED
@@ -67,7 +67,7 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, t
             torch_dtype=dtype,
             trust_remote_code=True,
             device_map="auto" if device != "cpu" else {"": "cpu"},
-            low_cpu_mem_usage=False,  # ensure full load before casting
+            low_cpu_mem_usage=False,
         )
     except Exception as e:
         if device == "cpu" and dtype == torch.bfloat16:
@@ -84,8 +84,10 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, t
         else:
             raise
 
-    # --- Force recast to target dtype/device (fixes FP8 leftovers) ---
-    model = model.to(device=device, dtype=eff_dtype)
+    if device == "cpu":
+        model = model.to(device=device, dtype=eff_dtype)
+    else:
+        model = model.to(device=device)
 
     model.eval()
     _MODEL_CACHE[(device, eff_dtype)] = model
@@ -105,13 +107,11 @@ class HFChatBackend(ChatBackend):
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
 
-        # --- Inject X-IP-Token into global headers if ZeroGPU is used ---
         x_ip_token = request.get("x_ip_token")
         if x_ip_token and zero_client:
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
             logger.debug("Injected X-IP-Token into ZeroGPU headers")
 
-        # Build prompt using chat template if available
         if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
@@ -150,7 +150,11 @@ class HFChatBackend(ChatBackend):
                 use_cache=True,
             )
 
-            return tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Slice: keep only newly generated tokens
+            input_len = inputs["input_ids"].shape[-1]
+            generated_ids = outputs[0][input_len:]
+            text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+            return text
 
         if spaces:
             @spaces.GPU(duration=120)
@@ -169,17 +173,13 @@ class HFChatBackend(ChatBackend):
                 "created": now,
                 "model": MODEL_ID,
                 "choices": [
-                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
+                    {"index": 0, "delta": {"role": "assistant", "content": text}, "finish_reason": "stop"}
                 ],
             }
 
 
 # ---------------- Stub Images Backend ----------------
 class StubImagesBackend(ImagesBackend):
-    """
-    Stub backend for images since HFChatBackend is text-only.
-    Returns a transparent 1x1 PNG placeholder.
-    """
     async def generate_b64(self, request: Dict[str, Any]) -> str:
         logger.warning("Image generation not supported in HF backend.")
         return (
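For context, a minimal self-contained sketch (not part of the commit) of the prompt-slicing pattern the new return path uses: model.generate returns the prompt tokens followed by the continuation, so slicing at the prompt length before decoding keeps only the newly generated text. The tiny model ID below is an assumption chosen so the snippet runs quickly; the real backend loads MODEL_ID with the options shown in the first hunk.

# Illustration only (assumption: "sshleifer/tiny-gpt2" as a small stand-in model)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

inputs = tokenizer("Hello, how are", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=16)

# outputs[0] holds prompt tokens + generated tokens; drop the prompt before
# decoding so the returned string contains only the model's continuation.
input_len = inputs["input_ids"].shape[-1]
generated_ids = outputs[0][input_len:]
print(tokenizer.decode(generated_ids, skip_special_tokens=True).strip())

Decoding the full outputs[0], as the old return did, would echo the rendered chat-template prompt back to the client; the slice avoids that.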