johnbridges committed
Commit 2ad6a17 · 1 Parent(s): ba9c53a

added cpu support to hf_backend

Files changed (1)
  1. hf_backend.py +48 -39
hf_backend.py CHANGED
@@ -15,21 +15,19 @@ try:
 except ImportError:
     spaces, zero_client = None, None
 
-# --- Model setup (CPU-safe load, real inference on GPU only) ---
+# --- Model setup ---
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
-logger.info(f"Preloading tokenizer for {MODEL_ID} on CPU (ZeroGPU safe)...")
+logger.info(f"Preloading tokenizer for {MODEL_ID} on CPU...")
 
-tokenizer, model, load_error = None, None, None
+tokenizer, load_error = None, None
 try:
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
-    model = AutoModelForCausalLM.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32,  # dummy dtype for CPU preload
         trust_remote_code=True,
+        use_fast=False
     )
-    model.eval()
 except Exception as e:
-    load_error = f"Failed to load model/tokenizer: {e}"
+    load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
 
 
@@ -47,47 +45,58 @@ class HFChatBackend(ChatBackend):
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
 
-        if not spaces:
-            raise RuntimeError("ZeroGPU (spaces) is required but not available!")
-
-        # --- Inject X-IP-Token into global headers ---
+        # --- Inject X-IP-Token into global headers if ZeroGPU is used ---
         x_ip_token = request.get("x_ip_token")
         if x_ip_token and zero_client:
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
             logger.debug("Injected X-IP-Token into ZeroGPU headers")
 
-        # --- Define the GPU-only inference function ---
-        @spaces.GPU(duration=120)
-        def run_once(prompt: str) -> str:
-            device = "cuda"  # force CUDA
-            dtype = torch.float16
-
-            model.to(device=device, dtype=dtype).eval()
+        def _run_once(prompt: str, device: str, dtype: torch.dtype) -> str:
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID,
+                torch_dtype=dtype,
+                trust_remote_code=True,
+                device_map="auto" if device != "cpu" else {"": "cpu"},
+            )
+            model.eval()
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
-            with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    do_sample=True,
-                )
+            with torch.inference_mode():
+                if device != "cpu":
+                    autocast_ctx = torch.autocast(device_type=device, dtype=dtype)
+                else:
+                    autocast_ctx = torch.cpu.amp.autocast(dtype=dtype)
+
+                with autocast_ctx:
+                    outputs = model.generate(
+                        **inputs,
+                        max_new_tokens=max_tokens,
+                        temperature=temperature,
+                        do_sample=True,
+                    )
+
             return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        try:
+        if spaces:
+            # --- GPU path with ZeroGPU ---
+            @spaces.GPU(duration=120)
+            def run_once(prompt: str) -> str:
+                return _run_once(prompt, device="cuda", dtype=torch.float16)
+
             text = run_once(prompt)
-            yield {
-                "id": rid,
-                "object": "chat.completion.chunk",
-                "created": now,
-                "model": MODEL_ID,
-                "choices": [
-                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
-                ],
-            }
-        except Exception:
-            logger.exception("HF inference failed")
-            raise
+        else:
+            # --- CPU-only fallback ---
+            text = _run_once(prompt, device="cpu", dtype=torch.float32)
+
+        yield {
+            "id": rid,
+            "object": "chat.completion.chunk",
+            "created": now,
+            "model": MODEL_ID,
+            "choices": [
+                {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
+            ],
+        }
 
 
 # ---------------- Stub Images Backend ----------------
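
A minimal, standalone sketch of the CPU path this commit enables, useful for verifying generation locally outside the backend class. It reuses the same default MODEL_ID as hf_backend.py; the file name, prompt, max_new_tokens=64, temperature=0.7, and the use of contextlib.nullcontext() in place of torch.cpu.amp.autocast (which is effectively disabled for float32 on CPU) are illustrative assumptions, not part of this commit.

# cpu_smoke_test.py -- hypothetical helper, not part of this commit.
# Mirrors the _run_once() flow when ZeroGPU (spaces) is unavailable:
# load the model on CPU in float32, generate once, decode.
from contextlib import nullcontext

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"  # same default as hf_backend.py

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,   # full precision on CPU
    trust_remote_code=True,
    device_map={"": "cpu"},      # pin every module to CPU, as in _run_once
)
model.eval()

inputs = tokenizer("Say hello in one sentence.", return_tensors="pt").to("cpu")

# Autocast with float32 on CPU is effectively a no-op, so a nullcontext is the
# quieter equivalent of the torch.cpu.amp.autocast call used in the commit.
with torch.inference_mode(), nullcontext():
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,       # illustrative token budget
        temperature=0.7,         # illustrative sampling temperature
        do_sample=True,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))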