# transformers_backend.py
import logging
import time
from typing import Any, AsyncIterable, Dict

from transformers import AutoModelForCausalLM, AutoTokenizer

from backends_base import ChatBackend, ImagesBackend
from config import settings

logger = logging.getLogger(__name__)

try:
    import spaces
except ImportError:
    spaces = None


class TransformersChatBackend(ChatBackend):
    """
    Lightweight backend for Hugging Face Spaces (ZeroGPU).
    Reloads the model on every request using Transformers, not vLLM.
    """

    async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
        messages = request.get("messages", [])
        prompt = messages[-1]["content"] if messages else "(empty)"

        # Config-driven defaults
        model_id = request.get("model") or settings.LlmHFModelID
        temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
        max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))

        rid = f"chatcmpl-transformers-{int(time.time())}"
        now = int(time.time())

        # Run inside a ZeroGPU lease when the `spaces` package is available
        if spaces:

            @spaces.GPU(duration=300)
            def run_once(prompt: str) -> str:
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    do_sample=True,
                )
                return tokenizer.decode(outputs[0], skip_special_tokens=True)

        else:

            def run_once(prompt: str) -> str:
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(model_id)
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    do_sample=True,
                )
                return tokenizer.decode(outputs[0], skip_special_tokens=True)

        try:
            text = run_once(prompt)
            yield {
                "id": rid,
                "object": "chat.completion.chunk",
                "created": now,
                "model": model_id,
                "choices": [
                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
                ],
            }
        except Exception:
            logger.exception("Transformers inference failed")
            raise


class StubImagesBackend(ImagesBackend):
    """
    Image generation stub: returns a transparent PNG placeholder.
    """

    async def generate_b64(self, request: Dict[str, Any]) -> str:
        logger.warning("Image generation not supported in Transformers backend.")
        return (
            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="
        )
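

# --- Usage sketch (illustrative, an assumption rather than part of the backend API) ---
# A minimal way to consume the async stream defined above, assuming ChatBackend
# only requires the stream() coroutine implemented in this module. The request
# keys (`messages`, `max_tokens`) mirror what stream() reads; the names
# `_demo` and `demo_request` are hypothetical.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        backend = TransformersChatBackend()
        demo_request = {
            "messages": [{"role": "user", "content": "Say hello in one sentence."}],
            "max_tokens": 64,
        }
        # stream() yields a single OpenAI-style chunk containing the full reply.
        async for chunk in backend.stream(demo_request):
            print(chunk["choices"][0]["delta"]["content"])

    asyncio.run(_demo())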