import logging
import time
from typing import Any, Dict, AsyncIterable

from transformers import AutoTokenizer, AutoModelForCausalLM

from backends_base import ChatBackend, ImagesBackend
from config import settings

logger = logging.getLogger(__name__)

try:
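    # `spaces` provides the ZeroGPU decorator and is only importable when the
    # app runs on Hugging Face Spaces; otherwise we fall back to local execution.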
    import spaces
except ImportError:
    spaces = None


class TransformersChatBackend(ChatBackend):
    """
    Lightweight backend for Hugging Face Spaces (ZeroGPU).
    Reloads the model on every request using Transformers rather than vLLM.
    """

    async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
        messages = request.get("messages", [])
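        # Only the content of the most recent message is used as the prompt;
        # no chat template or earlier conversation turns are applied.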
        prompt = messages[-1]["content"] if messages else "(empty)"

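        # Per-request values take precedence; otherwise fall back to the
        # defaults configured in `settings`.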
        model_id = request.get("model") or settings.LlmHFModelID
        temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
        max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))

        rid = f"chatcmpl-transformers-{int(time.time())}"
        now = int(time.time())

        if spaces:
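            # On Spaces, wrap the per-request load-and-generate call in the
            # ZeroGPU decorator so a GPU is attached only while it runs.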
            @spaces.GPU(duration=300)
            def run_once(prompt: str) -> str:
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    do_sample=True,
                )
                # Drop the prompt tokens so only newly generated text is returned.
                new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
                return tokenizer.decode(new_tokens, skip_special_tokens=True)
        else:
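            # Local fallback without the `spaces` package: same logic, but the
            # model stays on the default device (typically CPU).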
            def run_once(prompt: str) -> str:
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(model_id)

                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    do_sample=True,
                )
                # Drop the prompt tokens so only newly generated text is returned.
                new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
                return tokenizer.decode(new_tokens, skip_special_tokens=True)

        try:
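            # Run inference synchronously and emit a single OpenAI-style
            # "chat.completion.chunk"; no token-level streaming happens here.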
            text = run_once(prompt)
            yield {
                "id": rid,
                "object": "chat.completion.chunk",
                "created": now,
                "model": model_id,
                "choices": [
                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
                ],
            }
        except Exception:
            logger.exception("Transformers inference failed")
            raise


class StubImagesBackend(ImagesBackend):
    """
    Image generation stub: returns a transparent PNG placeholder.
    """

    async def generate_b64(self, request: Dict[str, Any]) -> str:
        logger.warning("Image generation not supported in Transformers backend.")
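        # Base64 of a 1x1 transparent PNG used as a placeholder image.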
        return (
            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="
        )
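

# A minimal manual smoke test: a sketch only, assuming this module can be run
# directly, that `TransformersChatBackend` can be instantiated with no arguments,
# and that `config.settings` supplies usable defaults. The model id below is an
# illustrative stand-in, not something the backend requires.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        backend = TransformersChatBackend()
        request = {
            "model": "sshleifer/tiny-gpt2",  # hypothetical: any small causal LM id works
            "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
            "max_tokens": 32,
            "temperature": 0.7,
        }
        async for chunk in backend.stream(request):
            print(chunk["choices"][0]["delta"]["content"])

    asyncio.run(_demo())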