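"""OpenAI-compatible service over RabbitMQ, backed by a local vLLM engine and
fronted by a small Gradio UI (ping, startup status, and a GPU probe)."""
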
import asyncio
import logging

import gradio as gr

from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from oa_server import OpenAIServers
from vllm_backend import VLLMChatBackend, StubImagesBackend

from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

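# Console logging so startup and handler activity are visible in the container logs.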
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
log = logging.getLogger("app")

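
# `@spaces.GPU` probe: if the `spaces` package (Hugging Face ZeroGPU) cannot be
# imported, fall back to a CPU-only stub so the probe button still works.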
try:
    import spaces

    @spaces.GPU(duration=60)
    def gpu_entrypoint() -> str:
        return "gpu: ready"
except Exception:
    def gpu_entrypoint() -> str:
        return "gpu: not available (CPU only)"

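
# Single shared vLLM engine instance, created lazily at startup.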
vllm_engine: AsyncLLMEngine | None = None


async def init_vllm():
    """Initialize vLLM engine with a Hugging Face model."""
    global vllm_engine
    if vllm_engine is not None:
        return vllm_engine

    model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
    log.info(f"Loading vLLM model: {model_id}")

    args = AsyncEngineArgs(
        model=model_id,
        trust_remote_code=True,
        max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
    )
    vllm_engine = AsyncLLMEngine.from_engine_args(args)
    return vllm_engine

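
# RabbitMQ wiring: `publisher` sends outgoing messages, `base` owns the connection.
# The resolver declares "oa."-prefixed exchanges as direct and falls back to
# settings.RABBIT_EXCHANGE_TYPE for everything else.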
publisher = RabbitRepo(external_source="openai.mq.server")
resolver = (lambda name: "direct" if name.startswith("oa.") else settings.RABBIT_EXCHANGE_TYPE)
base = RabbitBase(exchange_type_resolver=resolver)

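# OpenAI-style request servers: chat completions run on the local vLLM backend,
# image generation is stubbed out.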
servers = OpenAIServers(
    publisher,
    chat_backend=VLLMChatBackend(),
    images_backend=StubImagesBackend(),
)

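# Map listener function names (see FuncName in DECLS below) to their handlers.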
handlers = {
    "oaChatCreate": servers.handle_chat_create,
    "oaImagesGenerate": servers.handle_images_generate,
}

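# Exchange declarations for the listener: one exchange per operation, each bound
# to the configured routing key.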
DECLS = [
    {"ExchangeName": "oa.chat.create", "FuncName": "oaChatCreate",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": "oa.images.generate", "FuncName": "oaImagesGenerate",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
]

listener = RabbitListenerBase(base, instance_name=settings.RABBIT_INSTANCE_NAME, handlers=handlers)

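
# Runs once when the Gradio UI loads: start the vLLM engine, connect to RabbitMQ,
# and begin consuming the declared exchanges.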
async def _startup_init():
    try:
        await init_vllm()
        await base.connect()
        await listener.start(DECLS)
        return "OpenAI MQ + vLLM: ready"
    except Exception as e:
        log.exception("Startup init failed")
        return f"ERROR: {e}"


async def ping():
    return "ok"

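
# Minimal Gradio UI: a Service tab with ping and startup status, plus a tab that
# probes @spaces.GPU availability.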
with gr.Blocks(title="OpenAI over RabbitMQ (local vLLM)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## OpenAI-compatible over RabbitMQ - using vLLM locally inside the Space")
    with gr.Tabs():
        with gr.Tab("Service"):
            btn = gr.Button("Ping")
            out = gr.Textbox(label="Ping result")
            btn.click(ping, inputs=None, outputs=out)
            init_status = gr.Textbox(label="Startup status", interactive=False)
            demo.load(fn=_startup_init, inputs=None, outputs=init_status)

        with gr.Tab("@spaces.GPU Probe"):
            gpu_btn = gr.Button("GPU Ready Probe", variant="primary")
            gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
            gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)


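# Standard Gradio launch; port 7860 is the default port Hugging Face Spaces expects.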
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)