"""
GPT-OSS Model Deployment on Modal with vLLM
This script deploys OpenAI's GPT-OSS models (20B or 120B) on Modal.com
with vLLM for efficient inference.
Usage:
# First time setup - pre-download model weights (run once, takes ~5-10 min)
modal run gpt_oss_inference.py::download_model
# Test the server locally
modal run gpt_oss_inference.py
# Deploy to production
modal deploy gpt_oss_inference.py
Performance Tips:
1. Run download_model first to cache weights in the volume
2. Reduce MAX_MODEL_LEN for faster startup (8k is sufficient for most use cases)
3. Keep FAST_BOOT=True for cheaper GPUs (A10G, L4)
4. Increase SCALEDOWN_WINDOW to reduce cold starts during demos
Based on: https://modal.com/docs/examples/gpt_oss_inference
"""
import json
import time
from datetime import datetime, timezone
from typing import Any
import aiohttp
import modal
# =============================================================================
# Container Image Configuration
# =============================================================================
# Enable HF Transfer for faster model downloads (5-10x faster)
vllm_image = (
modal.Image.from_registry(
"nvidia/cuda:12.8.1-devel-ubuntu22.04",
add_python="3.12",
)
.entrypoint([])
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # Enable fast downloads
.uv_pip_install(
"vllm==0.11.0",
"huggingface_hub[hf_transfer]==0.35.0",
"flashinfer-python==0.3.1",
)
)
# =============================================================================
# Model Configuration
# =============================================================================
# Choose the model size - 20B is faster, 120B has more capabilities
MODEL_NAME = "openai/gpt-oss-20b" # or "openai/gpt-oss-120b"
MODEL_REVISION = "d666cf3b67006cf8227666739edf25164aaffdeb"
# =============================================================================
# GPU Configuration - CHOOSE YOUR GPU TIER
# =============================================================================
#
# Modal GPU Pricing (approximate, per hour):
# ┌─────────────┬──────────┬────────────────────────────────────────────┐
# │ GPU │ Price/hr │ Notes │
# ├─────────────┼──────────┼────────────────────────────────────────────┤
# │ T4 (16GB) │ ~$0.25 │ ❌ Too small for GPT-OSS │
# │ L4 (24GB) │ ~$0.59 │ ⚠️ Tight fit, may work with 20B │
# │ A10G (24GB) │ ~$0.76 │ ✅ Good balance for 20B model │
# │ A100 40GB │ ~$1.79 │ ✅ Comfortable for 20B │
# │ A100 80GB │ ~$2.78 │ ✅ Works for both 20B and 120B │
# │ H100 (80GB) │ ~$3.95 │ ✅ Best performance, both models │
# └─────────────┴──────────┴────────────────────────────────────────────┘
#
# GPT-OSS 20B with MXFP4 quantization needs ~10-15GB VRAM
# GPT-OSS 120B needs ~60GB VRAM (fits on a single 80GB GPU)
# Choose your GPU - uncomment the one you want to use:
GPU_CONFIG = "A100-40GB" # ~$0.76/hr - RECOMMENDED for budget (works with 20B)
# GPU_CONFIG = "L4" # ~$0.59/hr - Cheapest option (may be tight)
# GPU_CONFIG = "A100" # ~$1.79/hr - More headroom (40GB version)
# GPU_CONFIG = "H100" # ~$3.95/hr - Maximum performance
# =============================================================================
# Volume Configuration for Caching
# =============================================================================
# Cache for HuggingFace model weights
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
# Cache for vLLM compilation artifacts
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
# =============================================================================
# Performance Configuration
# =============================================================================
MINUTES = 60 # Helper constant
# FAST_BOOT = True: Faster startup but slower inference
# FAST_BOOT = False: Slower startup but faster inference (recommended for production)
FAST_BOOT = True # Use True for cheaper GPUs to reduce startup memory
# CUDA graph capture sizes for optimized inference
CUDA_GRAPH_CAPTURE_SIZES = [1, 2, 4, 8, 16, 24, 32]
# Data type configuration
# NOTE: GPT-OSS uses MXFP4 quantization which REQUIRES bfloat16 - float16 is NOT supported
# The Marlin kernel warning on A10G/L4 is expected and can be ignored
USE_FLOAT16 = False # Must be False for GPT-OSS (MXFP4 only supports bfloat16)
# Maximum model length (context window) - SIGNIFICANTLY REDUCED for faster startup
# The KV cache allocation is proportional to context length, so smaller = much faster startup
# For EU AI Act assessments, 8k-16k tokens is more than enough
# GPT-OSS 20B supports up to 128k tokens, but we only need ~8k for our use case
MAX_MODEL_LEN = 16384 # 16k tokens - sufficient for compliance assessments, 4x faster startup
# Server configuration
VLLM_PORT = 8000
N_GPU = 1 # Number of GPUs for tensor parallelism
MAX_INPUTS = 50 # Reduced for smaller GPUs
# Keep container warm longer to avoid cold starts (costs more but faster response)
# For hackathon demo: 10 minutes to reduce cold starts during presentation
SCALEDOWN_WINDOW = 10 * MINUTES # Increased for demo stability
# =============================================================================
# Modal App Definition
# =============================================================================
app = modal.App("gpt-oss-vllm-inference")
# Select GPU based on GPU_CONFIG
_GPU_MAP = {
"T4": "T4",
"L4": "L4",
"A10G": "A10G",
"A100": "A100:40GB",
"A100-80GB": "A100:80GB",
"H100": "H100",
}
SELECTED_GPU = _GPU_MAP.get(GPU_CONFIG, "A10G")
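# Note: Modal also accepts a ":<count>" suffix for multi-GPU containers, e.g.
# gpu="H100:2" together with N_GPU = 2 below for 2-way tensor parallelism (relevant
# mainly for the 120B model). This script assumes a single GPU throughout.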
# =============================================================================
# Pre-download Model Weights (reduces warm start time significantly)
# =============================================================================
@app.function(
image=vllm_image,
volumes={"/root/.cache/huggingface": hf_cache_vol},
timeout=30 * MINUTES,
)
def download_model():
"""
Pre-download the model weights to the volume cache.
Run this once with: modal run gpt_oss_inference.py::download_model
This will cache the weights and make subsequent starts much faster.
"""
from huggingface_hub import snapshot_download
print(f"📥 Downloading model weights for {MODEL_NAME}...")
print(f" Revision: {MODEL_REVISION}")
    # Download into the default HF hub cache (/root/.cache/huggingface/hub), which is
    # backed by the mounted volume; a custom local_dir would bypass the hub cache
    # layout that vLLM expects to find at startup.
    snapshot_download(
        MODEL_NAME,
        revision=MODEL_REVISION,
    )
print("✅ Model weights downloaded and cached!")
print(" Future container starts will use the cached weights.")
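# Optional sketch: sanity-check what landed in the volume after download_model has
# run. `scan_cache_dir` is part of huggingface_hub and reads the default hub cache,
# which here is backed by hf_cache_vol; this helper is an addition, not required
# for serving.
@app.function(
    image=vllm_image,
    volumes={"/root/.cache/huggingface": hf_cache_vol},
)
def inspect_cache():
    """Print cached model repos and their approximate on-disk size."""
    from huggingface_hub import scan_cache_dir

    for repo in scan_cache_dir().repos:
        print(f"{repo.repo_id}: {repo.size_on_disk / 1e9:.1f} GB")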
@app.function(
image=vllm_image,
gpu=SELECTED_GPU,
scaledown_window=SCALEDOWN_WINDOW,
timeout=30 * MINUTES,
volumes={
"/root/.cache/huggingface": hf_cache_vol,
"/root/.cache/vllm": vllm_cache_vol,
},
)
@modal.concurrent(max_inputs=MAX_INPUTS)
@modal.web_server(port=VLLM_PORT, startup_timeout=30 * MINUTES)
def serve():
"""Start the vLLM server with GPT-OSS model."""
import subprocess
cmd = [
"vllm",
"serve",
"--uvicorn-log-level=info",
MODEL_NAME,
"--revision",
MODEL_REVISION,
"--served-model-name",
"llm", # Serve model as "llm" - this is what clients expect
"--host",
"0.0.0.0",
"--port",
str(VLLM_PORT),
]
# enforce-eager disables both Torch compilation and CUDA graph capture
# default is no-enforce-eager. see the --compilation-config flag for tighter control
cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
if not FAST_BOOT: # CUDA graph capture is only used with `--no-enforce-eager`
cmd += [
"-O.cudagraph_capture_sizes="
+ str(CUDA_GRAPH_CAPTURE_SIZES).replace(" ", "")
]
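    # (with the defaults above, this appends -O.cudagraph_capture_sizes=[1,2,4,8,16,24,32])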
    # Data type: GPT-OSS's MXFP4 quantization only supports bfloat16 (see USE_FLOAT16
    # above), so this branch stays on bfloat16. The float16 path only applies if you
    # swap in a different, non-MXFP4 model on an Ampere-class GPU (A10G/L4).
    if USE_FLOAT16:
        cmd += ["--dtype", "float16"]
    else:
        cmd += ["--dtype", "bfloat16"]
# Limit context length to speed up startup and reduce memory allocation
cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
# Disable custom all-reduce for single GPU (reduces startup overhead)
if N_GPU == 1:
cmd += ["--disable-custom-all-reduce"]
# Enable prefix caching for faster subsequent requests
cmd += ["--enable-prefix-caching"]
# Trust remote code for GPT-OSS models
cmd += ["--trust-remote-code"]
    # Let vLLM auto-detect the checkpoint format ("auto" is the default)
    cmd += ["--load-format", "auto"]
# assume multiple GPUs are for splitting up large matrix multiplications
cmd += ["--tensor-parallel-size", str(N_GPU)]
    # Disable vLLM's periodic stats logging (reduces log noise; note this is the
    # logging-stats flag, not usage-stats telemetry)
    cmd += ["--disable-log-stats"]
# Use swap space if needed (helps with memory pressure on smaller GPUs)
cmd += ["--swap-space", "4"] # 4GB swap space
print(f"Starting vLLM server with command: {' '.join(cmd)}")
subprocess.Popen(" ".join(cmd), shell=True)
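# Once deployed, the container exposes vLLM's OpenAI-compatible HTTP API. A quick
# smoke test from any shell (sketch; replace $URL with the URL printed by
# `modal deploy` or returned by serve.get_web_url()):
#
#   curl $URL/v1/models
#   curl $URL/v1/chat/completions -H "Content-Type: application/json" \
#        -d '{"model": "llm", "messages": [{"role": "user", "content": "Say hi"}]}'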
# =============================================================================
# Local Test Entrypoint
# =============================================================================
@app.local_entrypoint()
async def test(test_timeout=30 * MINUTES, user_content=None, twice=True):
"""
Test the deployed server with a sample prompt.
Args:
test_timeout: Maximum time to wait for server health
user_content: Custom prompt to send (default: SVD explanation)
twice: Whether to send a second request
"""
url = serve.get_web_url()
system_prompt = {
"role": "system",
"content": f"""You are ChatModal, a large language model trained by Modal.
Knowledge cutoff: 2024-06
Current date: {datetime.now(timezone.utc).date()}
Reasoning: low
# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.""",
}
if user_content is None:
user_content = "Explain what the Singular Value Decomposition is."
messages = [ # OpenAI chat format
system_prompt,
{"role": "user", "content": user_content},
]
async with aiohttp.ClientSession(base_url=url) as session:
print(f"Running health check for server at {url}")
async with session.get("/health", timeout=test_timeout - 1 * MINUTES) as resp:
up = resp.status == 200
assert up, f"Failed health check for server at {url}"
print(f"Successful health check for server at {url}")
print(f"Sending messages to {url}:", *messages, sep="\n\t")
await _send_request(session, "llm", messages)
if twice:
messages[0]["content"] += "\nTalk like a pirate, matey."
print(f"Re-sending messages to {url}:", *messages, sep="\n\t")
await _send_request(session, "llm", messages)
async def _send_request(
session: aiohttp.ClientSession, model: str, messages: list
) -> None:
"""Send a streaming request to the vLLM server."""
# `stream=True` tells an OpenAI-compatible backend to stream chunks
payload: dict[str, Any] = {"messages": messages, "model": model, "stream": True}
headers = {"Content-Type": "application/json", "Accept": "text/event-stream"}
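    # Streaming responses arrive as Server-Sent Events; each non-empty line looks
    # roughly like this (illustrative shape only, not captured output):
    #   data: {"object": "chat.completion.chunk", "choices": [{"delta": {"content": "..."}}]}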
t = time.perf_counter()
async with session.post(
"/v1/chat/completions", json=payload, headers=headers, timeout=10 * MINUTES
) as resp:
        resp.raise_for_status()
        async for raw in resp.content:
# extract new content and stream it
line = raw.decode().strip()
if not line or line == "data: [DONE]":
continue
if line.startswith("data: "): # SSE prefix
line = line[len("data: ") :]
chunk = json.loads(line)
assert (
chunk["object"] == "chat.completion.chunk"
) # or something went horribly wrong
delta = chunk["choices"][0]["delta"]
if "content" in delta:
print(delta["content"], end="") # print the content as it comes in
elif "reasoning_content" in delta:
print(delta["reasoning_content"], end="")
elif not delta:
print()
else:
raise ValueError(f"Unsupported response delta: {delta}")
print("")
print(f"Time to Last Token: {time.perf_counter() - t:.2f} seconds")
# =============================================================================
# Utility Functions
# =============================================================================
def get_endpoint_url() -> str:
"""Get the deployed endpoint URL."""
return serve.get_web_url()
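# Example (sketch): querying the deployed endpoint from a plain Python client. Assumes
# the `openai` package is installed on the calling side (it is not part of the
# container image above) and that the app has been deployed; the function name and
# arguments here are illustrative, not part of the original deployment.
def example_openai_client(base_url: str, prompt: str = "Hello!") -> str:
    """Send one non-streaming chat request to the vLLM server and return the reply."""
    from openai import OpenAI

    # vLLM ignores the API key unless the server is started with --api-key
    client = OpenAI(base_url=f"{base_url}/v1", api_key="EMPTY")
    response = client.chat.completions.create(
        model="llm",  # matches --served-model-name in serve()
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content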
if __name__ == "__main__":
print("Run this script with Modal:")
print(" modal run gpt_oss_inference.py # Test the server")
print(" modal deploy gpt_oss_inference.py # Deploy to production")