| | |
| | |
| |
|
| | |
| | FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base |
| |
|
| | |
| | ENV DEBIAN_FRONTEND=noninteractive \ |
| | PYTHONUNBUFFERED=1 \ |
| | CUDA_HOME=/usr/local/cuda \ |
| | TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \ |
| | FORCE_CUDA=1 \ |
| | MAX_JOBS=8 |
| |
|
| | |
| | RUN apt-get update && apt-get install -y \ |
| | python3.10 \ |
| | python3-pip \ |
| | python3.10-dev \ |
| | git \ |
| | wget \ |
| | curl \ |
| | vim \ |
| | build-essential \ |
| | cmake \ |
| | ninja-build \ |
| | ccache \ |
| | libssl-dev \ |
| | libffi-dev \ |
| | libjpeg-dev \ |
| | libpng-dev \ |
| | libgomp1 \ |
| | && rm -rf /var/lib/apt/lists/* |
| |
|
| | |
| | RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel |
| |
|
| | |
# --- Stage 2: builder ------------------------------------------------------
# Installs every Python dependency on top of the devel base; the runtime
# stage later copies the resulting site-packages wholesale.
FROM base AS builder

WORKDIR /build

# PyTorch stack pinned to the cu121 wheel index so the binaries match the
# CUDA 12.1 toolkit in the base image.
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121

# Inference / model-loading stack (vLLM engine plus HF ecosystem), all
# version-pinned for reproducible builds.
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4

# Scientific computing and API-serving dependencies.
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    uvicorn[standard]==0.29.0 \
    aiohttp==3.9.3 \
    ray[default]==2.10.0

# Monitoring / hardware-introspection utilities.
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0
| | |
| | |
# --- Stage 3: runtime ------------------------------------------------------
# Slim CUDA runtime image (no compilers/headers); Python packages are copied
# in from the builder stage.
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Serving defaults — all overridable at `docker run -e ...` time.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1

# Minimal runtime OS deps: python, curl (healthcheck), libgomp (BLAS/torch).
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Bring over the fully-resolved Python environment and console scripts.
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

WORKDIR /app

RUN mkdir -p /models/helion /app/inference /app/logs /app/cache

COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/

RUN chmod +x /app/inference/*.py

# Run as a non-root user; ownership must cover both app code and model dir.
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models

USER helion

# HEALTHCHECK shell form expands ${PORT} at check time.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1

EXPOSE 8000 8001 8002

# Shell form (not JSON exec form) is required here: exec form performs no
# variable substitution, so ["--model", "${MODEL_PATH}"] would pass the
# literal string "${MODEL_PATH}" to the server. `exec` makes python3 replace
# the shell as PID 1 so it receives SIGTERM directly on container stop.
CMD exec python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"

LABEL maintainer="DeepXR Team" \
    version="2.5.0-rnd" \
    description="Helion-2.5 Research & Development Model - Advanced Language Model" \
    model="DeepXR/Helion-2.5-Rnd" \
    license="Apache-2.0"