# vLLM 0.20.0 for DGX Spark GB10 (aarch64 + Blackwell sm_121a + CUDA 13.0)
# Single-stage build. Compile vLLM and serve inference from one image.

# docker buildx build \
#     --network=host \
#     --build-arg HTTP_PROXY=http://127.0.0.1:7897 \
#     --build-arg HTTPS_PROXY=http://127.0.0.1:7897 \
#     -t i-dgx-spark-gb10:vllm0.20.0.build \
#     --load .

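# Run the image produced by the build command above (the image reference must
# match the tag passed to `-t`).
# Note: with --network=host Docker ignores -p; the mapping is kept here only to
# document the serving port.
# docker run \
#     --rm -it \
#     --gpus all \
#     --ipc=host \
#     --network=host \
#     --name ${BASE_MODEL} \
#     -v $(pwd)/${MODEL_PATH}:/models/${BASE_MODEL} \
#     -p 4321:4321 \
#     --entrypoint vllm \
#     i-dgx-spark-gb10:vllm0.20.0.build \
#         serve /models/${BASE_MODEL} \
#         --served-model-name ${BASE_MODEL} \
#         --port 4321 \
#         --gpu-memory-utilization 0.75 \
#         --load-format safetensors \
#         --trust-remote-code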

# Base-image coordinates. ARGs declared before FROM are only visible to FROM
# lines; override them with --build-arg to pick another CUDA/Ubuntu pairing.
ARG CUDA_VERSION=13.0.0
ARG UBUNTU_VERSION=24.04

# NVIDIA CUDA "devel" image selected by the two ARGs above; this single stage
# is used both to compile vLLM and to run it.
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

# ---- Build arguments ---------------------------------------------------------
# Compile parallelism, interpreter version and target GPU architecture.
ARG MAX_JOBS=16
ARG PYTHON_VERSION=3.12
ARG TORCH_CUDA_ARCH_LIST="12.1a"

# Proxy settings are build-time only. They are passed to RUN steps as ARGs
# (never ENV) so the build host's proxy address is not baked into the runtime
# environment of the final image. The lower-case names default to the
# upper-case values because several tools (apt, curl/git for plain http) only
# honor the lower-case spelling.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ARG NO_PROXY=localhost,127.0.0.1,host.docker.internal
ARG http_proxy=${HTTP_PROXY}
ARG https_proxy=${HTTPS_PROXY}
ARG no_proxy=${NO_PROXY}

# Only needed while apt runs during the build; kept out of the runtime env.
ARG DEBIAN_FRONTEND=noninteractive

# ---- Environment persisted into the image ------------------------------------
# pip / Python behavior, compile parallelism (MAX_JOBS feeds both CMake and the
# MAX_JOBS variable), target architecture list, vLLM settings, and a PATH that
# puts the virtualenv at /opt/venv ahead of the system binaries.
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    CMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS} \
    MAX_JOBS=${MAX_JOBS} \
    NVCC_THREADS=4 \
    TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} \
    VLLM_TARGET_DEVICE=cuda \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    PATH=/opt/venv/bin:$PATH

# OS-level toolchain and Python packages (listed alphabetically). The apt
# package index is removed in the same layer so it does not add to the image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        g++ \
        gcc \
        git \
        libnuma-dev \
        make \
        ninja-build \
        "python${PYTHON_VERSION}" \
        "python${PYTHON_VERSION}-dev" \
        "python${PYTHON_VERSION}-venv" && \
    rm -rf /var/lib/apt/lists/*

# Create the virtualenv at /opt/venv (already first on PATH), refresh the
# packaging tools, then install torch from the PyTorch nightly cu130 index
# followed by instanttensor. `set -e` aborts the layer on the first failure.
RUN <<EOF
set -e
"python${PYTHON_VERSION}" -m venv /opt/venv
pip install --upgrade pip wheel setuptools
pip install --index-url https://download.pytorch.org/whl/nightly/cu130 torch torchvision torchaudio
pip install instanttensor
EOF

# Shallow-clone the v0.20.0 tag of vLLM into /src/vllm and make it the working
# directory for the patch and build steps that follow.
WORKDIR /src
RUN git clone \
        --depth 1 \
        --branch v0.20.0 \
        https://github.com/vllm-project/vllm.git \
        vllm
WORKDIR /src/vllm

# Patch vLLM's CMakeLists.txt in place so that architecture 12.1 is accepted:
#   1. append "12.1" to the supported-arch list ending in "...;11.0;12.0"
#      (mandatory - the build aborts if that list is not found);
#   2. append "12.1a" to every arch list that ends with "12.0f" right before a
#      closing parenthesis (best effort - the number of rewritten lists is
#      reported, and a warning is printed when none matched).
RUN python - <<'PY'
import pathlib
import re

cmake_file = pathlib.Path("CMakeLists.txt")
original = cmake_file.read_text()

# Step 1: extend the supported-architecture list; must change the file.
patched = original.replace(
    '"7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0"',
    '"7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0;12.1"',
)
assert patched != original, "CUDA_SUPPORTED_ARCHS for CUDA13 not patched — upstream layout changed"

# Step 2: extend the "12.0f" lists. subn() returns the substitution count so a
# pattern that no longer matches upstream is visible in the build log instead
# of passing silently.
patched, family_hits = re.subn(r'"12\.0f"(\s*\))', r'"12.0f;12.1a"\1', patched)
if family_hits == 0:
    print('[patch] WARNING: no "12.0f" arch list matched; 12.1a was not added to any kernel arch list')
else:
    print('[patch] extended %d "12.0f" arch list(s) with 12.1a' % family_hits)

cmake_file.write_text(patched)
print("[patch] CMakeLists.txt OK")
PY

# Run vLLM's use_existing_torch.py helper (presumably it rewrites the
# requirement files so the torch installed earlier is kept - confirm against
# upstream). The step stays best-effort, but its stderr is no longer discarded
# and a failure is reported in the build log instead of being swallowed.
RUN python use_existing_torch.py \
    || echo "[warn] use_existing_torch.py failed (see output above); continuing" >&2

# Build-time Python requirements for the CUDA target.
RUN pip install -r requirements/build/cuda.txt
# Build the vLLM wheel into /wheels, reusing compiler output across builds via
# a ccache cache mount. SETUPTOOLS_SCM_PRETEND_VERSION is set inline for this
# command only: the variable applies to every setuptools_scm-based project, so
# exporting it with ENV would also force version 0.20.0 onto any other package
# built later in the image or inside the running container.
RUN --mount=type=cache,target=/root/.cache/ccache \
    SETUPTOOLS_SCM_PRETEND_VERSION=0.20.0 \
    pip wheel --no-build-isolation --no-deps . -w /wheels -v

# Install the wheel(s) built into /wheels, then the CUDA requirements file.
RUN pip install /wheels/*.whl \
    && pip install -r requirements/cuda.txt

# Default working directory for containers started from this image.
WORKDIR /workspace
# Documents the port used in the usage example at the top of the file
# (EXPOSE does not publish the port by itself).
EXPOSE 4321/tcp
