|
| 1 | +# This Dockerfile is used to build a Docker image for NeMo RL with the NGC PyTorch base image. |
| 2 | +# However, it is still a work in progress and is not yet ready for production use. |
| 3 | +# |
| 4 | +# Usage: |
| 5 | +# Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push . |
| 6 | +# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push . |
| 7 | +# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git |
| 8 | +# Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push . |
| 9 | +# |
| 10 | +# If installing new dependencies in the container, then use "uv pip install new-dependency" |
| 11 | +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3 |
| 12 | +FROM scratch AS nemo-rl |
| 13 | +ARG NRL_GIT_REF=main |
| 14 | +ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / |
| 15 | + |
| 16 | +FROM ${BASE_IMAGE} AS base |
| 17 | + |
| 18 | +# It is more convenient for users to run as root |
| 19 | +USER root |
| 20 | + |
| 21 | +RUN <<"EOF" bash -exu -o pipefail |
| 22 | +export DEBIAN_FRONTEND=noninteractive |
| 23 | +export TZ=America/Los_Angeles |
| 24 | + |
| 25 | +apt-get update |
| 26 | +apt-get install -y --no-install-recommends \ |
| 27 | + jq \ |
| 28 | + curl \ |
| 29 | + git \ |
| 30 | + rsync \ |
| 31 | + wget \ |
| 32 | + less \ |
| 33 | + vim \ |
| 34 | + |
| 35 | + |
| 36 | +apt-get clean |
| 37 | +rm -rf /var/lib/apt/lists/* |
| 38 | +EOF |
| 39 | + |
| 40 | +# Install uv at /usr/local/bin in case the root home directory is bind mounted |
| 41 | +ARG UV_VERSION=0.7.2 |
| 42 | +RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | XDG_BIN_HOME=/usr/local/bin sh |
| 43 | + |
| 44 | +# Disable usage stats by default for users who are sensitive to sharing usage. |
| 45 | +# Users are encouraged to enable if they wish. |
| 46 | +ENV RAY_USAGE_STATS_ENABLED=0 |
| 47 | +ENV NEMO_RL_VENV_DIR=/opt/ray_venvs |
| 48 | + |
| 49 | +# Build vLLM from source to use with the NVIDIA PyTorch base image |
| 50 | +FROM base AS build_vllm |
| 51 | + |
| 52 | +ARG MAX_JOBS=32 |
| 53 | +WORKDIR /opt |
| 54 | +COPY --from=nemo-rl uv.lock /tmp/uv.lock |
| 55 | + |
| 56 | +RUN <<"EOF" bash -exu |
| 57 | +echo "Building vLLM from source for PyTorch base image" |
| 58 | +VLLM_VERSION=$(grep -A 1 'name = "vllm"' /tmp/uv.lock | grep 'version =' | sed 's/version = "\(.*\)"/\1/') && \ |
| 59 | +echo "Building vLLM version: $VLLM_VERSION" |
| 60 | +git clone https://github.com/vllm-project/vllm.git |
| 61 | +cd vllm |
| 62 | +git checkout v$VLLM_VERSION |
| 63 | +python use_existing_torch.py |
| 64 | +pip install -r requirements/build.txt |
| 65 | +pip wheel --no-deps --no-build-isolation -v . |
| 66 | +EOF |
| 67 | + |
| 68 | +FROM base AS hermetic |
| 69 | + |
| 70 | +WORKDIR /opt/nemo-rl |
| 71 | + |
| 72 | +# Variables to control the build of TE. If there are issues with parallelization, consider |
| 73 | +# setting these to 1. |
| 74 | +ARG MAX_JOBS |
| 75 | +ARG NVTE_BUILD_THREADS_PER_JOB |
| 76 | + |
| 77 | +ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv |
| 78 | +ENV UV_CACHE_DIR=/opt/uv_cache |
| 79 | +ENV UV_LINK_MODE=copy |
| 80 | + |
| 81 | +# Define the no-install-package arguments for PyTorch base images |
| 82 | +ARG BASE_IMAGE |
| 83 | +ARG UV_NO_INSTALL_PACKAGES="--no-install-package torch --no-install-package torchvision --no-install-package triton --no-install-package nvidia-cublas-cu12 --no-install-package nvidia-cuda-cupti-cu12 --no-install-package nvidia-cuda-nvrtc-cu12 --no-install-package nvidia-cuda-runtime-cu12 --no-install-package nvidia-cudnn-cu12 --no-install-package nvidia-cufft-cu12 --no-install-package nvidia-cufile-cu12 --no-install-package nvidia-curand-cu12 --no-install-package nvidia-cusolver-cu12 --no-install-package nvidia-cusparse-cu12 --no-install-package nvidia-cusparselt-cu12 --no-install-package nvidia-nccl-cu12 --no-install-package vllm --no-install-package flash-attn --no-install-package transformer-engine --no-install-package transformer-engine-cu12 --no-install-package transformer-engine-torch --no-install-package numpy" |
| 84 | +ENV UV_NO_INSTALL_PACKAGES=${UV_NO_INSTALL_PACKAGES} |
| 85 | +ENV PATH="/opt/nemo_rl_venv/bin:$PATH" |
| 86 | + |
| 87 | +# First copy only the dependency files |
| 88 | +COPY --from=nemo-rl pyproject.toml uv.lock ./ |
| 89 | +COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ |
| 90 | + |
| 91 | + |
| 92 | +RUN --mount=type=bind,from=build_vllm,source=/opt/,target=/tmp/build_vllm/ <<"EOF" bash -exu |
| 93 | + |
| 94 | +# uv sync has a more reliable resolver than simple uv pip install which can fail |
| 95 | +# The venv is symlinked to avoid bloating the layer size |
| 96 | +uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT} |
| 97 | +uv pip install --no-cache-dir --no-deps /tmp/build_vllm/vllm/vllm*.whl |
| 98 | +uv sync --link-mode symlink --locked --inexact --extra vllm --extra mcore --extra automodel --all-groups --no-install-project $UV_NO_INSTALL_PACKAGES |
| 99 | +EOF |
| 100 | + |
| 101 | +ENV NEMO_RL_VENV_DIR=/opt/ray_venvs |
| 102 | + |
| 103 | +WORKDIR /opt/nemo-rl |
| 104 | + |
| 105 | +FROM hermetic AS release |
| 106 | + |
| 107 | +ARG NEMO_RL_COMMIT |
| 108 | +ARG NVIDIA_BUILD_ID |
| 109 | +ARG NVIDIA_BUILD_REF |
| 110 | +ENV UV_NO_SYNC=1 |
| 111 | +ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-<unknown>} |
| 112 | +ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>} |
| 113 | +ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>} |
| 114 | +ENV NEMO_RL_PY_EXECUTABLES_SYSTEM=1 |
| 115 | +# The 25.06 Pytorch container is not compatible with vllm standalone compile so we disable it |
| 116 | +ENV VLLM_USE_STANDALONE_COMPILE=0 |
| 117 | +LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}" |
| 118 | +LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}" |
| 119 | + |
| 120 | +ENV NEMO_RL_VENV_DIR=/opt/ray_venvs |
| 121 | + |
| 122 | +# Copy in source from build context (defaults to cloned repo, can be overridden) |
| 123 | +COPY --from=nemo-rl . /opt/nemo-rl |
| 124 | +# Unshallow the repo to get the full history (in the case it was from the scratch layer). |
| 125 | +# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history), |
| 126 | +# so do a quick check before trying to unshallow. |
| 127 | +RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true |
| 128 | +RUN UV_LINK_MODE=symlink uv sync --locked --inexact $UV_NO_INSTALL_PACKAGES |
0 commit comments