[CI] Use system nccl in build (#150226)

Install nccl in the docker image (which is already being done in some docker images), and use USE_SYSTEM_NCCL=1 in CI builds

It takes some time to build nccl and the build doesn't happen in parallel, so there's less benefit in switching to a bigger runner and using more processes.

The other changes in this PR exist because there is an install_cuda script and an install_cuda_aarch64 script, and they both build nccl from source and define their own pins for the nccl version.  There is also a .ci/docker/nccl-cu11.txt and cu12.txt that define the pins, and this is an attempt to unify them.  Unfortunately, this leads to a lot of files needing to be copied to the docker build.

Generally seems to increase docker pull times by <1 min (P1768456379), but it's hard to tell what the real increase is
15761 mib -> 16221 [linux-focal-cuda11.8-py3.10-gcc9 / test (distributed](https://github.com/pytorch/pytorch/actions/runs/14114171729/job/39545500161#logs)
`jq '[.layers[].size, .config.size] | add / 1024 / 1024'`

Example 6eb3c2e282 (39520169577-box)
![image](https://github.com/user-attachments/assets/d44ef415-6e48-41ef-ac83-f19bab47560c)

TODO:
* Figure out a way to verify that nccl was built + works properly when it is expected (this time I just checked torch.distributed.is_nccl_available)
* Merge the cusparse installation scripts
* Merge the cuda installation scripts
* Either always split the nccl, cuda, and cusparse installations, or always do them together in one bash script

distributed/test_distributed_spawn
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150226
Approved by: https://github.com/seemethere, https://github.com/atalman
This commit is contained in:
Catherine Lee 2025-04-02 19:42:43 +00:00 committed by PyTorch MergeBot
parent cb4cd6166e
commit d4298f2136
11 changed files with 70 additions and 53 deletions

View File

@ -44,6 +44,8 @@ FROM base as cuda
ARG CUDA_VERSION=12.4
RUN rm -rf /usr/local/cuda-*
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
# Preserve CUDA_VERSION for the builds
ENV CUDA_VERSION=${CUDA_VERSION}

View File

@ -2,7 +2,6 @@
set -ex
NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.5.1.17
function install_cusparselt_040 {
@ -40,8 +39,7 @@ function install_cusparselt_063 {
function install_118 {
CUDNN_VERSION=9.1.0.70
NCCL_VERSION=v2.21.5-1
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0"
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
# install CUDA 11.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@ -59,14 +57,7 @@ function install_118 {
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=11.8 bash install_nccl.sh
install_cusparselt_040
@ -75,7 +66,7 @@ function install_118 {
function install_124 {
CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
@ -93,14 +84,7 @@ function install_124 {
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.4 bash install_nccl.sh
install_cusparselt_062
@ -108,7 +92,7 @@ function install_124 {
}
function install_126 {
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.6 /usr/local/cuda
# install CUDA 12.6.3 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
@ -126,14 +110,7 @@ function install_126 {
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.6 bash install_nccl.sh
install_cusparselt_063
@ -241,7 +218,7 @@ function prune_126 {
function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
@ -259,14 +236,7 @@ function install_128 {
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.8 bash install_nccl.sh
install_cusparselt_063

View File

@ -3,7 +3,6 @@
set -ex
NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.8.0.87
function install_cusparselt_063 {
@ -18,7 +17,7 @@ function install_cusparselt_063 {
}
function install_128 {
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
@ -36,14 +35,7 @@ function install_128 {
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.8 bash install_nccl.sh
install_cusparselt_063

View File

@ -0,0 +1,26 @@
#!/bin/bash
# Build NCCL from source at the version pinned for the active CUDA major
# release and install its headers and libraries into /usr/local/cuda.
#
# Inputs:
#   CUDA_VERSION (env)  CUDA version string such as "11.8" or "12.8". Only the
#                       first two characters are inspected to pick a pin file.
#   nccl-cu11.txt / nccl-cu12.txt
#                       Pin files holding the NCCL git ref to build. Looked up
#                       in ./ci_commit_pins first, then in /ci_commit_pins.
#
# Exit status:
#   1 when CUDA_VERSION does not start with "11" or "12"; non-zero (via
#   `set -e`) when the pin file cannot be read or any build step fails.

set -ex

# Pins were originally read relative to the current directory only. The Docker
# builds copy them to /ci_commit_pins/, so fall back to that absolute location
# instead of depending on the working directory being "/".
PIN_DIR="ci_commit_pins"
if [[ ! -d "${PIN_DIR}" && -d "/ci_commit_pins" ]]; then
  PIN_DIR="/ci_commit_pins"
fi

NCCL_VERSION=""

if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
  NCCL_VERSION=$(cat "${PIN_DIR}/nccl-cu11.txt")
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
  NCCL_VERSION=$(cat "${PIN_DIR}/nccl-cu12.txt")
else
  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
  exit 1
fi

if [[ -n "${NCCL_VERSION}" ]]; then
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  # Quote the ref so it is always passed to git as a single argument.
  git clone -b "${NCCL_VERSION}" --depth 1 https://github.com/NVIDIA/nccl.git
  pushd nccl
  make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  popd
  rm -rf nccl
  # Refresh the dynamic linker cache after copying the new libraries.
  ldconfig
else
  # An empty pin still skips the build (unchanged behaviour), but say so: the
  # skip is otherwise silent and would only surface later in the build.
  echo "NCCL pin for CUDA_VERSION ${CUDA_VERSION} is empty; skipping NCCL install" >&2
fi

View File

@ -49,6 +49,8 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM cpu as cuda
ADD ./common/install_cuda.sh install_cuda.sh
ADD ./common/install_magma.sh install_magma.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
ENV CUDA_HOME /usr/local/cuda
FROM cuda as cuda11.8

View File

@ -30,7 +30,9 @@ RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
# Install cuda and cudnn
ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

View File

@ -64,7 +64,9 @@ FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
FROM base as intel
# MKL

View File

@ -36,7 +36,9 @@ FROM base as cuda
ARG BASE_CUDA_VERSION=11.8
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu*
FROM base as intel
# MKL

View File

@ -67,7 +67,9 @@ FROM base as cuda
ARG BASE_CUDA_VERSION
# Install CUDA
ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh install_nccl.sh ci_commit_pins/nccl-cu*
FROM base as magma
ARG BASE_CUDA_VERSION

View File

@ -158,6 +158,16 @@ COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash install_cusparselt.sh
RUN rm install_cusparselt.sh
# Install NCCL
ARG CUDA_VERSION
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash install_nccl.sh
RUN rm install_nccl.sh /ci_commit_pins/nccl-cu*
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# Install CUDSS
ARG CUDA_VERSION
COPY ./common/install_cudss.sh install_cudss.sh

View File

@ -52,9 +52,16 @@ RUN bash ./install_lcov.sh && rm install_lcov.sh
# Install cuda and cudnn
ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# No effect if cuda not installed
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT