diff --git a/.ci/docker/README.md b/.ci/docker/README.md
index 90dcb7c6e6f..68df3076315 100644
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@@ -1,4 +1,4 @@
-# Docker images for GitHub CI
+# Docker images for GitHub CI and CD
 
 This directory contains everything needed to build the Docker images
 that are used in our CI.
@@ -12,7 +12,7 @@ each image as the `BUILD_ENVIRONMENT` environment variable.
 
 See `build.sh` for valid build environments (it's the giant switch).
 
-## Contents
+## Docker CI builds
 
 * `build.sh` -- dispatch script to launch all builds
 * `common` -- scripts used to execute individual Docker build stages
@@ -21,6 +21,12 @@ See `build.sh` for valid build environments (it's the giant switch).
 * `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
 * `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support
 
+### Docker CD builds
+
+* `conda` -- Dockerfile and build.sh to build Docker images used in nightly conda builds
+* `manywheel` -- Dockerfile and build.sh to build Docker images used in nightly manywheel builds
+* `libtorch` -- Dockerfile and build.sh to build Docker images used in nightly libtorch builds
+
 ## Usage
 
 ```bash
diff --git a/.ci/docker/common/aotriton_version.txt b/.ci/docker/common/aotriton_version.txt
new file mode 100644
index 00000000000..00f3f90cb78
--- /dev/null
+++ b/.ci/docker/common/aotriton_version.txt
@@ -0,0 +1,5 @@
+0.6b
+manylinux_2_17
+rocm6.1
+04b5df8c8123f90cba3ede7e971e6fbc6040d506
+77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1
diff --git a/.ci/docker/common/install_conda_docker.sh b/.ci/docker/common/install_conda_docker.sh
new file mode 100755
index 00000000000..dc377075750
--- /dev/null
+++ b/.ci/docker/common/install_conda_docker.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Script used only in CD pipeline
+set -ex
+
+# Anaconda
+# The latest Anaconda uses openssl-3, which is incompatible with all currently published versions of git,
+# which use openssl-1.1.1; see https://anaconda.org/anaconda/git/files?version=2.40.1 for example
+MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh
+wget -q $MINICONDA_URL
+# NB: Manually invoke bash per https://github.com/conda/conda/issues/10431
+bash $(basename "$MINICONDA_URL") -b -p /opt/conda
+rm $(basename "$MINICONDA_URL")
+export PATH=/opt/conda/bin:$PATH
+# See https://github.com/pytorch/builder/issues/1473
+# Pin conda to 23.5.2 as it's the last one compatible with openssl-1.1.1
+conda install -y conda=23.5.2 conda-build anaconda-client git ninja
+# The cmake version here needs to match the minimum version of cmake
+# supported by PyTorch (3.18); only 3.18.2 is available on Anaconda
+/opt/conda/bin/pip3 install cmake==3.18.2
+conda remove -y --force patchelf
diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh
new file mode 100755
index 00000000000..1bd25fb2de9
--- /dev/null
+++ b/.ci/docker/common/install_cpython.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Script used only in CD pipeline
+set -uex -o pipefail
+
+PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
+PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
+GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
+
+# Python versions to be installed in /opt/$VERSION_NO
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0"}
+
+function check_var {
+    if [ -z "$1" ]; then
+        echo "required variable not defined"
+        exit 1
+    fi
+}
+
+function do_cpython_build {
+    local py_ver=$1
+    local py_folder=$2
+    check_var $py_ver
+    check_var $py_folder
+    tar -xzf Python-$py_ver.tgz
+    pushd $py_folder
+
+    local prefix="/opt/_internal/cpython-${py_ver}"
+    mkdir -p ${prefix}/lib
+    if [[ -n $(which patchelf) ]]; then
+        local shared_flags="--enable-shared"
+    else
+        local shared_flags="--disable-shared"
+    fi
+    if [[ -z "${WITH_OPENSSL+x}" ]]; then
+        local openssl_flags=""
+    else
+        local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto"
+    fi
+
+    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} > /dev/null
+
+    make -j40 > /dev/null
+    make install > /dev/null
+
+    if [[ "${shared_flags}" == "--enable-shared" ]]; then
+        patchelf --set-rpath '$ORIGIN/../lib' ${prefix}/bin/python3
+    fi
+
+    popd
+    rm -rf $py_folder
+    # Some Pythons install as bin/python3. Make them available as
+    # bin/python.
+    if [ -e ${prefix}/bin/python3 ]; then
+        ln -s python3 ${prefix}/bin/python
+    fi
+    ${prefix}/bin/python get-pip.py
+    if [ -e ${prefix}/bin/pip3 ] && [ ! -e ${prefix}/bin/pip ]; then
+        ln -s pip3 ${prefix}/bin/pip
+    fi
+    ${prefix}/bin/pip install wheel==0.34.2
+    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
+    ln -s ${prefix} /opt/python/${abi_tag}
+}
+
+function build_cpython {
+    local py_ver=$1
+    check_var $py_ver
+    check_var $PYTHON_DOWNLOAD_URL
+    local py_ver_folder=$py_ver
+    if [ "$py_ver" = "3.13.0" ]; then
+        PY_VER_SHORT="3.13"
+        check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
+        wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
+        do_cpython_build $py_ver cpython-$PY_VER_SHORT
+    else
+        wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz
+        do_cpython_build $py_ver Python-$py_ver
+    fi
+
+    rm -f Python-$py_ver.tgz
+}
+
+function build_cpythons {
+    check_var $GET_PIP_URL
+    curl -sLO $GET_PIP_URL
+    for py_ver in $@; do
+        build_cpython $py_ver
+    done
+    rm -f get-pip.py
+}
+
+mkdir -p /opt/python
+mkdir -p /opt/_internal
+build_cpythons $CPYTHON_VERSIONS
diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
new file mode 100644
index 00000000000..2088447a7ef
--- /dev/null
+++ b/.ci/docker/common/install_cuda.sh
@@ -0,0 +1,239 @@
+#!/bin/bash
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+CUDNN_VERSION=9.1.0.70
+
+function install_cusparselt_040 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_118 {
+    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
+    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
+    # install CUDA 11.8.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+    chmod +x cuda_11.8.0_520.61.05_linux.run
+    ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
+    rm -f cuda_11.8.0_520.61.05_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_040
+
+    ldconfig
+}
+
+function install_121 {
+    echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.1 /usr/local/cuda
+    # install CUDA 12.1.1 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
+    chmod +x cuda_12.1.1_530.30.02_linux.run
+    ./cuda_12.1.1_530.30.02_linux.run --toolkit --silent
+    rm -f cuda_12.1.1_530.30.02_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function install_124 {
+    echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+    # install CUDA 12.4.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+    chmod +x cuda_12.4.0_550.54.14_linux.run
+    ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent
+    rm -f cuda_12.4.0_550.54.14_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
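+    # cuDNN headers and libs are now staged under /usr/local/cuda, so the extracted archive can be removed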
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function prune_118 {
+    echo "Pruning CUDA 11.8 and cuDNN"
+    #####################################################################################
+    # CUDA 11.8 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"
+
+    export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 11.8 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-11.8/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
+}
+
+function prune_121 {
+    echo "Pruning CUDA 12.1"
+    #####################################################################################
+    # CUDA 12.1 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.1 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.1/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 $CUDA_BASE/nsight-systems-2023.1.2/
+}
+
+function prune_124 {
+    echo "Pruning CUDA 12.4"
+    #####################################################################################
+    # CUDA 12.4 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+    if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+        export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.4 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.4/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    11.8) install_118; prune_118
+        ;;
+    12.1) install_121; prune_121
+        ;;
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
diff --git a/.ci/docker/common/install_cuda_aarch64.sh b/.ci/docker/common/install_cuda_aarch64.sh
new file mode 100644
index 00000000000..7e503869761
--- /dev/null
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_124 {
+    echo "Installing CUDA 12.4 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+    # install CUDA 12.4.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
+    chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run
+    ./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent
+    rm -f cuda_12.4.0_550.54.14_linux_sbsa.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+    tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+    cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
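+    # NCCL headers and libs have been copied into /usr/local/cuda, so the source checkout can be dropped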
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function prune_124 {
+    echo "Pruning CUDA 12.4"
+    #####################################################################################
+    # CUDA 12.4 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.4 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.4/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
diff --git a/.ci/docker/common/install_libpng.sh b/.ci/docker/common/install_libpng.sh
new file mode 100644
index 00000000000..32453411ae9
--- /dev/null
+++ b/.ci/docker/common/install_libpng.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+LIBPNG_VERSION=1.6.37
+
+mkdir -p libpng
+pushd libpng
+
+wget http://download.sourceforge.net/libpng/libpng-$LIBPNG_VERSION.tar.gz
+tar -xvzf libpng-$LIBPNG_VERSION.tar.gz
+
+pushd libpng-$LIBPNG_VERSION
+
+./configure
+make
+make install
+
+popd
+
+popd
+rm -rf libpng
diff --git a/.ci/docker/common/install_magma.sh b/.ci/docker/common/install_magma.sh
new file mode 100644
index 00000000000..d0c6f67773d
--- /dev/null
+++ b/.ci/docker/common/install_magma.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+MAGMA_VERSION="2.5.2"
+
+function do_install() {
+    cuda_version=$1
+    cuda_version_nodot=${1/./}
+
+    MAGMA_VERSION="2.6.1"
+    magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+    cuda_dir="/usr/local/cuda-${cuda_version}"
+    (
+        set -x
+        tmp_dir=$(mktemp -d)
+        pushd ${tmp_dir}
+        curl -OLs https://anaconda.org/pytorch/magma-cuda${cuda_version_nodot}/${MAGMA_VERSION}/download/linux-64/${magma_archive}
+        tar -xvf "${magma_archive}"
+        mkdir -p "${cuda_dir}/magma"
+        mv include "${cuda_dir}/magma/include"
+        mv lib "${cuda_dir}/magma/lib"
+        popd
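+        # magma for this CUDA version now lives under ${cuda_dir}/magma/{include,lib}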
+    )
+}
+
+do_install $1
diff --git a/.ci/docker/common/install_miopen.sh b/.ci/docker/common/install_miopen.sh
new file mode 100644
index 00000000000..5f2062414f2
--- /dev/null
+++ b/.ci/docker/common/install_miopen.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+ROCM_VERSION=$1
+
+if [[ -z $ROCM_VERSION ]]; then
+    echo "missing ROCM_VERSION"
+    exit 1;
+fi
+
+# To make version comparison easier, create an integer representation.
+save_IFS="$IFS"
+IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
+IFS="$save_IFS"
+if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=0
+elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
+
+# Install custom MIOpen + COMgr for ROCm >= 4.0.1
+if [[ $ROCM_INT -lt 40001 ]]; then
+    echo "ROCm version < 4.0.1; will not install custom MIOpen"
+    exit 0
+fi
+
+# Function to retry functions that sometimes timeout or have flaky failures
+retry () {
+    $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+
+# Build custom MIOpen to use comgr for offline compilation.
+
+## Need a sanitized ROCM_VERSION without patchlevel; patchlevel version 0 must be added to paths.
+ROCM_DOTS=$(echo ${ROCM_VERSION} | tr -d -c '.' | wc -c)
+if [[ ${ROCM_DOTS} == 1 ]]; then
+    ROCM_VERSION_NOPATCH="${ROCM_VERSION}"
+    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}.0"
+else
+    ROCM_VERSION_NOPATCH="${ROCM_VERSION%.*}"
+    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
+fi
+
+# MIOPEN_USE_HIP_KERNELS is a workaround for COMgr issues
+MIOPEN_CMAKE_COMMON_FLAGS="
+-DMIOPEN_USE_COMGR=ON
+-DMIOPEN_BUILD_DRIVER=OFF
+"
+# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
+if [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
+    echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
+    echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
+    echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
+elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
+elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
+elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
+elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
+elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+
+yum remove -y miopen-hip
+
+git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
+pushd MIOpen
+# remove .git to save disk space since CI runner was running out
+rm -rf .git
+# Don't build MLIR to save docker build time
+# since we are disabling MLIR backend for MIOpen anyway
+if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    sed -i '/rocMLIR/d' requirements.txt
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    sed -i '/llvm-project-mlir/d' requirements.txt
+fi
+## MIOpen minimum requirements
+cmake -P install_deps.cmake --minimum
+
+# clean up since CI runner was running out of disk space
+rm -rf /tmp/*
+yum clean all
+rm -rf /var/cache/yum
+rm -rf /var/lib/yum/yumdb
+rm -rf /var/lib/yum/history
+
+## Build MIOpen
+mkdir -p build
+cd build
+PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
+    ${MIOPEN_CMAKE_COMMON_FLAGS} \
+    ${MIOPEN_CMAKE_DB_FLAGS} \
+    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
+make MIOpen -j $(nproc)
+
+# Build MIOpen package
+make -j $(nproc) package
+
+# clean up since CI runner was running out of disk space
+rm -rf /usr/local/cget
+
+yum install -y miopen-*.rpm
+
+popd
+rm -rf MIOpen
diff --git a/.ci/docker/common/install_mkl.sh b/.ci/docker/common/install_mkl.sh
new file mode 100644
index 00000000000..d5d139266b9
--- /dev/null
+++ b/.ci/docker/common/install_mkl.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -ex
+
+# MKL
+MKL_VERSION=2024.2.0
+
+MKLROOT=/opt/intel
+mkdir -p ${MKLROOT}
+pushd /tmp
+
+python3 -mpip install wheel
+python3 -mpip download -d . mkl-static==${MKL_VERSION}
+python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
+python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
+mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}
+mv mkl_include-${MKL_VERSION}/mkl_include-${MKL_VERSION}.data/data/include ${MKLROOT}
diff --git a/.ci/docker/common/install_mnist.sh b/.ci/docker/common/install_mnist.sh
new file mode 100644
index 00000000000..4c009d9db02
--- /dev/null
+++ b/.ci/docker/common/install_mnist.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+mkdir -p /usr/local/mnist/
+
+cd /usr/local/mnist
+
+for img in train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz; do
+    wget -q https://ossci-datasets.s3.amazonaws.com/mnist/$img
+    gzip -d $img
+done
diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh
new file mode 100644
index 00000000000..80e4fb0f755
--- /dev/null
+++ b/.ci/docker/common/install_openblas.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+cd /
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules
+
+
+OPENBLAS_BUILD_FLAGS="
+NUM_THREADS=128
+USE_OPENMP=1
+NO_SHARED=0
+DYNAMIC_ARCH=1
+TARGET=ARMV8
+CFLAGS=-O3
+"
+
+OPENBLAS_CHECKOUT_DIR="OpenBLAS"
+
+make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
+make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
diff --git a/.ci/docker/common/install_patchelf.sh b/.ci/docker/common/install_patchelf.sh
new file mode 100644
index 00000000000..8f2ef5a26fd
--- /dev/null
+++ b/.ci/docker/common/install_patchelf.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+# Pin the version to the latest release, 0.17.2; building a newer commit starts
+# to fail on the current image
+git clone -b 0.17.2 --single-branch https://github.com/NixOS/patchelf
+cd patchelf
+sed -i 's/serial/parallel/g' configure.ac
+./bootstrap.sh
+./configure
+make
+make install
+cd ..
+rm -rf patchelf
diff --git a/.ci/docker/common/install_rocm_drm.sh b/.ci/docker/common/install_rocm_drm.sh
new file mode 100644
index 00000000000..a6c73560c1a
--- /dev/null
+++ b/.ci/docker/common/install_rocm_drm.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+###########################
+### prereqs
+###########################
+# Install Python packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+    ubuntu)
+        apt-get update -y
+        apt-get install -y libpciaccess-dev pkg-config
+        apt-get clean
+        ;;
+    centos)
+        yum install -y libpciaccess-devel pkgconfig
+        ;;
+    *)
+        echo "Unable to determine OS..."
+        exit 1
+        ;;
+esac
+python3 -m pip install meson ninja
+
+###########################
+### clone repo
+###########################
+GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+pushd drm
+
+###########################
+### patch
+###########################
+patch -p1 <<'EOF'
+diff --git a/amdgpu/amdgpu_asic_id.c b/amdgpu/amdgpu_asic_id.c
+index a5007ffc..13fa07fc 100644
+--- a/amdgpu/amdgpu_asic_id.c
++++ b/amdgpu/amdgpu_asic_id.c
+@@ -22,6 +22,13 @@
+  *
+  */
+ 
++#define _XOPEN_SOURCE 700
++#define _LARGEFILE64_SOURCE
++#define _FILE_OFFSET_BITS 64
++#include <ftw.h>
++#include <link.h>
++#include <limits.h>
++
+ #include <ctype.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+@@ -34,6 +41,19 @@
+ #include "amdgpu_drm.h"
+ #include "amdgpu_internal.h"
+ 
++static char *amdgpuids_path = NULL;
++static const char* amdgpuids_path_msg = NULL;
++
++static int check_for_location_of_amdgpuids(const char *filepath, const struct stat *info, const int typeflag, struct FTW *pathinfo)
++{
++    if (typeflag == FTW_F && strstr(filepath, "amdgpu.ids")) {
++        amdgpuids_path = strdup(filepath);
++        return 1;
++    }
++
++    return 0;
++}
++
+ static int parse_one_line(struct amdgpu_device *dev, const char *line)
+ {
+     char *buf, *saveptr;
+@@ -113,10 +133,46 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+     int line_num = 1;
+     int r = 0;
+ 
++    // attempt to find typical location for amdgpu.ids file
+     fp = fopen(AMDGPU_ASIC_ID_TABLE, "r");
++
++    // if it doesn't exist, search
++    if (!fp) {
++
++    char self_path[ PATH_MAX ];
++    ssize_t count;
++    ssize_t i;
++
++    count = readlink( "/proc/self/exe", self_path, PATH_MAX );
++    if (count > 0) {
++        self_path[count] = '\0';
++
++        // remove '/bin/python' from self_path
++        for (i=count; i>0; --i) {
++            if (self_path[i] == '/') break;
++            self_path[i] = '\0';
++        }
++        self_path[i] = '\0';
++        for (; i>0; --i) {
++            if (self_path[i] == '/') break;
++            self_path[i] = '\0';
++        }
++        self_path[i] = '\0';
++
++        if (1 == nftw(self_path, check_for_location_of_amdgpuids, 5, FTW_PHYS)) {
++            fp = fopen(amdgpuids_path, "r");
++            amdgpuids_path_msg = amdgpuids_path;
++        }
++    }
++
++    }
++    else {
++        amdgpuids_path_msg = AMDGPU_ASIC_ID_TABLE;
++    }
++
++    // both hard-coded location and search have failed
+     if (!fp) {
+-        fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
+-            strerror(errno));
++        fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+         return;
+     }
+ 
+@@ -132,7 +188,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+             continue;
+         }
+ 
+-        drmMsg("%s version: %s\n", AMDGPU_ASIC_ID_TABLE, line);
++        drmMsg("%s version: %s\n", amdgpuids_path_msg, line);
+         break;
+     }
+ 
+@@ -150,7 +206,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+ 
+     if (r == -EINVAL) {
+         fprintf(stderr, "Invalid format: %s: line %d: %s\n",
+-            AMDGPU_ASIC_ID_TABLE, line_num, line);
++            amdgpuids_path_msg, line_num, line);
+     } else if (r && r != -EAGAIN) {
+         fprintf(stderr, "%s: Cannot parse ASIC IDs: %s\n",
+             __func__, strerror(-r));
+EOF
+
+###########################
+### build
+###########################
+meson builddir --prefix=/opt/amdgpu
+pushd builddir
+ninja install
+
+popd
+popd
diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh
index 94b94661c46..fc3d49a309b 100644
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@@ -1,7 +1,11 @@
 #!/bin/bash
+# Script used in CI and CD pipeline
 
 set -ex
+
+MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
+
 # "install" hipMAGMA into /opt/rocm/magma by copying after build
 git clone https://bitbucket.org/icl/magma.git
 pushd magma
@@ -11,7 +15,10 @@ git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
 
 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
 echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc
+if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
+    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
+fi
+echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
 echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
 export PATH="${PATH}:/opt/rocm/bin"
 if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
@@ -25,7 +32,7 @@ done
 # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
 sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
 make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
-make testing/testing_dgemm -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
+LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
+make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
 popd
 mv magma /opt/rocm
diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh
index aa308010326..7902200bdfb 100644
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -xe
-
+# Script used in CI and CD pipeline
 # Intel® software for general purpose GPU capabilities.
 # Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
@@ -8,19 +8,23 @@
 # Users should update to the latest version as it becomes available
 
 function install_ubuntu() {
+    . /etc/os-release
+    if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
+        echo "Ubuntu version ${VERSION_CODENAME} not supported"
+        exit
+    fi
+
     apt-get update -y
     apt-get install -y gpg-agent wget
-
-    # Set up the repository. To do this, download the key to the system keyring
+    # To add the online network package repository for the GPU Driver LTS releases
     wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
-        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
-
-    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
+        | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
     echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
-        https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
-        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
+        https://repositories.intel.com/gpu/ubuntu ${VERSION_CODENAME}/lts/2350 unified" \
+        | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
+    # To add the online network package repository for the Intel Support Packages
+    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+        | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
     echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
         https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
         | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
@@ -97,6 +101,86 @@ EOF
     rm -rf /var/lib/yum/history
 }
 
+function install_rhel() {
+    . /etc/os-release
+    if [[ "${ID}" == "rhel" ]]; then
+        if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+            echo "RHEL version ${VERSION_ID} not supported"
+            exit
+        fi
+    elif [[ "${ID}" == "almalinux" ]]; then
+        # Workaround for almalinux8, which is used by quay.io/pypa/manylinux_2_28_x86_64
+        VERSION_ID="8.6"
+    fi
+
+    dnf install -y 'dnf-command(config-manager)'
+    # To add the online network package repository for the GPU Driver LTS releases
+    dnf config-manager --add-repo \
+        https://repositories.intel.com/gpu/rhel/${VERSION_ID}/lts/2350/unified/intel-gpu-${VERSION_ID}.repo
+    # To add the online network package repository for the Intel Support Packages
+    tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
+[intel-for-pytorch-gpu-dev]
+name=Intel for Pytorch GPU dev repository
+baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+EOF
+
+    # The xpu-smi packages
+    dnf install -y xpu-smi
+    # Compute and Media Runtimes
+    dnf install -y \
+        intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2 \
+        level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
+        mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
+        mesa-libxatracker libvpl-tools intel-metrics-discovery \
+        intel-metrics-library intel-igc-core intel-igc-cm \
+        libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc
+    # Development packages
+    dnf install -y --refresh \
+        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
+        level-zero-devel
+    # Install Intel Support Packages
+    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
+
+    # Cleanup
+    dnf clean all
+    rm -rf /var/cache/yum
+    rm -rf /var/lib/yum/yumdb
+    rm -rf /var/lib/yum/history
+}
+
+function install_sles() {
+    . /etc/os-release
+    VERSION_SP=${VERSION_ID//./sp}
+    if [[ ! " 15sp4 15sp5 " =~ " ${VERSION_SP} " ]]; then
+        echo "SLES version ${VERSION_ID} not supported"
+        exit
+    fi
+
+    # To add the online network package repository for the GPU Driver LTS releases
+    zypper addrepo -f -r \
+        https://repositories.intel.com/gpu/sles/${VERSION_SP}/lts/2350/unified/intel-gpu-${VERSION_SP}.repo
+    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
+    # To add the online network package repository for the Intel Support Packages
+    zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
+    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+
+    # The xpu-smi packages
+    zypper install -y lsb-release flex bison xpu-smi
+    # Compute and Media Runtimes
+    zypper install -y intel-level-zero-gpu level-zero intel-gsc intel-opencl intel-ocloc \
+        intel-media-driver libigfxcmrt7 libvpl2 libvpl-tools libmfxgen1 libmfx1
+    # Development packages
+    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel
+
+    # Install Intel Support Packages
+    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
+
+}
+
 # The installation depends on the base OS
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 
@@ -107,6 +191,12 @@ case "$ID" in
     centos)
         install_centos
     ;;
+    rhel|almalinux)
+        install_rhel
+    ;;
+    sles)
+        install_sles
+    ;;
     *)
         echo "Unable to determine OS..."
         exit 1
diff --git a/.ci/docker/conda/Dockerfile b/.ci/docker/conda/Dockerfile
new file mode 100644
index 00000000000..0958aad0b5f
--- /dev/null
+++ b/.ci/docker/conda/Dockerfile
@@ -0,0 +1,101 @@
+ARG CUDA_VERSION=10.2
+ARG BASE_TARGET=cuda${CUDA_VERSION}
+FROM centos:7 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum update -y
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+# EPEL for cmake
+RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
+    rpm -ivh epel-release-latest-7.noarch.rpm && \
+    rm -f epel-release-latest-7.noarch.rpm
+# cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN yum install -y autoconf aclocal automake make sudo
+RUN rm -rf /usr/local/cuda-*
+
+FROM base as patchelf
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf
+
+FROM base as openssl
+# Install openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+FROM base as conda
+# Install Anaconda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+# Install CUDA
+FROM base as cuda
+ARG CUDA_VERSION=10.2
+RUN rm -rf /usr/local/cuda-*
+ADD ./common/install_cuda.sh install_cuda.sh
+ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
+# Preserve CUDA_VERSION for the builds
+ENV CUDA_VERSION=${CUDA_VERSION}
+# Make things in our path by default
+ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
+
+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+ENV DESIRED_CUDA=11.8
+
+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+ENV DESIRED_CUDA=12.1
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+ENV DESIRED_CUDA=12.4
+
+# Install MNIST test data
+FROM base as mnist
+ADD ./common/install_mnist.sh install_mnist.sh
+RUN bash ./install_mnist.sh
+
+FROM base as all_cuda
+COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
+COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
+COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
+
+# Final step
+FROM ${BASE_TARGET} as final
+COPY --from=openssl /opt/openssl /opt/openssl
+COPY --from=patchelf /patchelf /usr/local/bin/patchelf
+COPY --from=conda /opt/conda /opt/conda
+
+# Add jni.h for java host build.
+COPY ./common/install_jni.sh install_jni.sh
+COPY ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+ENV PATH /opt/conda/bin:$PATH
+COPY --from=mnist /usr/local/mnist /usr/local/mnist
+RUN rm -rf /usr/local/cuda
+RUN chmod o+rw /usr/local
+RUN touch /.condarc && \
+    chmod o+rw /.condarc && \
+    chmod -R o+rw /opt/conda
diff --git a/.ci/docker/conda/build.sh b/.ci/docker/conda/build.sh
new file mode 100755
index 00000000000..6e8a1c37ff9
--- /dev/null
+++ b/.ci/docker/conda/build.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+image="$1"
+shift
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+
+DOCKER_IMAGE_NAME="pytorch/${image}"
+
+
+export DOCKER_BUILDKIT=1
+TOPDIR=$(git rev-parse --show-toplevel)
+
+CUDA_VERSION=${CUDA_VERSION:-12.1}
+
+case ${CUDA_VERSION} in
+  cpu)
+    BASE_TARGET=base
+    DOCKER_TAG=cpu
+    ;;
+  all)
+    BASE_TARGET=all_cuda
+    DOCKER_TAG=latest
+    ;;
+  *)
+    BASE_TARGET=cuda${CUDA_VERSION}
+    DOCKER_TAG=cuda${CUDA_VERSION}
+    ;;
+esac
+
+
+(
+  set -x
+  docker build \
+    --target final \
+    --progress plain \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+    --build-arg "DEVTOOLSET_VERSION=9" \
+    -t ${DOCKER_IMAGE_NAME} \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
+    ${TOPDIR}/.ci/docker/
+)
+
+if [[ "${DOCKER_TAG}" =~ ^cuda ]]; then
+  # Test that we're using the right CUDA compiler
+  (
+    set -x
+    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  )
+fi
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  (
+    set -x
+    docker push "${DOCKER_IMAGE_NAME}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+        docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+        docker push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile
new file mode 100644
index 00000000000..c5249e30de3
--- /dev/null
+++ b/.ci/docker/libtorch/Dockerfile
@@ -0,0 +1,107 @@
+ARG BASE_TARGET=base
+ARG GPU_IMAGE=ubuntu:20.04
+FROM ${GPU_IMAGE} as base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get clean && apt-get update
+RUN apt-get install -y curl locales g++ git-all autoconf automake make cmake wget unzip sudo
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+
+RUN locale-gen en_US.UTF-8
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+# Install openssl
+FROM base as openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# Install python
+FROM base as python
+ADD common/install_cpython.sh install_cpython.sh
+RUN apt-get update -y && \
+    apt-get install build-essential gdb lcov libbz2-dev libffi-dev \
+        libgdbm-dev liblzma-dev libncurses5-dev libreadline6-dev \
+        libsqlite3-dev libssl-dev lzma lzma-dev tk-dev uuid-dev zlib1g-dev -y && \
+    bash ./install_cpython.sh && \
+    rm install_cpython.sh && \
+    apt-get clean
+
+FROM base as conda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+FROM base as cpu
+# Install Anaconda
+COPY --from=conda /opt/conda /opt/conda
+# Install python
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
+# Install MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM cpu as cuda
+ADD ./common/install_cuda.sh install_cuda.sh
+ADD ./common/install_magma.sh install_magma.sh
+ENV CUDA_HOME /usr/local/cuda
+
+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+RUN bash ./install_magma.sh 11.8
+RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
+
+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+RUN bash ./install_magma.sh 12.1
+RUN ln -sf /usr/local/cuda-12.1 /usr/local/cuda
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+RUN bash ./install_magma.sh 12.4
+RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
+
+FROM cpu as rocm
+ARG PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ENV MKLROOT /opt/intel
+# Set the ROCM_PATH env var so that LoadHip.cmake (even with its logic updated for ROCm 6.0)
+# can still find HIP on ROCm 5.7. Not needed for ROCm 6.0 and above.
+# Remove this once ROCm 5.7 drops out of the support matrix.
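+# (LoadHip.cmake reads the ROCM_PATH environment variable and falls back to /opt/rocm when it is unset)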
+ENV ROCM_PATH /opt/rocm
+# No need to install ROCm, as the base docker image should have a full ROCm install
+#ADD ./common/install_rocm.sh install_rocm.sh
+ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
+ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
+# gfortran and python needed for building magma from source for ROCm
+RUN apt-get update -y && \
+    apt-get install gfortran -y && \
+    apt-get install python -y && \
+    apt-get clean
+
+RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
+
+# Install AOTriton
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/aotriton_version.txt aotriton_version.txt
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
+
+FROM ${BASE_TARGET} as final
+COPY --from=openssl /opt/openssl /opt/openssl
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+# Install Anaconda
+COPY --from=conda /opt/conda /opt/conda
+# Install python
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh
new file mode 100755
index 00000000000..4238bb5ef00
--- /dev/null
+++ b/.ci/docker/libtorch/build.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+image="$1"
+shift
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+
+DOCKER_IMAGE="pytorch/${image}"
+
+TOPDIR=$(git rev-parse --show-toplevel)
+
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+WITH_PUSH=${WITH_PUSH:-}
+
+DOCKER=${DOCKER:-docker}
+
+case ${GPU_ARCH_TYPE} in
+    cpu)
+        BASE_TARGET=cpu
+        DOCKER_TAG=cpu
+        GPU_IMAGE=ubuntu:20.04
+        DOCKER_GPU_BUILD_ARG=""
+        ;;
+    cuda)
+        BASE_TARGET=cuda${GPU_ARCH_VERSION}
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        GPU_IMAGE=ubuntu:20.04
+        DOCKER_GPU_BUILD_ARG=""
+        ;;
+    rocm)
+        BASE_TARGET=rocm
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
+        else
+            echo "ERROR: rocm regex failed"
+            exit 1
+        fi
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
+            PYTORCH_ROCM_ARCH+=";gfx942"
+        fi
+        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
+        ;;
+    *)
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
+        exit 1
+        ;;
+esac
+
+
+(
+    set -x
+    DOCKER_BUILDKIT=1 ${DOCKER} build \
+        --target final \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+        "${TOPDIR}/.ci/docker/"
+
+)
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+  (
+    set -x
+    ${DOCKER} push "${DOCKER_IMAGE}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+        ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
+        ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
diff --git a/.ci/docker/manywheel/Dockerfile b/.ci/docker/manywheel/Dockerfile
new file mode 100644
index 00000000000..4162926d9bc
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile
@@ -0,0 +1,203 @@
+# syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=11.8
+
+ARG GPU_IMAGE=centos:7
+FROM centos:7 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+# Note: This patch is required since CentOS has reached EOL;
+# otherwise any yum install step will fail
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
+# the patch is required once again. Somehow this step re-adds mirror.centos.org
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
+    rpm -ivh epel-release-latest-7.noarch.rpm && \
+    rm -f epel-release-latest-7.noarch.rpm
+
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+RUN yum install -y autoconf aclocal automake make sudo
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
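+# (install_cpython.sh added in this PR consumes such a build via its optional WITH_OPENSSL flag, which passes --with-openssl to configure)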
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + +# EPEL for cmake +FROM base as patchelf +# Install patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM patchelf as python +# build python +COPY manywheel/build_scripts /build_scripts +ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh +RUN bash build_scripts/build.sh && rm -r build_scripts + +FROM base as cuda +ARG BASE_CUDA_VERSION=10.2 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum install -y \ + aclocal \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=10.1 +ARG DEVTOOLSET_VERSION=9 +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo + +RUN yum 
install -y yum-utils centos-release-scl +RUN yum-config-manager --enable rhel-server-rhscl-7-rpms +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils +ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake is already installed inside the rocm base image, so remove if present +RUN rpm -e cmake || true +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake + +# ninja +RUN yum install -y ninja-build + +FROM cpu_final as cuda_final +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH + +FROM cpu_final as rocm_final +ARG ROCM_VERSION=3.7 +ARG PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} +# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) +# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above. +# Remove below when ROCm5.7 is not in support matrix anymore. +ENV ROCM_PATH /opt/rocm +ENV MKLROOT /opt/intel +# No need to install ROCm as base docker image should have full ROCm install +#ADD ./common/install_rocm.sh install_rocm.sh +#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh +ADD ./common/install_rocm_drm.sh install_rocm_drm.sh +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +# cmake3 is needed for the MIOpen build +RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3 +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +ADD ./common/install_miopen.sh install_miopen.sh +RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh + +# Install AOTriton +COPY ./common/common_utils.sh common_utils.sh +COPY ./common/aotriton_version.txt aotriton_version.txt +COPY ./common/install_aotriton.sh install_aotriton.sh +RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt +ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton diff --git a/.ci/docker/manywheel/Dockerfile_2014 b/.ci/docker/manywheel/Dockerfile_2014 new file mode 100644 index 00000000000..87f2f78472a --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_2014 @@ -0,0 +1,152 @@ +# syntax = docker/dockerfile:experimental +ARG ROCM_VERSION=3.7 +ARG BASE_CUDA_VERSION=10.2 +ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7 +FROM quay.io/pypa/manylinux2014_x86_64 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel +RUN yum 
install -y yum-utils centos-release-scl sudo +RUN yum-config-manager --enable rhel-server-rhscl-7-rpms +RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils +ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH + +# cmake +RUN yum install -y cmake3 && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + + + +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION=10.2 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum install -y \ + aclocal \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=base /opt/python /opt/python +COPY --from=base /opt/_internal /opt/_internal +COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY 
--from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=10.2 +RUN yum install -y yum-utils centos-release-scl +RUN yum-config-manager --enable rhel-server-rhscl-7-rpms +RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils +ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH + +# cmake +RUN yum install -y cmake3 && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake + +# ninja +RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm +RUN yum install -y ninja-build + +FROM cpu_final as cuda_final +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} + +FROM common as rocm_final +ARG ROCM_VERSION=3.7 +# Install ROCm +ADD ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh +# cmake is already installed inside the rocm base image, but both 2 and 3 exist +# cmake3 is needed for the later MIOpen custom build, so that step is last. +RUN yum install -y cmake3 && \ + rm -f /usr/bin/cmake && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake +ADD ./common/install_miopen.sh install_miopen.sh +RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 new file mode 100644 index 00000000000..a036a576431 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -0,0 +1,153 @@ +# syntax = docker/dockerfile:experimental +ARG ROCM_VERSION=3.7 +ARG BASE_CUDA_VERSION=11.8 +ARG GPU_IMAGE=amd64/almalinux:8 +FROM quay.io/pypa/manylinux_2_28_x86_64 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +ARG DEVTOOLSET_VERSION=11 +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake3 + +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) 
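+# (Illustrative check, not part of the build: a Python linked against this
+# OpenSSL reports it via `python -c "import ssl; print(ssl.OPENSSL_VERSION)"`.)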
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + + +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION=11.8 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +ARG DEVTOOLSET_VERSION=11 +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum -y install epel-release +RUN yum -y update +RUN yum install -y \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \ + glibc-langpack-en + +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=base /opt/python /opt/python +COPY --from=base /opt/_internal /opt/_internal +COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=11.8 +ARG DEVTOOLSET_VERSION=11 +# Ensure the expected devtoolset is used +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake3 + +FROM cpu_final as cuda_final +RUN 
rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
+
+FROM common as rocm_final
+ARG ROCM_VERSION=3.7
+# Install ROCm
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
+RUN yum install -y cmake3 && \
+    rm -f /usr/bin/cmake && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
+
+FROM cpu_final as xpu_final
+# cmake-3.28.4 from pip
+RUN python3 -m pip install --upgrade pip && \
+    python3 -mpip install cmake==3.28.4
+ADD ./common/install_xpu.sh install_xpu.sh
+RUN bash ./install_xpu.sh && rm install_xpu.sh
+RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
new file mode 100644
index 00000000000..cbf2a757010
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@@ -0,0 +1,57 @@
+FROM quay.io/pypa/manylinux_2_28_aarch64 as base
+
+# Graviton needs GCC 10 or above for the build. GCC 12 is the default version in almalinux-8.
+ARG GCCTOOLSET_VERSION=11
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  less \
+  libffi-devel \
+  libgomp \
+  make \
+  openssl-devel \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  zstd \
+  sudo \
+  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+FROM base as final
+
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
diff --git a/.ci/docker/manywheel/Dockerfile_aarch64 b/.ci/docker/manywheel/Dockerfile_aarch64
new file mode 100644
index 00000000000..cb1ddb7c62b
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@@ -0,0 +1,94 @@
+FROM quay.io/pypa/manylinux2014_aarch64 as base
+
+
+# Graviton needs GCC 10 for the build
+ARG DEVTOOLSET_VERSION=10
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install needed OS packages. 
This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+  devtoolset-${DEVTOOLSET_VERSION}-binutils
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+###############################################################################
+# libgfortran.a hack
+#
+# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
+# This causes __stack_chk_guard@@GLIBC_2.17 errors in the PyTorch build. To solve, get
+# ubuntu's libgfortran.a, which is compiled with -fPIC
+# NOTE: Need a better way to get this library, as Ubuntu's package can be removed by the vendor, or changed
+###############################################################################
+RUN cd ~/ \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \
+  && ar x ~/libgfortran-10-dev.deb \
+  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
+  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
+
+# install cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
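+# (SSL_CERT_FILE below points the statically linked OpenSSL at a certifi
+# bundle at /opt/_internal/certs.pem -- the same path that
+# manywheel/build_scripts/build.sh sets up -- since a statically linked
+# OpenSSL has no system CA trust store configured.)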
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
+FROM openssl as final
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
new file mode 100644
index 00000000000..7ffd38f5edc
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@@ -0,0 +1,91 @@
+FROM quay.io/pypa/manylinux_2_28_aarch64 as base
+
+# CUDA ARM build needs GCC 11
+ARG DEVTOOLSET_VERSION=11
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
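+# (Since the openssl/cuda/magma/openblas stages below are independent of one
+# another, BuildKit -- enabled via DOCKER_BUILDKIT=1 in manywheel/build.sh --
+# can build them in parallel before cuda_final assembles the pieces.)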
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +FROM openssl as final +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION +# Install CUDA +ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh +RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh + +FROM base as magma +ARG BASE_CUDA_VERSION +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as openblas +# Install openblas +ADD ./common/install_openblas.sh install_openblas.sh +RUN bash ./install_openblas.sh && rm install_openblas.sh + +FROM final as cuda_final +ARG BASE_CUDA_VERSION +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ +RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH diff --git a/.ci/docker/manywheel/Dockerfile_cxx11-abi b/.ci/docker/manywheel/Dockerfile_cxx11-abi new file mode 100644 index 00000000000..ed33cc61df0 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi @@ -0,0 +1,71 @@ +FROM centos:8 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +# change to a valid repo +RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo +# enable to install ninja-build +RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo + +RUN yum -y update +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo +RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++ + + +FROM base as openssl +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + +# Install python +FROM base as python +RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel +ADD common/install_cpython.sh install_cpython.sh +RUN bash ./install_cpython.sh && rm install_cpython.sh + +FROM base as conda +ADD ./common/install_conda_docker.sh install_conda.sh +RUN bash ./install_conda.sh && rm install_conda.sh +RUN /opt/conda/bin/conda install -y cmake + +FROM base as intel +# Install MKL +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=conda /opt/conda /opt/conda +ENV PATH=/opt/conda/bin:$PATH +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM base as jni +ADD ./common/install_jni.sh install_jni.sh +ADD 
./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM base as final +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=intel /opt/intel /opt/intel +COPY --from=conda /opt/conda /opt/conda +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig + +RUN yum install -y ninja-build diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x new file mode 100644 index 00000000000..5125e3830e8 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -0,0 +1,73 @@ +FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base + +# Language variables +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ENV LANGUAGE=C.UTF-8 + +# Installed needed OS packages. This is to support all +# the binary builds (torch, vision, audio, text, data) +RUN apt update ; apt upgrade -y +RUN apt install -y \ + build-essential \ + autoconf \ + automake \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz-utils \ + less \ + zstd \ + cmake \ + python3 \ + python3-dev \ + python3-setuptools \ + python3-yaml \ + python3-typing-extensions \ + libblas-dev \ + libopenblas-dev \ + liblapack-dev \ + libatlas-base-dev + +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) 
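+# (Illustrative local build, mirroring the cpu-s390x case in
+# .ci/docker/manywheel/build.sh; the image name here is an assumption:
+#   GPU_ARCH_TYPE=cpu-s390x .ci/docker/manywheel/build.sh manylinux-builder:cpu-s390x)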
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +# EPEL for cmake +FROM base as patchelf +# Install patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM patchelf as python +# build python +COPY manywheel/build_scripts /build_scripts +ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh +RUN bash build_scripts/build.sh && rm -r build_scripts + +FROM openssl as final +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh new file mode 100755 index 00000000000..0cfb88ef72f --- /dev/null +++ b/.ci/docker/manywheel/build.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +# Script used only in CD pipeline + +set -eou pipefail + +TOPDIR=$(git rev-parse --show-toplevel) + +image="$1" +shift + +if [ -z "${image}" ]; then + echo "Usage: $0 IMAGE" + exit 1 +fi + +DOCKER_IMAGE="pytorch/${image}" + +DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}" + +GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu} +GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} +MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} +DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} +WITH_PUSH=${WITH_PUSH:-} + +case ${GPU_ARCH_TYPE} in + cpu) + TARGET=cpu_final + DOCKER_TAG=cpu + GPU_IMAGE=centos:7 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" + ;; + cpu-manylinux_2_28) + TARGET=cpu_final + DOCKER_TAG=cpu + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; + cpu-aarch64) + TARGET=final + DOCKER_TAG=cpu-aarch64 + GPU_IMAGE=arm64v8/centos:7 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10" + MANY_LINUX_VERSION="aarch64" + ;; + cpu-aarch64-2_28) + TARGET=final + DOCKER_TAG=cpu-aarch64 + GPU_IMAGE=arm64v8/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28_aarch64" + ;; + cpu-cxx11-abi) + TARGET=final + DOCKER_TAG=cpu-cxx11-abi + GPU_IMAGE="" + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" + MANY_LINUX_VERSION="cxx11-abi" + ;; + cpu-s390x) + TARGET=final + DOCKER_TAG=cpu-s390x + GPU_IMAGE=redhat/ubi9 + DOCKER_GPU_BUILD_ARG="" + MANY_LINUX_VERSION="s390x" + ;; + cuda) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + # Keep this up to date with the minimum version of CUDA we currently support + GPU_IMAGE=centos:7 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9" + ;; + cuda-manylinux_2_28) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; + cuda-aarch64) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + GPU_IMAGE=arm64v8/centos:7 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="aarch64" + DOCKERFILE_SUFFIX="_cuda_aarch64" + ;; + rocm) + TARGET=rocm_final + DOCKER_TAG=rocm${GPU_ARCH_VERSION} + GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" + 
ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)" + if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then + ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0})) + else + echo "ERROR: rocm regex failed" + exit 1 + fi + if [[ $ROCM_VERSION_INT -ge 60000 ]]; then + PYTORCH_ROCM_ARCH+=";gfx942" + fi + DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9" + ;; + xpu) + TARGET=xpu_final + DOCKER_TAG=xpu + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; + *) + echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" + exit 1 + ;; +esac + +IMAGES='' + +if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then + DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} +fi +( + set -x + DOCKER_BUILDKIT=1 docker build \ + ${DOCKER_GPU_BUILD_ARG} \ + --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ + --target "${TARGET}" \ + -t "${DOCKER_IMAGE}" \ + $@ \ + -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ + "${TOPDIR}/.ci/docker/" +) + +GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)} +GIT_BRANCH_NAME=${GITHUB_REF##*/} +GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} +DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME} +DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA} + +if [[ "${WITH_PUSH}" == true ]]; then + ( + set -x + docker push "${DOCKER_IMAGE}" + if [[ -n ${GITHUB_REF} ]]; then + docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG} + docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG} + docker push "${DOCKER_IMAGE_BRANCH_TAG}" + docker push "${DOCKER_IMAGE_SHA_TAG}" + fi + ) +fi diff --git a/.ci/docker/manywheel/build_scripts/build.sh b/.ci/docker/manywheel/build_scripts/build.sh new file mode 100644 index 00000000000..1708b71a19b --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/build.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Top-level build script called from Dockerfile +# Script used only in CD pipeline + +# Stop at any error, show all commands +set -ex + +# openssl version to build, with expected sha256 hash of .tar.gz +# archive +OPENSSL_ROOT=openssl-1.1.1l +OPENSSL_HASH=0b7a3e5e59c34827fe0c3a74b7ec8baef302b98fa80088d7f9153aa16fa76bd1 +DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc +PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb +CURL_ROOT=curl-7.73.0 +CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131 +AUTOCONF_ROOT=autoconf-2.69 +AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 + +# Get build utilities +MY_DIR=$(dirname "${BASH_SOURCE[0]}") +source $MY_DIR/build_utils.sh + +if [ "$(uname -m)" != "s390x" ] ; then + # Dependencies for compiling Python that we want to remove from + # the final image after compiling Python + PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel" + + # Libraries that are allowed as part of the manylinux1 profile + MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" + + # Development tools and libraries + yum -y install bzip2 make git patch unzip bison yasm diffutils \ + automake which file cmake28 \ + kernel-devel-`uname -r` \ + ${PYTHON_COMPILE_DEPS} +else + # Dependencies for 
compiling Python that we want to remove from + # the final image after compiling Python + PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev" + + # Libraries that are allowed as part of the manylinux1 profile + MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev" + + # Development tools and libraries + apt install -y bzip2 make git patch unzip diffutils \ + automake which file cmake \ + linux-headers-virtual \ + ${PYTHON_COMPILE_DEPS} +fi + +# Install newest autoconf +build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH +autoconf --version + +# Compile the latest Python releases. +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +build_openssl $OPENSSL_ROOT $OPENSSL_HASH +/build_scripts/install_cpython.sh + +PY39_BIN=/opt/python/cp39-cp39/bin + +# Our openssl doesn't know how to find the system CA trust store +# (https://github.com/pypa/manylinux/issues/53) +# And it's not clear how up-to-date that is anyway +# So let's just use the same one pip and everyone uses +$PY39_BIN/pip install certifi +ln -s $($PY39_BIN/python -c 'import certifi; print(certifi.where())') \ + /opt/_internal/certs.pem +# If you modify this line you also have to modify the versions in the +# Dockerfiles: +export SSL_CERT_FILE=/opt/_internal/certs.pem + +# Install newest curl +build_curl $CURL_ROOT $CURL_HASH +rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc +hash -r +curl --version +curl-config --features + +# Install patchelf (latest with unreleased bug fixes) +curl -sLOk https://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.gz +# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH +tar -xzf patchelf-0.10.tar.gz +(cd patchelf-0.10 && ./configure && make && make install) +rm -rf patchelf-0.10.tar.gz patchelf-0.10 + +# Install latest pypi release of auditwheel +$PY39_BIN/pip install auditwheel +ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel + +# Clean up development headers and other unnecessary stuff for +# final image +if [ "$(uname -m)" != "s390x" ] ; then + yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ + avahi freetype bitstream-vera-fonts \ + ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 + yum -y install ${MANYLINUX1_DEPS} + yum -y clean all > /dev/null 2>&1 + yum list installed +else + apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 +fi +# we don't need libpython*.a, and they're many megabytes +find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f +# Strip what we can -- and ignore errors, because this just attempts to strip +# *everything*, including non-ELF files: +find /opt/_internal -type f -print0 \ + | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true +# We do not need the Python test suites, or indeed the precompiled .pyc and +# .pyo files. Partially cribbed from: +# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile +find /opt/_internal \ + \( -type d -a -name test -o -name tests \) \ + -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ + -print0 | xargs -0 rm -f + +for PYTHON in /opt/python/*/bin/python; do + # Smoke test to make sure that our Pythons work, and do indeed detect as + # being manylinux compatible: + $PYTHON $MY_DIR/manylinux1-check.py + # Make sure that SSL cert checking works + $PYTHON $MY_DIR/ssl-check.py +done + +# Fix libc headers to remain compatible with C99 compilers. 
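+# (glibc's old headers declare functions `extern inline`, whose meaning changed
+# between GNU89 and C99; the gnu_inline attribute added below restores the
+# GNU89 semantics so the headers keep working with C99 compilers.)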
+find /usr/include/ -type f -exec sed -i 's/\bextern _*inline_*\b/extern __inline __attribute__ ((__gnu_inline__))/g' {} + + +# Now we can delete our built SSL +rm -rf /usr/local/ssl diff --git a/.ci/docker/manywheel/build_scripts/build_utils.sh b/.ci/docker/manywheel/build_scripts/build_utils.sh new file mode 100755 index 00000000000..279a7b17a52 --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/build_utils.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Helper utilities for build +# Script used only in CD pipeline + +OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ +CURL_DOWNLOAD_URL=https://curl.askapache.com/download + +AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf + + +function check_var { + if [ -z "$1" ]; then + echo "required variable not defined" + exit 1 + fi +} + + +function do_openssl_build { + ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null + make > /dev/null + make install > /dev/null +} + + +function check_sha256sum { + local fname=$1 + check_var ${fname} + local sha256=$2 + check_var ${sha256} + + echo "${sha256} ${fname}" > ${fname}.sha256 + sha256sum -c ${fname}.sha256 + rm -f ${fname}.sha256 +} + + +function build_openssl { + local openssl_fname=$1 + check_var ${openssl_fname} + local openssl_sha256=$2 + check_var ${openssl_sha256} + check_var ${OPENSSL_DOWNLOAD_URL} + curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz + check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256} + tar -xzf ${openssl_fname}.tar.gz + (cd ${openssl_fname} && do_openssl_build) + rm -rf ${openssl_fname} ${openssl_fname}.tar.gz +} + + +function do_curl_build { + LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_curl { + local curl_fname=$1 + check_var ${curl_fname} + local curl_sha256=$2 + check_var ${curl_sha256} + check_var ${CURL_DOWNLOAD_URL} + curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2 + check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256} + tar -jxf ${curl_fname}.tar.bz2 + (cd ${curl_fname} && do_curl_build) + rm -rf ${curl_fname} ${curl_fname}.tar.bz2 +} + + +function do_standard_install { + ./configure > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_autoconf { + local autoconf_fname=$1 + check_var ${autoconf_fname} + local autoconf_sha256=$2 + check_var ${autoconf_sha256} + check_var ${AUTOCONF_DOWNLOAD_URL} + curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz + check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256} + tar -zxf ${autoconf_fname}.tar.gz + (cd ${autoconf_fname} && do_standard_install) + rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz +} diff --git a/.ci/docker/manywheel/build_scripts/manylinux1-check.py b/.ci/docker/manywheel/build_scripts/manylinux1-check.py new file mode 100644 index 00000000000..bdc8f610298 --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/manylinux1-check.py @@ -0,0 +1,60 @@ +# Logic copied from PEP 513 + + +def is_manylinux1_compatible(): + # Only Linux, and only x86-64 / i686 + from distutils.util import get_platform + + if get_platform() not in ["linux-x86_64", "linux-i686", "linux-s390x"]: + return False + + # Check for presence of _manylinux module + try: + import _manylinux + + return bool(_manylinux.manylinux1_compatible) + except (ImportError, AttributeError): + # Fall through to heuristic check below + pass + + # Check glibc version. CentOS 5 uses glibc 2.5. 
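+    # Heuristic: manylinux1 targets CentOS 5, so a glibc of 2.5 or newer
+    # within the same major version (e.g. 2.17 on CentOS 7) is treated as
+    # compatible.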
+ return have_compatible_glibc(2, 5) + + +def have_compatible_glibc(major, minimum_minor): + import ctypes + + process_namespace = ctypes.CDLL(None) + try: + gnu_get_libc_version = process_namespace.gnu_get_libc_version + except AttributeError: + # Symbol doesn't exist -> therefore, we are not linked to + # glibc. + return False + + # Call gnu_get_libc_version, which returns a string like "2.5". + gnu_get_libc_version.restype = ctypes.c_char_p + version_str = gnu_get_libc_version() + # py2 / py3 compatibility: + if not isinstance(version_str, str): + version_str = version_str.decode("ascii") + + # Parse string and check against requested version. + version = [int(piece) for piece in version_str.split(".")] + assert len(version) == 2 + if major != version[0]: + return False + if minimum_minor > version[1]: + return False + return True + + +import sys + + +if is_manylinux1_compatible(): + print(f"{sys.executable} is manylinux1 compatible") + sys.exit(0) +else: + print(f"{sys.executable} is NOT manylinux1 compatible") + sys.exit(1) diff --git a/.ci/docker/manywheel/build_scripts/ssl-check.py b/.ci/docker/manywheel/build_scripts/ssl-check.py new file mode 100644 index 00000000000..b1df3e1346f --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/ssl-check.py @@ -0,0 +1,35 @@ +# cf. https://github.com/pypa/manylinux/issues/53 + +GOOD_SSL = "https://google.com" +BAD_SSL = "https://self-signed.badssl.com" + +import sys + + +print("Testing SSL certificate checking for Python:", sys.version) + +if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4): + print("This version never checks SSL certs; skipping tests") + sys.exit(0) + +if sys.version_info[0] >= 3: + from urllib.request import urlopen + + EXC = OSError +else: + from urllib import urlopen + + EXC = IOError + +print(f"Connecting to {GOOD_SSL} should work") +urlopen(GOOD_SSL) +print("...it did, yay.") + +print(f"Connecting to {BAD_SSL} should fail") +try: + urlopen(BAD_SSL) + # If we get here then we failed: + print("...it DIDN'T!!!!!11!!1one!") + sys.exit(1) +except EXC: + print("...it did, yay.") diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index da85d65cc3c..18225b8da39 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -9,7 +9,9 @@ self-hosted-runner: - linux.large - linux.2xlarge - linux.4xlarge + - linux.9xlarge.ephemeral - linux.12xlarge + - linux.12xlarge.ephemeral - linux.24xlarge - linux.arm64.2xlarge - linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/build-conda-images.yml b/.github/workflows/build-conda-images.yml new file mode 100644 index 00000000000..f77cf7cef38 --- /dev/null +++ b/.github/workflows/build-conda-images.yml @@ -0,0 +1,64 @@ +name: Build conda docker images + +on: + workflow_dispatch: + push: + branches: + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - conda/Dockerfile + - 'common/*' + - .github/workflows/build-conda-images.yml + pull_request: + paths: + - conda/Dockerfile + - 'common/*' + - .github/workflows/build-conda-images.yml + +env: + DOCKER_REGISTRY: "docker.io" + DOCKER_BUILDKIT: 1 + DOCKER_ID: ${{ secrets.DOCKER_ID }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number 
|| github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-docker: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["11.8", "12.1", "12.4", "cpu"] + env: + CUDA_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: conda-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}} + docker-build-dir: .ci/docker/conda + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/conda/build.sh conda-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml new file mode 100644 index 00000000000..baab800cfc4 --- /dev/null +++ b/.github/workflows/build-libtorch-images.yml @@ -0,0 +1,120 @@ +name: Build libtorch docker images + +on: + push: + branches: + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - '.ci/docker/libtorch/*' + - '.ci/docker/common/*' + - .github/workflows/build-libtorch-images.yml + pull_request: + paths: + - '.ci/docker/libtorch/*' + - '.ci/docker/common/*' + - .github/workflows/build-libtorch-images.yml + +env: + DOCKER_REGISTRY: "docker.io" + DOCKER_BUILDKIT: 1 + DOCKER_ID: ${{ secrets.DOCKER_ID }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-docker-cuda: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/libtorch + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}} + build-docker-rocm: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + rocm_version: ["6.0", "6.1"] + env: + GPU_ARCH_TYPE: rocm + GPU_ARCH_VERSION: ${{ matrix.rocm_version }} + steps: + - name: Checkout PyTorch + uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}} + docker-build-dir: .ci/docker/libtorch + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}} + build-docker-cpu: + runs-on: linux.9xlarge.ephemeral + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: libtorch-cxx11-builder-cpu + docker-build-dir: .ci/docker/libtorch + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml new file mode 100644 index 00000000000..2ba8cbd9ae6 --- /dev/null +++ b/.github/workflows/build-manywheel-images.yml @@ -0,0 +1,322 @@ +name: Build manywheel docker images + +on: + workflow_dispatch: + push: + branches: + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - '.ci/docker/manywheel/*' + - '.ci/docker/common/*' + - .github/workflows/build-manywheel-images.yml + pull_request: + paths: + - '.ci/docker/manywheel/*' + - '.ci/docker/common/*' + - .github/workflows/build-manywheel-images.yml + + +env: + DOCKER_REGISTRY: "docker.io" + DOCKER_BUILDKIT: 1 + DOCKER_ID: ${{ secrets.DOCKER_ID }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-docker-cuda: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Purge tools folder (free space for build) + run: rm -rf /opt/hostedtoolcache + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo 
"${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux-builder:cuda${{matrix.cuda_version}} + # NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649 + build-docker-cuda-manylinux_2_28: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda-manylinux_2_28 + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Purge tools folder (free space for build) + run: rm -rf /opt/hostedtoolcache + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}} + build-docker-cuda-aarch64: + runs-on: linux.arm64.2xlarge + strategy: + matrix: + cuda_version: ["12.4"] + env: + GPU_ARCH_TYPE: cuda-aarch64 + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Checkout PyTorch + uses: actions/checkout@v3 + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}} + build-docker-rocm: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + rocm_version: ["6.0", "6.1"] + env: + GPU_ARCH_TYPE: rocm + GPU_ARCH_VERSION: ${{ matrix.rocm_version }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux-builder-rocm${{matrix.rocm_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}} + build-docker-cpu: + runs-on: linux.9xlarge.ephemeral + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main 
+  build-docker-cpu:
+    runs-on: linux.9xlarge.ephemeral
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux-builder-cpu
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux-builder:cpu
+  build-docker-cpu-manylinux_2_28:
+    runs-on: linux.9xlarge.ephemeral
+    env:
+      GPU_ARCH_TYPE: cpu-manylinux_2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28-builder-cpu
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
+  build-docker-cpu-aarch64:
+    runs-on: linux.arm64.2xlarge
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinuxaarch64-builder-cpu-aarch64
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
+  build-docker-cpu-aarch64-2_28:
+    runs-on: linux.arm64.2xlarge
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64-2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
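+  # NOTE: for local debugging, a build step like the ones in these jobs can
+  # be reproduced outside CI by exporting the job's env and calling the same
+  # entry point. A hypothetical invocation (assuming a Docker-capable Linux
+  # host, a full pytorch/pytorch checkout, and that build.sh derives its
+  # target from the tag and the GPU_ARCH_* variables as the env blocks here
+  # suggest), e.g. for the cxx11-abi job below:
+  #   export GPU_ARCH_TYPE=cpu-cxx11-abi
+  #   .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi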
"${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi + build-docker-xpu: + runs-on: linux.9xlarge.ephemeral + env: + GPU_ARCH_TYPE: xpu + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux2_28-builder-xpu + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu diff --git a/.lintrunner.toml b/.lintrunner.toml index 2b6aaa3037b..ce41c89338e 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -417,6 +417,7 @@ exclude_patterns = [ 'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h', 'test/cpp/jit/upgrader_models/*.ptl', 'test/cpp/jit/upgrader_models/*.ptl.ff', + '.ci/docker/common/install_rocm_drm.sh', '.lintrunner.toml', ] command = [