diff --git a/.ci/docker/README.md b/.ci/docker/README.md
index 90dcb7c6e6f..68df3076315 100644
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@@ -1,4 +1,4 @@
-# Docker images for GitHub CI
+# Docker images for GitHub CI and CD
 
 This directory contains everything needed to build the Docker images
 that are used in our CI.
@@ -12,7 +12,7 @@ each image as the `BUILD_ENVIRONMENT` environment variable.
 
 See `build.sh` for valid build environments (it's the giant switch).
 
-## Contents
+## Docker CI builds
 
 * `build.sh` -- dispatch script to launch all builds
 * `common` -- scripts used to execute individual Docker build stages
@@ -21,6 +21,12 @@ See `build.sh` for valid build environments (it's the giant switch).
 * `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
 * `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support
 
+### Docker CD builds
+
+* `conda` -- Dockerfile and build.sh to build Docker images used in nightly conda builds
+* `manywheel` -- Dockerfile and build.sh to build Docker images used in nightly manywheel builds
+* `libtorch` -- Dockerfile and build.sh to build Docker images used in nightly libtorch builds
+
 ## Usage
 
 ```bash
diff --git a/.ci/docker/common/aotriton_version.txt b/.ci/docker/common/aotriton_version.txt
new file mode 100644
index 00000000000..00f3f90cb78
--- /dev/null
+++ b/.ci/docker/common/aotriton_version.txt
@@ -0,0 +1,5 @@
+0.6b
+manylinux_2_17
+rocm6.1
+04b5df8c8123f90cba3ede7e971e6fbc6040d506
+77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1
diff --git a/.ci/docker/common/install_conda_docker.sh b/.ci/docker/common/install_conda_docker.sh
new file mode 100755
index 00000000000..dc377075750
--- /dev/null
+++ b/.ci/docker/common/install_conda_docker.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Script used only in CD pipeline
+set -ex
+
+# Anaconda
+# The latest Anaconda uses openssl-3, which is incompatible with all currently published versions of git,
+# which use openssl-1.1.1; see https://anaconda.org/anaconda/git/files?version=2.40.1 for example
+MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh
+wget -q $MINICONDA_URL
+# NB: Manually invoke bash per https://github.com/conda/conda/issues/10431
+bash $(basename "$MINICONDA_URL") -b -p /opt/conda
+rm $(basename "$MINICONDA_URL")
+export PATH=/opt/conda/bin:$PATH
+# See https://github.com/pytorch/builder/issues/1473
+# Pin conda to 23.5.2 as it's the last one compatible with openssl-1.1.1
+conda install -y conda=23.5.2 conda-build anaconda-client git ninja
+# The cmake version here needs to match the minimum version of cmake
+# supported by PyTorch (3.18); only 3.18.2 is available on Anaconda
+/opt/conda/bin/pip3 install cmake==3.18.2
+conda remove -y --force patchelf
diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh
new file mode 100755
index 00000000000..1bd25fb2de9
--- /dev/null
+++ b/.ci/docker/common/install_cpython.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Script used only in CD pipeline
+set -uex -o pipefail
+
+PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
+PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
+GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
+
+# Python versions to be installed in /opt/$VERSION_NO
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0"}
+
+function check_var {
+    if [ -z "$1" ]; then
+        echo "required variable not defined"
+        exit 1
+    fi
+}
+
+function do_cpython_build {
+    local py_ver=$1
+    local py_folder=$2
+    check_var $py_ver
+    check_var $py_folder
+    tar -xzf Python-$py_ver.tgz
+    pushd $py_folder
+
+    local prefix="/opt/_internal/cpython-${py_ver}"
+    mkdir -p ${prefix}/lib
+    if [[ -n $(which patchelf) ]]; then
+        local shared_flags="--enable-shared"
+    else
+        local shared_flags="--disable-shared"
+    fi
+    if [[ -z "${WITH_OPENSSL+x}" ]]; then
+        local openssl_flags=""
+    else
+        local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto"
+    fi
+
+    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} > /dev/null
+
+    make -j40 > /dev/null
+    make install > /dev/null
+
+    if [[ "${shared_flags}" == "--enable-shared" ]]; then
+        patchelf --set-rpath '$ORIGIN/../lib' ${prefix}/bin/python3
+    fi
+
+    popd
+    rm -rf $py_folder
+    # Some Pythons install as bin/python3. Make them available as
+    # bin/python.
+    if [ -e ${prefix}/bin/python3 ]; then
+        ln -s python3 ${prefix}/bin/python
+    fi
+    ${prefix}/bin/python get-pip.py
+    if [ -e ${prefix}/bin/pip3 ] && [ ! -e ${prefix}/bin/pip ]; then
+        ln -s pip3 ${prefix}/bin/pip
+    fi
+    ${prefix}/bin/pip install wheel==0.34.2
+    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
+    ln -s ${prefix} /opt/python/${abi_tag}
+}
+
+function build_cpython {
+    local py_ver=$1
+    check_var $py_ver
+    check_var $PYTHON_DOWNLOAD_URL
+    local py_ver_folder=$py_ver
+    if [ "$py_ver" = "3.13.0" ]; then
+        PY_VER_SHORT="3.13"
+        check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
+        wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
+        do_cpython_build $py_ver cpython-$PY_VER_SHORT
+    else
+        wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz
+        do_cpython_build $py_ver Python-$py_ver
+    fi
+
+    rm -f Python-$py_ver.tgz
+}
+
+function build_cpythons {
+    check_var $GET_PIP_URL
+    curl -sLO $GET_PIP_URL
+    for py_ver in $@; do
+        build_cpython $py_ver
+    done
+    rm -f get-pip.py
+}
+
+mkdir -p /opt/python
+mkdir -p /opt/_internal
+build_cpythons $CPYTHON_VERSIONS
diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
new file mode 100644
index 00000000000..2088447a7ef
--- /dev/null
+++ b/.ci/docker/common/install_cuda.sh
@@ -0,0 +1,239 @@
+#!/bin/bash
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+CUDNN_VERSION=9.1.0.70
+
+function install_cusparselt_040 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_118 {
+    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
+    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
+    # install CUDA 11.8.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+    chmod +x cuda_11.8.0_520.61.05_linux.run
+    ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
+    rm -f cuda_11.8.0_520.61.05_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_040
+
+    ldconfig
+}
+
+function install_121 {
+    echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.1 /usr/local/cuda
+    # install CUDA 12.1.1 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
+    chmod +x cuda_12.1.1_530.30.02_linux.run
+    ./cuda_12.1.1_530.30.02_linux.run --toolkit --silent
+    rm -f cuda_12.1.1_530.30.02_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function install_124 {
+    echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+    # install CUDA 12.4.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+    chmod +x cuda_12.4.0_550.54.14_linux.run
+    ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent
+    rm -f cuda_12.4.0_550.54.14_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
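+    # cuDNN headers and libs are now staged under /usr/local/cuda, so the extracted archive can be removed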
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function prune_118 {
+    echo "Pruning CUDA 11.8 and cuDNN"
+    #####################################################################################
+    # CUDA 11.8 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"
+
+    export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 11.8 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-11.8/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
+}
+
+function prune_121 {
+    echo "Pruning CUDA 12.1"
+    #####################################################################################
+    # CUDA 12.1 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.1 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.1/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 $CUDA_BASE/nsight-systems-2023.1.2/
+}
+
+function prune_124 {
+    echo "Pruning CUDA 12.4"
+    #####################################################################################
+    # CUDA 12.4 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+    if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+        export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.4 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.4/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    11.8) install_118; prune_118
+        ;;
+    12.1) install_121; prune_121
+        ;;
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
diff --git a/.ci/docker/common/install_cuda_aarch64.sh b/.ci/docker/common/install_cuda_aarch64.sh
new file mode 100644
index 00000000000..7e503869761
--- /dev/null
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_124 {
+    echo "Installing CUDA 12.4 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+    # install CUDA 12.4.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
+    chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run
+    ./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent
+    rm -f cuda_12.4.0_550.54.14_linux_sbsa.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+    tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+    cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
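+    # NCCL headers and libs have been copied into /usr/local/cuda, so the source checkout can be dropped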
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function prune_124 {
+    echo "Pruning CUDA 12.4"
+    #####################################################################################
+    # CUDA 12.4 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.4 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.4/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
diff --git a/.ci/docker/common/install_libpng.sh b/.ci/docker/common/install_libpng.sh
new file mode 100644
index 00000000000..32453411ae9
--- /dev/null
+++ b/.ci/docker/common/install_libpng.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+LIBPNG_VERSION=1.6.37
+
+mkdir -p libpng
+pushd libpng
+
+wget http://download.sourceforge.net/libpng/libpng-$LIBPNG_VERSION.tar.gz
+tar -xvzf libpng-$LIBPNG_VERSION.tar.gz
+
+pushd libpng-$LIBPNG_VERSION
+
+./configure
+make
+make install
+
+popd
+
+popd
+rm -rf libpng
diff --git a/.ci/docker/common/install_magma.sh b/.ci/docker/common/install_magma.sh
new file mode 100644
index 00000000000..d0c6f67773d
--- /dev/null
+++ b/.ci/docker/common/install_magma.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+MAGMA_VERSION="2.5.2"
+
+function do_install() {
+    cuda_version=$1
+    cuda_version_nodot=${1/./}
+
+    MAGMA_VERSION="2.6.1"
+    magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+    cuda_dir="/usr/local/cuda-${cuda_version}"
+    (
+        set -x
+        tmp_dir=$(mktemp -d)
+        pushd ${tmp_dir}
+        curl -OLs https://anaconda.org/pytorch/magma-cuda${cuda_version_nodot}/${MAGMA_VERSION}/download/linux-64/${magma_archive}
+        tar -xvf "${magma_archive}"
+        mkdir -p "${cuda_dir}/magma"
+        mv include "${cuda_dir}/magma/include"
+        mv lib "${cuda_dir}/magma/lib"
+        popd
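+        # magma for this CUDA version now lives under ${cuda_dir}/magma/{include,lib}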
+    )
+}
+
+do_install $1
diff --git a/.ci/docker/common/install_miopen.sh b/.ci/docker/common/install_miopen.sh
new file mode 100644
index 00000000000..5f2062414f2
--- /dev/null
+++ b/.ci/docker/common/install_miopen.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+ROCM_VERSION=$1
+
+if [[ -z $ROCM_VERSION ]]; then
+    echo "missing ROCM_VERSION"
+    exit 1;
+fi
+
+# To make version comparison easier, create an integer representation.
+save_IFS="$IFS"
+IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
+IFS="$save_IFS"
+if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=0
+elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
+
+# Install custom MIOpen + COMgr for ROCm >= 4.0.1
+if [[ $ROCM_INT -lt 40001 ]]; then
+    echo "ROCm version < 4.0.1; will not install custom MIOpen"
+    exit 0
+fi
+
+# Function to retry functions that sometimes timeout or have flaky failures
+retry () {
+    $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+
+# Build custom MIOpen to use comgr for offline compilation.
+
+## Need a sanitized ROCM_VERSION without patchlevel; patchlevel version 0 must be added to paths.
+ROCM_DOTS=$(echo ${ROCM_VERSION} | tr -d -c '.' | wc -c)
+if [[ ${ROCM_DOTS} == 1 ]]; then
+    ROCM_VERSION_NOPATCH="${ROCM_VERSION}"
+    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}.0"
+else
+    ROCM_VERSION_NOPATCH="${ROCM_VERSION%.*}"
+    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
+fi
+
+# MIOPEN_USE_HIP_KERNELS is a workaround for COMgr issues
+MIOPEN_CMAKE_COMMON_FLAGS="
+-DMIOPEN_USE_COMGR=ON
+-DMIOPEN_BUILD_DRIVER=OFF
+"
+# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
+if [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
+    echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
+    echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
+    echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
+elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
+elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
+elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
+elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
+elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+
+yum remove -y miopen-hip
+
+git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
+pushd MIOpen
+# remove .git to save disk space since CI runner was running out
+rm -rf .git
+# Don't build MLIR to save docker build time
+# since we are disabling MLIR backend for MIOpen anyway
+if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    sed -i '/rocMLIR/d' requirements.txt
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    sed -i '/llvm-project-mlir/d' requirements.txt
+fi
+## MIOpen minimum requirements
+cmake -P install_deps.cmake --minimum
+
+# clean up since CI runner was running out of disk space
+rm -rf /tmp/*
+yum clean all
+rm -rf /var/cache/yum
+rm -rf /var/lib/yum/yumdb
+rm -rf /var/lib/yum/history
+
+## Build MIOpen
+mkdir -p build
+cd build
+PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
+    ${MIOPEN_CMAKE_COMMON_FLAGS} \
+    ${MIOPEN_CMAKE_DB_FLAGS} \
+    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
+make MIOpen -j $(nproc)
+
+# Build MIOpen package
+make -j $(nproc) package
+
+# clean up since CI runner was running out of disk space
+rm -rf /usr/local/cget
+
+yum install -y miopen-*.rpm
+
+popd
+rm -rf MIOpen
diff --git a/.ci/docker/common/install_mkl.sh b/.ci/docker/common/install_mkl.sh
new file mode 100644
index 00000000000..d5d139266b9
--- /dev/null
+++ b/.ci/docker/common/install_mkl.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -ex
+
+# MKL
+MKL_VERSION=2024.2.0
+
+MKLROOT=/opt/intel
+mkdir -p ${MKLROOT}
+pushd /tmp
+
+python3 -mpip install wheel
+python3 -mpip download -d . mkl-static==${MKL_VERSION}
+python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
+python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
+mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}
+mv mkl_include-${MKL_VERSION}/mkl_include-${MKL_VERSION}.data/data/include ${MKLROOT}
diff --git a/.ci/docker/common/install_mnist.sh b/.ci/docker/common/install_mnist.sh
new file mode 100644
index 00000000000..4c009d9db02
--- /dev/null
+++ b/.ci/docker/common/install_mnist.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+mkdir -p /usr/local/mnist/
+
+cd /usr/local/mnist
+
+for img in train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz; do
+    wget -q https://ossci-datasets.s3.amazonaws.com/mnist/$img
+    gzip -d $img
+done
diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh
new file mode 100644
index 00000000000..80e4fb0f755
--- /dev/null
+++ b/.ci/docker/common/install_openblas.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+cd /
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules
+
+
+OPENBLAS_BUILD_FLAGS="
+NUM_THREADS=128
+USE_OPENMP=1
+NO_SHARED=0
+DYNAMIC_ARCH=1
+TARGET=ARMV8
+CFLAGS=-O3
+"
+
+OPENBLAS_CHECKOUT_DIR="OpenBLAS"
+
+make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
+make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
diff --git a/.ci/docker/common/install_patchelf.sh b/.ci/docker/common/install_patchelf.sh
new file mode 100644
index 00000000000..8f2ef5a26fd
--- /dev/null
+++ b/.ci/docker/common/install_patchelf.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+# Pin the version to the latest release, 0.17.2; building a newer commit starts
+# to fail on the current image
+git clone -b 0.17.2 --single-branch https://github.com/NixOS/patchelf
+cd patchelf
+sed -i 's/serial/parallel/g' configure.ac
+./bootstrap.sh
+./configure
+make
+make install
+cd ..
+rm -rf patchelf
diff --git a/.ci/docker/common/install_rocm_drm.sh b/.ci/docker/common/install_rocm_drm.sh
new file mode 100644
index 00000000000..a6c73560c1a
--- /dev/null
+++ b/.ci/docker/common/install_rocm_drm.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+###########################
+### prereqs
+###########################
+# Install Python packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+    ubuntu)
+        apt-get update -y
+        apt-get install -y libpciaccess-dev pkg-config
+        apt-get clean
+        ;;
+    centos)
+        yum install -y libpciaccess-devel pkgconfig
+        ;;
+    *)
+        echo "Unable to determine OS..."
+        exit 1
+        ;;
+esac
+python3 -m pip install meson ninja
+
+###########################
+### clone repo
+###########################
+GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+pushd drm
+
+###########################
+### patch
+###########################
+patch -p1 <<'EOF'
+diff --git a/amdgpu/amdgpu_asic_id.c b/amdgpu/amdgpu_asic_id.c
+index a5007ffc..13fa07fc 100644
+--- a/amdgpu/amdgpu_asic_id.c
++++ b/amdgpu/amdgpu_asic_id.c
+@@ -22,6 +22,13 @@
+  *
+  */
+ 
++#define _XOPEN_SOURCE 700
++#define _LARGEFILE64_SOURCE
++#define _FILE_OFFSET_BITS 64
++#include <ftw.h>
++#include <link.h>
++#include <limits.h>
++
+ #include <ctype.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+@@ -34,6 +41,19 @@
+ #include "amdgpu_drm.h"
+ #include "amdgpu_internal.h"
+ 
++static char *amdgpuids_path = NULL;
++static const char* amdgpuids_path_msg = NULL;
++
++static int check_for_location_of_amdgpuids(const char *filepath, const struct stat *info, const int typeflag, struct FTW *pathinfo)
++{
++    if (typeflag == FTW_F && strstr(filepath, "amdgpu.ids")) {
++        amdgpuids_path = strdup(filepath);
++        return 1;
++    }
++
++    return 0;
++}
++
+ static int parse_one_line(struct amdgpu_device *dev, const char *line)
+ {
+     char *buf, *saveptr;
+@@ -113,10 +133,46 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+     int line_num = 1;
+     int r = 0;
+ 
++    // attempt to find typical location for amdgpu.ids file
+     fp = fopen(AMDGPU_ASIC_ID_TABLE, "r");
++
++    // if it doesn't exist, search
++    if (!fp) {
++
++    char self_path[ PATH_MAX ];
++    ssize_t count;
++    ssize_t i;
++
++    count = readlink( "/proc/self/exe", self_path, PATH_MAX );
++    if (count > 0) {
++        self_path[count] = '\0';
++
++        // remove '/bin/python' from self_path
++        for (i=count; i>0; --i) {
++            if (self_path[i] == '/') break;
++            self_path[i] = '\0';
++        }
++        self_path[i] = '\0';
++        for (; i>0; --i) {
++            if (self_path[i] == '/') break;
++            self_path[i] = '\0';
++        }
++        self_path[i] = '\0';
++
++        if (1 == nftw(self_path, check_for_location_of_amdgpuids, 5, FTW_PHYS)) {
++            fp = fopen(amdgpuids_path, "r");
++            amdgpuids_path_msg = amdgpuids_path;
++        }
++    }
++
++    }
++    else {
++        amdgpuids_path_msg = AMDGPU_ASIC_ID_TABLE;
++    }
++
++    // both hard-coded location and search have failed
+     if (!fp) {
+-        fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
+-            strerror(errno));
++        fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+         return;
+     }
+ 
+@@ -132,7 +188,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+             continue;
+         }
+ 
+-        drmMsg("%s version: %s\n", AMDGPU_ASIC_ID_TABLE, line);
++        drmMsg("%s version: %s\n", amdgpuids_path_msg, line);
+         break;
+     }
+ 
+@@ -150,7 +206,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+ 
+     if (r == -EINVAL) {
+         fprintf(stderr, "Invalid format: %s: line %d: %s\n",
+-            AMDGPU_ASIC_ID_TABLE, line_num, line);
++            amdgpuids_path_msg, line_num, line);
+     } else if (r && r != -EAGAIN) {
+         fprintf(stderr, "%s: Cannot parse ASIC IDs: %s\n",
+             __func__, strerror(-r));
+EOF
+
+###########################
+### build
+###########################
+meson builddir --prefix=/opt/amdgpu
+pushd builddir
+ninja install
+
+popd
+popd
diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh
index 94b94661c46..fc3d49a309b 100644
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@@ -1,7 +1,11 @@
 #!/bin/bash
+# Script used in CI and CD pipeline
 
 set -ex
+
+MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
+
 # "install" hipMAGMA into /opt/rocm/magma by copying after build
 git clone https://bitbucket.org/icl/magma.git
 pushd magma
@@ -11,7 +15,10 @@ git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
 
 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
 echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc
+if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
+    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
+fi
+echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
 echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
 export PATH="${PATH}:/opt/rocm/bin"
 if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
@@ -25,7 +32,7 @@ done
 # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
 sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
 make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
-make testing/testing_dgemm -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
+LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
+make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
 popd
 mv magma /opt/rocm
diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh
index aa308010326..7902200bdfb 100644
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -xe
-
+# Script used in CI and CD pipeline
 # Intel® software for general purpose GPU capabilities.
 # Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
@@ -8,19 +8,23 @@
 # Users should update to the latest version as it becomes available
 
 function install_ubuntu() {
+    . /etc/os-release
+    if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
+        echo "Ubuntu version ${VERSION_CODENAME} not supported"
+        exit
+    fi
+
     apt-get update -y
     apt-get install -y gpg-agent wget
-
-    # Set up the repository. To do this, download the key to the system keyring
+    # To add the online network package repository for the GPU Driver LTS releases
     wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
-        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
-
-    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
+        | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
     echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
-        https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
-        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
+        https://repositories.intel.com/gpu/ubuntu ${VERSION_CODENAME}/lts/2350 unified" \
+        | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
+    # To add the online network package repository for the Intel Support Packages
+    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+        | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
     echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
         https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
         | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
@@ -97,6 +101,86 @@ EOF
     rm -rf /var/lib/yum/history
 }
 
+function install_rhel() {
+    . /etc/os-release
+    if [[ "${ID}" == "rhel" ]]; then
+        if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+            echo "RHEL version ${VERSION_ID} not supported"
+            exit
+        fi
+    elif [[ "${ID}" == "almalinux" ]]; then
+        # Workaround for almalinux8, which is used by quay.io/pypa/manylinux_2_28_x86_64
+        VERSION_ID="8.6"
+    fi
+
+    dnf install -y 'dnf-command(config-manager)'
+    # To add the online network package repository for the GPU Driver LTS releases
+    dnf config-manager --add-repo \
+        https://repositories.intel.com/gpu/rhel/${VERSION_ID}/lts/2350/unified/intel-gpu-${VERSION_ID}.repo
+    # To add the online network package repository for the Intel Support Packages
+    tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
+[intel-for-pytorch-gpu-dev]
+name=Intel for Pytorch GPU dev repository
+baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+EOF
+
+    # The xpu-smi packages
+    dnf install -y xpu-smi
+    # Compute and Media Runtimes
+    dnf install -y \
+        intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2 \
+        level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
+        mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
+        mesa-libxatracker libvpl-tools intel-metrics-discovery \
+        intel-metrics-library intel-igc-core intel-igc-cm \
+        libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc
+    # Development packages
+    dnf install -y --refresh \
+        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
+        level-zero-devel
+    # Install Intel Support Packages
+    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
+
+    # Cleanup
+    dnf clean all
+    rm -rf /var/cache/yum
+    rm -rf /var/lib/yum/yumdb
+    rm -rf /var/lib/yum/history
+}
+
+function install_sles() {
+    . /etc/os-release
+    VERSION_SP=${VERSION_ID//./sp}
+    if [[ ! " 15sp4 15sp5 " =~ " ${VERSION_SP} " ]]; then
+        echo "SLES version ${VERSION_ID} not supported"
+        exit
+    fi
+
+    # To add the online network package repository for the GPU Driver LTS releases
+    zypper addrepo -f -r \
+        https://repositories.intel.com/gpu/sles/${VERSION_SP}/lts/2350/unified/intel-gpu-${VERSION_SP}.repo
+    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
+    # To add the online network package repository for the Intel Support Packages
+    zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
+    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+
+    # The xpu-smi packages
+    zypper install -y lsb-release flex bison xpu-smi
+    # Compute and Media Runtimes
+    zypper install -y intel-level-zero-gpu level-zero intel-gsc intel-opencl intel-ocloc \
+        intel-media-driver libigfxcmrt7 libvpl2 libvpl-tools libmfxgen1 libmfx1
+    # Development packages
+    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel
+
+    # Install Intel Support Packages
+    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
+
+}
+
 # The installation depends on the base OS
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 
@@ -107,6 +191,12 @@ case "$ID" in
     centos)
         install_centos
     ;;
+    rhel|almalinux)
+        install_rhel
+    ;;
+    sles)
+        install_sles
+    ;;
     *)
         echo "Unable to determine OS..."
         exit 1
diff --git a/.ci/docker/conda/Dockerfile b/.ci/docker/conda/Dockerfile
new file mode 100644
index 00000000000..0958aad0b5f
--- /dev/null
+++ b/.ci/docker/conda/Dockerfile
@@ -0,0 +1,101 @@
+ARG CUDA_VERSION=10.2
+ARG BASE_TARGET=cuda${CUDA_VERSION}
+FROM centos:7 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum update -y
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+# EPEL for cmake
+RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
+    rpm -ivh epel-release-latest-7.noarch.rpm && \
+    rm -f epel-release-latest-7.noarch.rpm
+# cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN yum install -y autoconf aclocal automake make sudo
+RUN rm -rf /usr/local/cuda-*
+
+FROM base as patchelf
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf
+
+FROM base as openssl
+# Install openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+FROM base as conda
+# Install Anaconda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+# Install CUDA
+FROM base as cuda
+ARG CUDA_VERSION=10.2
+RUN rm -rf /usr/local/cuda-*
+ADD ./common/install_cuda.sh install_cuda.sh
+ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
+# Preserve CUDA_VERSION for the builds
+ENV CUDA_VERSION=${CUDA_VERSION}
+# Make things in our path by default
+ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
+
+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+ENV DESIRED_CUDA=11.8
+
+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+ENV DESIRED_CUDA=12.1
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+ENV DESIRED_CUDA=12.4
+
+# Install MNIST test data
+FROM base as mnist
+ADD ./common/install_mnist.sh install_mnist.sh
+RUN bash ./install_mnist.sh
+
+FROM base as all_cuda
+COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
+COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
+COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
+
+# Final step
+FROM ${BASE_TARGET} as final
+COPY --from=openssl /opt/openssl /opt/openssl
+COPY --from=patchelf /patchelf /usr/local/bin/patchelf
+COPY --from=conda /opt/conda /opt/conda
+
+# Add jni.h for java host build.
+COPY ./common/install_jni.sh install_jni.sh
+COPY ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+ENV PATH /opt/conda/bin:$PATH
+COPY --from=mnist /usr/local/mnist /usr/local/mnist
+RUN rm -rf /usr/local/cuda
+RUN chmod o+rw /usr/local
+RUN touch /.condarc && \
+    chmod o+rw /.condarc && \
+    chmod -R o+rw /opt/conda
diff --git a/.ci/docker/conda/build.sh b/.ci/docker/conda/build.sh
new file mode 100755
index 00000000000..6e8a1c37ff9
--- /dev/null
+++ b/.ci/docker/conda/build.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+image="$1"
+shift
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+
+DOCKER_IMAGE_NAME="pytorch/${image}"
+
+
+export DOCKER_BUILDKIT=1
+TOPDIR=$(git rev-parse --show-toplevel)
+
+CUDA_VERSION=${CUDA_VERSION:-12.1}
+
+case ${CUDA_VERSION} in
+  cpu)
+    BASE_TARGET=base
+    DOCKER_TAG=cpu
+    ;;
+  all)
+    BASE_TARGET=all_cuda
+    DOCKER_TAG=latest
+    ;;
+  *)
+    BASE_TARGET=cuda${CUDA_VERSION}
+    DOCKER_TAG=cuda${CUDA_VERSION}
+    ;;
+esac
+
+
+(
+  set -x
+  docker build \
+    --target final \
+    --progress plain \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+    --build-arg "DEVTOOLSET_VERSION=9" \
+    -t ${DOCKER_IMAGE_NAME} \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
+    ${TOPDIR}/.ci/docker/
+)
+
+if [[ "${DOCKER_TAG}" =~ ^cuda ]]; then
+  # Test that we're using the right CUDA compiler
+  (
+    set -x
+    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  )
+fi
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  (
+    set -x
+    docker push "${DOCKER_IMAGE_NAME}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+        docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+        docker push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile
new file mode 100644
index 00000000000..c5249e30de3
--- /dev/null
+++ b/.ci/docker/libtorch/Dockerfile
@@ -0,0 +1,107 @@
+ARG BASE_TARGET=base
+ARG GPU_IMAGE=ubuntu:20.04
+FROM ${GPU_IMAGE} as base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get clean && apt-get update
+RUN apt-get install -y curl locales g++ git-all autoconf automake make cmake wget unzip sudo
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+
+RUN locale-gen en_US.UTF-8
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+# Install openssl
+FROM base as openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# Install python
+FROM base as python
+ADD common/install_cpython.sh install_cpython.sh
+RUN apt-get update -y && \
+    apt-get install build-essential gdb lcov libbz2-dev libffi-dev \
+        libgdbm-dev liblzma-dev libncurses5-dev libreadline6-dev \
+        libsqlite3-dev libssl-dev lzma lzma-dev tk-dev uuid-dev zlib1g-dev -y && \
+    bash ./install_cpython.sh && \
+    rm install_cpython.sh && \
+    apt-get clean
+
+FROM base as conda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+FROM base as cpu
+# Install Anaconda
+COPY --from=conda /opt/conda /opt/conda
+# Install python
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
+# Install MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM cpu as cuda
+ADD ./common/install_cuda.sh install_cuda.sh
+ADD ./common/install_magma.sh install_magma.sh
+ENV CUDA_HOME /usr/local/cuda
+
+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+RUN bash ./install_magma.sh 11.8
+RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
+
+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+RUN bash ./install_magma.sh 12.1
+RUN ln -sf /usr/local/cuda-12.1 /usr/local/cuda
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+RUN bash ./install_magma.sh 12.4
+RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
+
+FROM cpu as rocm
+ARG PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ENV MKLROOT /opt/intel
+# Set the ROCM_PATH env var so that LoadHip.cmake (even with its logic updated for ROCm 6.0)
+# can still find HIP on ROCm 5.7. Not needed for ROCm 6.0 and above.
+# Remove this once ROCm 5.7 drops out of the support matrix.
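+# (LoadHip.cmake reads the ROCM_PATH environment variable and falls back to /opt/rocm when it is unset)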
+ENV ROCM_PATH /opt/rocm
+# No need to install ROCm, as the base docker image should have a full ROCm install
+#ADD ./common/install_rocm.sh install_rocm.sh
+ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
+ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
+# gfortran and python needed for building magma from source for ROCm
+RUN apt-get update -y && \
+    apt-get install gfortran -y && \
+    apt-get install python -y && \
+    apt-get clean
+
+RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
+
+# Install AOTriton
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/aotriton_version.txt aotriton_version.txt
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
+
+FROM ${BASE_TARGET} as final
+COPY --from=openssl /opt/openssl /opt/openssl
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+# Install Anaconda
+COPY --from=conda /opt/conda /opt/conda
+# Install python
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh
new file mode 100755
index 00000000000..4238bb5ef00
--- /dev/null
+++ b/.ci/docker/libtorch/build.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+image="$1"
+shift
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+
+DOCKER_IMAGE="pytorch/${image}"
+
+TOPDIR=$(git rev-parse --show-toplevel)
+
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+WITH_PUSH=${WITH_PUSH:-}
+
+DOCKER=${DOCKER:-docker}
+
+case ${GPU_ARCH_TYPE} in
+    cpu)
+        BASE_TARGET=cpu
+        DOCKER_TAG=cpu
+        GPU_IMAGE=ubuntu:20.04
+        DOCKER_GPU_BUILD_ARG=""
+        ;;
+    cuda)
+        BASE_TARGET=cuda${GPU_ARCH_VERSION}
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        GPU_IMAGE=ubuntu:20.04
+        DOCKER_GPU_BUILD_ARG=""
+        ;;
+    rocm)
+        BASE_TARGET=rocm
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
+        else
+            echo "ERROR: rocm regex failed"
+            exit 1
+        fi
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
+            PYTORCH_ROCM_ARCH+=";gfx942"
+        fi
+        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
+        ;;
+    *)
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
+        exit 1
+        ;;
+esac
+
+
+(
+    set -x
+    DOCKER_BUILDKIT=1 ${DOCKER} build \
+        --target final \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+        "${TOPDIR}/.ci/docker/"
+
+)
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+  (
+    set -x
+    ${DOCKER} push "${DOCKER_IMAGE}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+        ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
+        ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
diff --git a/.ci/docker/manywheel/Dockerfile b/.ci/docker/manywheel/Dockerfile
new file mode 100644
index 00000000000..4162926d9bc
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile
@@ -0,0 +1,203 @@
+# syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=11.8
+
+ARG GPU_IMAGE=centos:7
+FROM centos:7 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+# Note: This patch is required since CentOS has reached EOL;
+# otherwise any yum install step will fail
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
+# the patch is required once again. Somehow this step re-adds mirror.centos.org
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
+    rpm -ivh epel-release-latest-7.noarch.rpm && \
+    rm -f epel-release-latest-7.noarch.rpm
+
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+RUN yum install -y autoconf aclocal automake make sudo
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
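+# (install_cpython.sh added in this PR consumes such a build via its optional WITH_OPENSSL flag, which passes --with-openssl to configure)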
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + +# EPEL for cmake +FROM base as patchelf +# Install patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM patchelf as python +# build python +COPY manywheel/build_scripts /build_scripts +ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh +RUN bash build_scripts/build.sh && rm -r build_scripts + +FROM base as cuda +ARG BASE_CUDA_VERSION=10.2 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum install -y \ + aclocal \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=10.1 +ARG DEVTOOLSET_VERSION=9 +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo + +RUN yum 
install -y yum-utils centos-release-scl +RUN yum-config-manager --enable rhel-server-rhscl-7-rpms +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils +ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake is already installed inside the rocm base image, so remove if present +RUN rpm -e cmake || true +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake + +# ninja +RUN yum install -y ninja-build + +FROM cpu_final as cuda_final +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH + +FROM cpu_final as rocm_final +ARG ROCM_VERSION=3.7 +ARG PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} +# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) +# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above. +# Remove below when ROCm5.7 is not in support matrix anymore. +ENV ROCM_PATH /opt/rocm +ENV MKLROOT /opt/intel +# No need to install ROCm as base docker image should have full ROCm install +#ADD ./common/install_rocm.sh install_rocm.sh +#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh +ADD ./common/install_rocm_drm.sh install_rocm_drm.sh +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +# cmake3 is needed for the MIOpen build +RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3 +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +ADD ./common/install_miopen.sh install_miopen.sh +RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh + +# Install AOTriton +COPY ./common/common_utils.sh common_utils.sh +COPY ./common/aotriton_version.txt aotriton_version.txt +COPY ./common/install_aotriton.sh install_aotriton.sh +RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt +ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton diff --git a/.ci/docker/manywheel/Dockerfile_2014 b/.ci/docker/manywheel/Dockerfile_2014 new file mode 100644 index 00000000000..87f2f78472a --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_2014 @@ -0,0 +1,152 @@ +# syntax = docker/dockerfile:experimental +ARG ROCM_VERSION=3.7 +ARG BASE_CUDA_VERSION=10.2 +ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7 +FROM quay.io/pypa/manylinux2014_x86_64 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel +RUN yum 
install -y yum-utils centos-release-scl sudo +RUN yum-config-manager --enable rhel-server-rhscl-7-rpms +RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils +ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH + +# cmake +RUN yum install -y cmake3 && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + + + +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION=10.2 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum install -y \ + aclocal \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=base /opt/python /opt/python +COPY --from=base /opt/_internal /opt/_internal +COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY 
--from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=10.2 +RUN yum install -y yum-utils centos-release-scl +RUN yum-config-manager --enable rhel-server-rhscl-7-rpms +RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils +ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH + +# cmake +RUN yum install -y cmake3 && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake + +# ninja +RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm +RUN yum install -y ninja-build + +FROM cpu_final as cuda_final +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} + +FROM common as rocm_final +ARG ROCM_VERSION=3.7 +# Install ROCm +ADD ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh +# cmake is already installed inside the rocm base image, but both 2 and 3 exist +# cmake3 is needed for the later MIOpen custom build, so that step is last. +RUN yum install -y cmake3 && \ + rm -f /usr/bin/cmake && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake +ADD ./common/install_miopen.sh install_miopen.sh +RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 new file mode 100644 index 00000000000..a036a576431 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -0,0 +1,153 @@ +# syntax = docker/dockerfile:experimental +ARG ROCM_VERSION=3.7 +ARG BASE_CUDA_VERSION=11.8 +ARG GPU_IMAGE=amd64/almalinux:8 +FROM quay.io/pypa/manylinux_2_28_x86_64 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +ARG DEVTOOLSET_VERSION=11 +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake3 + +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) 
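+# (Illustrative check, not part of the build: a Python linked against this
+# OpenSSL reports it via `python -c "import ssl; print(ssl.OPENSSL_VERSION)"`.)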
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + + +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION=11.8 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +ARG DEVTOOLSET_VERSION=11 +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum -y install epel-release +RUN yum -y update +RUN yum install -y \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \ + glibc-langpack-en + +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=base /opt/python /opt/python +COPY --from=base /opt/_internal /opt/_internal +COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=11.8 +ARG DEVTOOLSET_VERSION=11 +# Ensure the expected devtoolset is used +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake3 + +FROM cpu_final as cuda_final +RUN 
rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
+
+FROM common as rocm_final
+ARG ROCM_VERSION=3.7
+# Install ROCm
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
+RUN yum install -y cmake3 && \
+    rm -f /usr/bin/cmake && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
+
+FROM cpu_final as xpu_final
+# cmake-3.28.4 from pip
+RUN python3 -m pip install --upgrade pip && \
+    python3 -mpip install cmake==3.28.4
+ADD ./common/install_xpu.sh install_xpu.sh
+RUN bash ./install_xpu.sh && rm install_xpu.sh
+RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
new file mode 100644
index 00000000000..cbf2a757010
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@@ -0,0 +1,57 @@
+FROM quay.io/pypa/manylinux_2_28_aarch64 as base
+
+# Graviton needs GCC 10 or above for the build. GCC 12 is the default version in almalinux-8.
+ARG GCCTOOLSET_VERSION=11
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  less \
+  libffi-devel \
+  libgomp \
+  make \
+  openssl-devel \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  zstd \
+  sudo \
+  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+FROM base as final
+
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
diff --git a/.ci/docker/manywheel/Dockerfile_aarch64 b/.ci/docker/manywheel/Dockerfile_aarch64
new file mode 100644
index 00000000000..cb1ddb7c62b
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@@ -0,0 +1,94 @@
+FROM quay.io/pypa/manylinux2014_aarch64 as base
+
+
+# Graviton needs GCC 10 for the build
+ARG DEVTOOLSET_VERSION=10
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install needed OS packages. 
This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+  devtoolset-${DEVTOOLSET_VERSION}-binutils
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+###############################################################################
+# libgfortran.a hack
+#
+# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
+# This causes __stack_chk_guard@@GLIBC_2.17 errors in the PyTorch build. To solve, get
+# ubuntu's libgfortran.a, which is compiled with -fPIC
+# NOTE: Need a better way to get this library, as Ubuntu's package can be removed by the vendor, or changed
+###############################################################################
+RUN cd ~/ \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \
+  && ar x ~/libgfortran-10-dev.deb \
+  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
+  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
+
+# install cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
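+# (SSL_CERT_FILE below points the statically linked OpenSSL at a certifi
+# bundle at /opt/_internal/certs.pem -- the same path that
+# manywheel/build_scripts/build.sh sets up -- since a statically linked
+# OpenSSL has no system CA trust store configured.)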
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
+FROM openssl as final
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
new file mode 100644
index 00000000000..7ffd38f5edc
--- /dev/null
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@@ -0,0 +1,91 @@
+FROM quay.io/pypa/manylinux_2_28_aarch64 as base
+
+# CUDA ARM build needs GCC 11
+ARG DEVTOOLSET_VERSION=11
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
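+# (Since the openssl/cuda/magma/openblas stages below are independent of one
+# another, BuildKit -- enabled via DOCKER_BUILDKIT=1 in manywheel/build.sh --
+# can build them in parallel before cuda_final assembles the pieces.)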
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +FROM openssl as final +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION +# Install CUDA +ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh +RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh + +FROM base as magma +ARG BASE_CUDA_VERSION +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as openblas +# Install openblas +ADD ./common/install_openblas.sh install_openblas.sh +RUN bash ./install_openblas.sh && rm install_openblas.sh + +FROM final as cuda_final +ARG BASE_CUDA_VERSION +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ +RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH diff --git a/.ci/docker/manywheel/Dockerfile_cxx11-abi b/.ci/docker/manywheel/Dockerfile_cxx11-abi new file mode 100644 index 00000000000..ed33cc61df0 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi @@ -0,0 +1,71 @@ +FROM centos:8 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +# change to a valid repo +RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo +# enable to install ninja-build +RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo + +RUN yum -y update +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo +RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++ + + +FROM base as openssl +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + +# Install python +FROM base as python +RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel +ADD common/install_cpython.sh install_cpython.sh +RUN bash ./install_cpython.sh && rm install_cpython.sh + +FROM base as conda +ADD ./common/install_conda_docker.sh install_conda.sh +RUN bash ./install_conda.sh && rm install_conda.sh +RUN /opt/conda/bin/conda install -y cmake + +FROM base as intel +# Install MKL +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=conda /opt/conda /opt/conda +ENV PATH=/opt/conda/bin:$PATH +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM base as jni +ADD ./common/install_jni.sh install_jni.sh +ADD 
./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM base as final +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=intel /opt/intel /opt/intel +COPY --from=conda /opt/conda /opt/conda +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig + +RUN yum install -y ninja-build diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x new file mode 100644 index 00000000000..5125e3830e8 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -0,0 +1,73 @@ +FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base + +# Language variables +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ENV LANGUAGE=C.UTF-8 + +# Installed needed OS packages. This is to support all +# the binary builds (torch, vision, audio, text, data) +RUN apt update ; apt upgrade -y +RUN apt install -y \ + build-essential \ + autoconf \ + automake \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz-utils \ + less \ + zstd \ + cmake \ + python3 \ + python3-dev \ + python3-setuptools \ + python3-yaml \ + python3-typing-extensions \ + libblas-dev \ + libopenblas-dev \ + liblapack-dev \ + libatlas-base-dev + +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) 
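+# (Illustrative local build, mirroring the cpu-s390x case in
+# .ci/docker/manywheel/build.sh; the image name here is an assumption:
+#   GPU_ARCH_TYPE=cpu-s390x .ci/docker/manywheel/build.sh manylinux-builder:cpu-s390x)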
+ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +# EPEL for cmake +FROM base as patchelf +# Install patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM patchelf as python +# build python +COPY manywheel/build_scripts /build_scripts +ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh +RUN bash build_scripts/build.sh && rm -r build_scripts + +FROM openssl as final +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh new file mode 100755 index 00000000000..0cfb88ef72f --- /dev/null +++ b/.ci/docker/manywheel/build.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +# Script used only in CD pipeline + +set -eou pipefail + +TOPDIR=$(git rev-parse --show-toplevel) + +image="$1" +shift + +if [ -z "${image}" ]; then + echo "Usage: $0 IMAGE" + exit 1 +fi + +DOCKER_IMAGE="pytorch/${image}" + +DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}" + +GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu} +GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} +MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} +DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} +WITH_PUSH=${WITH_PUSH:-} + +case ${GPU_ARCH_TYPE} in + cpu) + TARGET=cpu_final + DOCKER_TAG=cpu + GPU_IMAGE=centos:7 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" + ;; + cpu-manylinux_2_28) + TARGET=cpu_final + DOCKER_TAG=cpu + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; + cpu-aarch64) + TARGET=final + DOCKER_TAG=cpu-aarch64 + GPU_IMAGE=arm64v8/centos:7 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10" + MANY_LINUX_VERSION="aarch64" + ;; + cpu-aarch64-2_28) + TARGET=final + DOCKER_TAG=cpu-aarch64 + GPU_IMAGE=arm64v8/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28_aarch64" + ;; + cpu-cxx11-abi) + TARGET=final + DOCKER_TAG=cpu-cxx11-abi + GPU_IMAGE="" + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" + MANY_LINUX_VERSION="cxx11-abi" + ;; + cpu-s390x) + TARGET=final + DOCKER_TAG=cpu-s390x + GPU_IMAGE=redhat/ubi9 + DOCKER_GPU_BUILD_ARG="" + MANY_LINUX_VERSION="s390x" + ;; + cuda) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + # Keep this up to date with the minimum version of CUDA we currently support + GPU_IMAGE=centos:7 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9" + ;; + cuda-manylinux_2_28) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; + cuda-aarch64) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + GPU_IMAGE=arm64v8/centos:7 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="aarch64" + DOCKERFILE_SUFFIX="_cuda_aarch64" + ;; + rocm) + TARGET=rocm_final + DOCKER_TAG=rocm${GPU_ARCH_VERSION} + GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" + 
ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)" + if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then + ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0})) + else + echo "ERROR: rocm regex failed" + exit 1 + fi + if [[ $ROCM_VERSION_INT -ge 60000 ]]; then + PYTORCH_ROCM_ARCH+=";gfx942" + fi + DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9" + ;; + xpu) + TARGET=xpu_final + DOCKER_TAG=xpu + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; + *) + echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" + exit 1 + ;; +esac + +IMAGES='' + +if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then + DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} +fi +( + set -x + DOCKER_BUILDKIT=1 docker build \ + ${DOCKER_GPU_BUILD_ARG} \ + --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ + --target "${TARGET}" \ + -t "${DOCKER_IMAGE}" \ + $@ \ + -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ + "${TOPDIR}/.ci/docker/" +) + +GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)} +GIT_BRANCH_NAME=${GITHUB_REF##*/} +GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} +DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME} +DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA} + +if [[ "${WITH_PUSH}" == true ]]; then + ( + set -x + docker push "${DOCKER_IMAGE}" + if [[ -n ${GITHUB_REF} ]]; then + docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG} + docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG} + docker push "${DOCKER_IMAGE_BRANCH_TAG}" + docker push "${DOCKER_IMAGE_SHA_TAG}" + fi + ) +fi diff --git a/.ci/docker/manywheel/build_scripts/build.sh b/.ci/docker/manywheel/build_scripts/build.sh new file mode 100644 index 00000000000..1708b71a19b --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/build.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Top-level build script called from Dockerfile +# Script used only in CD pipeline + +# Stop at any error, show all commands +set -ex + +# openssl version to build, with expected sha256 hash of .tar.gz +# archive +OPENSSL_ROOT=openssl-1.1.1l +OPENSSL_HASH=0b7a3e5e59c34827fe0c3a74b7ec8baef302b98fa80088d7f9153aa16fa76bd1 +DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc +PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb +CURL_ROOT=curl-7.73.0 +CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131 +AUTOCONF_ROOT=autoconf-2.69 +AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 + +# Get build utilities +MY_DIR=$(dirname "${BASH_SOURCE[0]}") +source $MY_DIR/build_utils.sh + +if [ "$(uname -m)" != "s390x" ] ; then + # Dependencies for compiling Python that we want to remove from + # the final image after compiling Python + PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel" + + # Libraries that are allowed as part of the manylinux1 profile + MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" + + # Development tools and libraries + yum -y install bzip2 make git patch unzip bison yasm diffutils \ + automake which file cmake28 \ + kernel-devel-`uname -r` \ + ${PYTHON_COMPILE_DEPS} +else + # Dependencies for 
compiling Python that we want to remove from + # the final image after compiling Python + PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev" + + # Libraries that are allowed as part of the manylinux1 profile + MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev" + + # Development tools and libraries + apt install -y bzip2 make git patch unzip diffutils \ + automake which file cmake \ + linux-headers-virtual \ + ${PYTHON_COMPILE_DEPS} +fi + +# Install newest autoconf +build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH +autoconf --version + +# Compile the latest Python releases. +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +build_openssl $OPENSSL_ROOT $OPENSSL_HASH +/build_scripts/install_cpython.sh + +PY39_BIN=/opt/python/cp39-cp39/bin + +# Our openssl doesn't know how to find the system CA trust store +# (https://github.com/pypa/manylinux/issues/53) +# And it's not clear how up-to-date that is anyway +# So let's just use the same one pip and everyone uses +$PY39_BIN/pip install certifi +ln -s $($PY39_BIN/python -c 'import certifi; print(certifi.where())') \ + /opt/_internal/certs.pem +# If you modify this line you also have to modify the versions in the +# Dockerfiles: +export SSL_CERT_FILE=/opt/_internal/certs.pem + +# Install newest curl +build_curl $CURL_ROOT $CURL_HASH +rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc +hash -r +curl --version +curl-config --features + +# Install patchelf (latest with unreleased bug fixes) +curl -sLOk https://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.gz +# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH +tar -xzf patchelf-0.10.tar.gz +(cd patchelf-0.10 && ./configure && make && make install) +rm -rf patchelf-0.10.tar.gz patchelf-0.10 + +# Install latest pypi release of auditwheel +$PY39_BIN/pip install auditwheel +ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel + +# Clean up development headers and other unnecessary stuff for +# final image +if [ "$(uname -m)" != "s390x" ] ; then + yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ + avahi freetype bitstream-vera-fonts \ + ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 + yum -y install ${MANYLINUX1_DEPS} + yum -y clean all > /dev/null 2>&1 + yum list installed +else + apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 +fi +# we don't need libpython*.a, and they're many megabytes +find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f +# Strip what we can -- and ignore errors, because this just attempts to strip +# *everything*, including non-ELF files: +find /opt/_internal -type f -print0 \ + | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true +# We do not need the Python test suites, or indeed the precompiled .pyc and +# .pyo files. Partially cribbed from: +# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile +find /opt/_internal \ + \( -type d -a -name test -o -name tests \) \ + -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ + -print0 | xargs -0 rm -f + +for PYTHON in /opt/python/*/bin/python; do + # Smoke test to make sure that our Pythons work, and do indeed detect as + # being manylinux compatible: + $PYTHON $MY_DIR/manylinux1-check.py + # Make sure that SSL cert checking works + $PYTHON $MY_DIR/ssl-check.py +done + +# Fix libc headers to remain compatible with C99 compilers. 
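+# (glibc's old headers declare functions `extern inline`, whose meaning changed
+# between GNU89 and C99; the gnu_inline attribute added below restores the
+# GNU89 semantics so the headers keep working with C99 compilers.)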
+find /usr/include/ -type f -exec sed -i 's/\bextern _*inline_*\b/extern __inline __attribute__ ((__gnu_inline__))/g' {} + + +# Now we can delete our built SSL +rm -rf /usr/local/ssl diff --git a/.ci/docker/manywheel/build_scripts/build_utils.sh b/.ci/docker/manywheel/build_scripts/build_utils.sh new file mode 100755 index 00000000000..279a7b17a52 --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/build_utils.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Helper utilities for build +# Script used only in CD pipeline + +OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ +CURL_DOWNLOAD_URL=https://curl.askapache.com/download + +AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf + + +function check_var { + if [ -z "$1" ]; then + echo "required variable not defined" + exit 1 + fi +} + + +function do_openssl_build { + ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null + make > /dev/null + make install > /dev/null +} + + +function check_sha256sum { + local fname=$1 + check_var ${fname} + local sha256=$2 + check_var ${sha256} + + echo "${sha256} ${fname}" > ${fname}.sha256 + sha256sum -c ${fname}.sha256 + rm -f ${fname}.sha256 +} + + +function build_openssl { + local openssl_fname=$1 + check_var ${openssl_fname} + local openssl_sha256=$2 + check_var ${openssl_sha256} + check_var ${OPENSSL_DOWNLOAD_URL} + curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz + check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256} + tar -xzf ${openssl_fname}.tar.gz + (cd ${openssl_fname} && do_openssl_build) + rm -rf ${openssl_fname} ${openssl_fname}.tar.gz +} + + +function do_curl_build { + LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_curl { + local curl_fname=$1 + check_var ${curl_fname} + local curl_sha256=$2 + check_var ${curl_sha256} + check_var ${CURL_DOWNLOAD_URL} + curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2 + check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256} + tar -jxf ${curl_fname}.tar.bz2 + (cd ${curl_fname} && do_curl_build) + rm -rf ${curl_fname} ${curl_fname}.tar.bz2 +} + + +function do_standard_install { + ./configure > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_autoconf { + local autoconf_fname=$1 + check_var ${autoconf_fname} + local autoconf_sha256=$2 + check_var ${autoconf_sha256} + check_var ${AUTOCONF_DOWNLOAD_URL} + curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz + check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256} + tar -zxf ${autoconf_fname}.tar.gz + (cd ${autoconf_fname} && do_standard_install) + rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz +} diff --git a/.ci/docker/manywheel/build_scripts/manylinux1-check.py b/.ci/docker/manywheel/build_scripts/manylinux1-check.py new file mode 100644 index 00000000000..bdc8f610298 --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/manylinux1-check.py @@ -0,0 +1,60 @@ +# Logic copied from PEP 513 + + +def is_manylinux1_compatible(): + # Only Linux, and only x86-64 / i686 + from distutils.util import get_platform + + if get_platform() not in ["linux-x86_64", "linux-i686", "linux-s390x"]: + return False + + # Check for presence of _manylinux module + try: + import _manylinux + + return bool(_manylinux.manylinux1_compatible) + except (ImportError, AttributeError): + # Fall through to heuristic check below + pass + + # Check glibc version. CentOS 5 uses glibc 2.5. 
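+    # Heuristic: manylinux1 targets CentOS 5, so a glibc of 2.5 or newer
+    # within the same major version (e.g. 2.17 on CentOS 7) is treated as
+    # compatible.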
+ return have_compatible_glibc(2, 5) + + +def have_compatible_glibc(major, minimum_minor): + import ctypes + + process_namespace = ctypes.CDLL(None) + try: + gnu_get_libc_version = process_namespace.gnu_get_libc_version + except AttributeError: + # Symbol doesn't exist -> therefore, we are not linked to + # glibc. + return False + + # Call gnu_get_libc_version, which returns a string like "2.5". + gnu_get_libc_version.restype = ctypes.c_char_p + version_str = gnu_get_libc_version() + # py2 / py3 compatibility: + if not isinstance(version_str, str): + version_str = version_str.decode("ascii") + + # Parse string and check against requested version. + version = [int(piece) for piece in version_str.split(".")] + assert len(version) == 2 + if major != version[0]: + return False + if minimum_minor > version[1]: + return False + return True + + +import sys + + +if is_manylinux1_compatible(): + print(f"{sys.executable} is manylinux1 compatible") + sys.exit(0) +else: + print(f"{sys.executable} is NOT manylinux1 compatible") + sys.exit(1) diff --git a/.ci/docker/manywheel/build_scripts/ssl-check.py b/.ci/docker/manywheel/build_scripts/ssl-check.py new file mode 100644 index 00000000000..b1df3e1346f --- /dev/null +++ b/.ci/docker/manywheel/build_scripts/ssl-check.py @@ -0,0 +1,35 @@ +# cf. https://github.com/pypa/manylinux/issues/53 + +GOOD_SSL = "https://google.com" +BAD_SSL = "https://self-signed.badssl.com" + +import sys + + +print("Testing SSL certificate checking for Python:", sys.version) + +if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4): + print("This version never checks SSL certs; skipping tests") + sys.exit(0) + +if sys.version_info[0] >= 3: + from urllib.request import urlopen + + EXC = OSError +else: + from urllib import urlopen + + EXC = IOError + +print(f"Connecting to {GOOD_SSL} should work") +urlopen(GOOD_SSL) +print("...it did, yay.") + +print(f"Connecting to {BAD_SSL} should fail") +try: + urlopen(BAD_SSL) + # If we get here then we failed: + print("...it DIDN'T!!!!!11!!1one!") + sys.exit(1) +except EXC: + print("...it did, yay.") diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index da85d65cc3c..18225b8da39 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -9,7 +9,9 @@ self-hosted-runner: - linux.large - linux.2xlarge - linux.4xlarge + - linux.9xlarge.ephemeral - linux.12xlarge + - linux.12xlarge.ephemeral - linux.24xlarge - linux.arm64.2xlarge - linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/build-conda-images.yml b/.github/workflows/build-conda-images.yml new file mode 100644 index 00000000000..f77cf7cef38 --- /dev/null +++ b/.github/workflows/build-conda-images.yml @@ -0,0 +1,64 @@ +name: Build conda docker images + +on: + workflow_dispatch: + push: + branches: + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - conda/Dockerfile + - 'common/*' + - .github/workflows/build-conda-images.yml + pull_request: + paths: + - conda/Dockerfile + - 'common/*' + - .github/workflows/build-conda-images.yml + +env: + DOCKER_REGISTRY: "docker.io" + DOCKER_BUILDKIT: 1 + DOCKER_ID: ${{ secrets.DOCKER_ID }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number 
|| github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-docker: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["11.8", "12.1", "12.4", "cpu"] + env: + CUDA_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: conda-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}} + docker-build-dir: .ci/docker/conda + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/conda/build.sh conda-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml new file mode 100644 index 00000000000..baab800cfc4 --- /dev/null +++ b/.github/workflows/build-libtorch-images.yml @@ -0,0 +1,120 @@ +name: Build libtorch docker images + +on: + push: + branches: + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - '.ci/docker/libtorch/*' + - '.ci/docker/common/*' + - .github/workflows/build-libtorch-images.yml + pull_request: + paths: + - '.ci/docker/libtorch/*' + - '.ci/docker/common/*' + - .github/workflows/build-libtorch-images.yml + +env: + DOCKER_REGISTRY: "docker.io" + DOCKER_BUILDKIT: 1 + DOCKER_ID: ${{ secrets.DOCKER_ID }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-docker-cuda: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/libtorch + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}} + build-docker-rocm: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + rocm_version: ["6.0", "6.1"] + env: + GPU_ARCH_TYPE: rocm + GPU_ARCH_VERSION: ${{ matrix.rocm_version }} + steps: + - name: Checkout PyTorch + uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}} + docker-build-dir: .ci/docker/libtorch + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}} + build-docker-cpu: + runs-on: linux.9xlarge.ephemeral + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: libtorch-cxx11-builder-cpu + docker-build-dir: .ci/docker/libtorch + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml new file mode 100644 index 00000000000..2ba8cbd9ae6 --- /dev/null +++ b/.github/workflows/build-manywheel-images.yml @@ -0,0 +1,322 @@ +name: Build manywheel docker images + +on: + workflow_dispatch: + push: + branches: + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - '.ci/docker/manywheel/*' + - '.ci/docker/common/*' + - .github/workflows/build-manywheel-images.yml + pull_request: + paths: + - '.ci/docker/manywheel/*' + - '.ci/docker/common/*' + - .github/workflows/build-manywheel-images.yml + + +env: + DOCKER_REGISTRY: "docker.io" + DOCKER_BUILDKIT: 1 + DOCKER_ID: ${{ secrets.DOCKER_ID }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-docker-cuda: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Purge tools folder (free space for build) + run: rm -rf /opt/hostedtoolcache + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo 
"${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux-builder:cuda${{matrix.cuda_version}} + # NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649 + build-docker-cuda-manylinux_2_28: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda-manylinux_2_28 + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Purge tools folder (free space for build) + run: rm -rf /opt/hostedtoolcache + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}} + build-docker-cuda-aarch64: + runs-on: linux.arm64.2xlarge + strategy: + matrix: + cuda_version: ["12.4"] + env: + GPU_ARCH_TYPE: cuda-aarch64 + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Checkout PyTorch + uses: actions/checkout@v3 + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}} + build-docker-rocm: + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + rocm_version: ["6.0", "6.1"] + env: + GPU_ARCH_TYPE: rocm + GPU_ARCH_VERSION: ${{ matrix.rocm_version }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux-builder-rocm${{matrix.rocm_version}} + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}} + build-docker-cpu: + runs-on: linux.9xlarge.ephemeral + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main 
+  build-docker-cpu:
+    runs-on: linux.9xlarge.ephemeral
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux-builder-cpu
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux-builder:cpu
+  build-docker-cpu-manylinux_2_28:
+    runs-on: linux.9xlarge.ephemeral
+    env:
+      GPU_ARCH_TYPE: cpu-manylinux_2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28-builder-cpu
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
+  build-docker-cpu-aarch64:
+    runs-on: linux.arm64.2xlarge
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinuxaarch64-builder-cpu-aarch64
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
+  build-docker-cpu-aarch64-2_28:
+    runs-on: linux.arm64.2xlarge
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64-2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
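+  # NOTE: for local debugging, a build step like the ones in these jobs can
+  # be reproduced outside CI by exporting the job's env and calling the same
+  # entry point. A hypothetical invocation (assuming a Docker-capable Linux
+  # host, a full pytorch/pytorch checkout, and that build.sh derives its
+  # target from the tag and the GPU_ARCH_* variables as the env blocks here
+  # suggest), e.g. for the cxx11-abi job below:
+  #   export GPU_ARCH_TYPE=cpu-cxx11-abi
+  #   .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi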
"${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi + build-docker-xpu: + runs-on: linux.9xlarge.ephemeral + env: + GPU_ARCH_TYPE: xpu + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + - name: Calculate docker image + if: env.WITH_PUSH == 'false' + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: manylinux2_28-builder-xpu + docker-build-dir: .ci/docker/manywheel + always-rebuild: true + push: true + - name: Authenticate if WITH_PUSH + if: env.WITH_PUSH == 'true' + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + if: env.WITH_PUSH == 'true' + run: | + .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu diff --git a/.lintrunner.toml b/.lintrunner.toml index 2b6aaa3037b..ce41c89338e 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -417,6 +417,7 @@ exclude_patterns = [ 'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h', 'test/cpp/jit/upgrader_models/*.ptl', 'test/cpp/jit/upgrader_models/*.ptl.ff', + '.ci/docker/common/install_rocm_drm.sh', '.lintrunner.toml', ] command = [