Migrate conda, manywheel and libtorch docker builds to pytorch/pytorch (#129022)

Migration of Docker conda builds to pytorch/pytorch from pytorch/builder: https://github.com/pytorch/builder/blob/main/.github/workflows/build-conda-images.yml Related to: https://github.com/pytorch/builder/issues/1849 Migrate scripts and workflows, and add logic to execute on PR and upload to ecr with github hash tag in order to test Docker build and nightly on PR. Test when executing on PR, upload to ecr: https://github.com/pytorch/pytorch/actions/runs/9799439218/job/27059691327 ``` 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/conda-builder-cpu:789cf8fcd738088860056160f6e9ea7cd005972b ``` Test With-Push, upload to dockerhub: https://github.com/pytorch/pytorch/actions/runs/9799783407/job/27060633427 ``` docker.io/pytorch/conda-builder:cpu done ``` Will upload here: https://hub.docker.com/r/pytorch/conda-builder/ Test using ecr image in the nightly workflow: https://github.com/pytorch/pytorch/actions/runs/9798428933/job/27057835235#step:16:87 Note: This is the first part, which builds the docker image and uploads it to either dockerhub or ecr. After merging, a follow-up PR will need to change the conda nightly workflows to use either the ecr image or the dockerhub image, depending on whether we are running on a PR or from the main/release branch. Cleanup of workflows and scripts from builder repo: https://github.com/pytorch/builder/pull/1923 Co-authored-by: atalman <atalman@fb.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/129022 Approved by: https://github.com/atalman, https://github.com/seemethere, https://github.com/malfet, https://github.com/chuanqi129
2025-12-06 12:20:52 +01:00 · 2024-07-25 14:36:13 +00:00 · 2024-07-25 14:36:13 +00:00 · d0e2ab617d
commit d0e2ab617d
parent 4a5a87168e
38 changed files with 3224 additions and 15 deletions
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@ -1,4 +1,4 @@
-# Docker images for GitHub CI
+# Docker images for GitHub CI and CD

 This directory contains everything needed to build the Docker images
 that are used in our CI.
@ -12,7 +12,7 @@ each image as the `BUILD_ENVIRONMENT` environment variable.

 See `build.sh` for valid build environments (it's the giant switch).

-## Contents
+## Docker CI builds

 * `build.sh` -- dispatch script to launch all builds
 * `common` -- scripts used to execute individual Docker build stages
@ -21,6 +21,12 @@ See `build.sh` for valid build environments (it's the giant switch).
 * `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
 * `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support

+### Docker CD builds
+
+* `conda` -- Dockerfile and build.sh to build Docker images used in nightly conda builds
+* `manywheel` -- Dockerfile and build.sh to build Docker images used in nightly manywheel builds
+* `libtorch` -- Dockerfile and build.sh to build Docker images used in nightly libtorch builds
+
 ## Usage

 ```bash
--- a/.ci/docker/common/aotriton_version.txt
+++ b/.ci/docker/common/aotriton_version.txt
@ -0,0 +1,5 @@
+0.6b
+manylinux_2_17
+rocm6.1
+04b5df8c8123f90cba3ede7e971e6fbc6040d506
+77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1
--- a/.ci/docker/common/install_conda_docker.sh
+++ b/.ci/docker/common/install_conda_docker.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+# Script used only in CD pipeline
+#
+# Installs a pinned Miniconda into /opt/conda, then installs the conda
+# packages and the pinned cmake pip package listed below.
+set -ex
+
+# Anaconda
+# Latest anaconda is using openssl-3 which is incompatible with all currently published versions of git
+# Which are using openssl-1.1.1, see https://anaconda.org/anaconda/git/files?version=2.40.1 for example
+MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh
+# File name under which wget stores the installer in the current directory.
+MINICONDA_INSTALLER=$(basename "$MINICONDA_URL")
+wget -q "$MINICONDA_URL"
+# NB: Manually invoke bash per https://github.com/conda/conda/issues/10431
+bash "$MINICONDA_INSTALLER" -b -p /opt/conda
+rm "$MINICONDA_INSTALLER"
+export PATH=/opt/conda/bin:$PATH
+# See https://github.com/pytorch/builder/issues/1473
+# Pin conda to 23.5.2 as it's the last one compatible with openssl-1.1.1
+conda install -y conda=23.5.2 conda-build anaconda-client git ninja
+# The cmake version here needs to match with the minimum version of cmake
+# supported by PyTorch (3.18). There is only 3.18.2 on anaconda
+/opt/conda/bin/pip3 install cmake==3.18.2
+# NOTE(review): presumably removed so that a separately installed patchelf
+# is the one found on PATH -- confirm against the Dockerfiles.
+conda remove -y --force patchelf
--- a/.ci/docker/common/install_cpython.sh
+++ b/.ci/docker/common/install_cpython.sh
@ -0,0 +1,95 @@
+#!/bin/bash
+# Script used only in CD pipeline
+set -uex -o pipefail
+
+PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
+PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
+GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
+
+# Python versions to be installed in /opt/$VERSION_NO
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0"}
+
+# Abort the script when the given value is empty.
+# Arguments: $1 - value to check; a missing argument counts as empty
+# Outputs:   an error message on stderr when the value is empty
+function check_var {
+    # ${1:-} keeps `set -u` from aborting with "unbound variable" when a
+    # caller passes an unquoted empty variable, i.e. no argument at all.
+    if [ -z "${1:-}" ]; then
+        echo "required variable not defined" >&2
+        exit 1
+    fi
+}
+
+function do_cpython_build {
+    local py_ver=$1
+    local py_folder=$2
+    check_var $py_ver
+    check_var $py_folder
+    tar -xzf Python-$py_ver.tgz
+    pushd $py_folder
+
+    local prefix="/opt/_internal/cpython-${py_ver}"
+    mkdir -p ${prefix}/lib
+    if [[ -n $(which patchelf) ]]; then
+        local shared_flags="--enable-shared"
+    else
+        local shared_flags="--disable-shared"
+    fi
+    if [[ -z  "${WITH_OPENSSL+x}" ]]; then
+        local openssl_flags=""
+    else
+        local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto"
+    fi
+
+    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} > /dev/null
+
+    make -j40 > /dev/null
+    make install > /dev/null
+
+    if [[ "${shared_flags}" == "--enable-shared" ]]; then
+        patchelf --set-rpath '$ORIGIN/../lib' ${prefix}/bin/python3
+    fi
+
+    popd
+    rm -rf $py_folder
+    # Some python's install as bin/python3. Make them available as
+    # bin/python.
+    if [ -e ${prefix}/bin/python3 ]; then
+        ln -s python3 ${prefix}/bin/python
+    fi
+    ${prefix}/bin/python get-pip.py
+    if [ -e ${prefix}/bin/pip3 ] && [ ! -e ${prefix}/bin/pip ]; then
+        ln -s pip3 ${prefix}/bin/pip
+    fi
+    ${prefix}/bin/pip install wheel==0.34.2
+    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
+    ln -s ${prefix} /opt/python/${abi_tag}
+}
+
+# Download the sources of one CPython version and build them.
+# Arguments: $1 - Python version, e.g. 3.11.0
+# 3.13.0 is fetched as a snapshot of the 3.13 branch of the CPython GitHub
+# repository; every other version is a release tarball from python.org.
+function build_cpython {
+    local py_ver=$1
+    check_var "$py_ver"
+    check_var "$PYTHON_DOWNLOAD_URL"
+    local py_ver_folder=$py_ver
+    if [ "$py_ver" = "3.13.0" ]; then
+        # Kept local so the short version does not leak into the caller.
+        local PY_VER_SHORT="3.13"
+        check_var "$PYTHON_DOWNLOAD_GITHUB_BRANCH"
+        wget "$PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz" -O "Python-$py_ver.tgz"
+        do_cpython_build "$py_ver" "cpython-$PY_VER_SHORT"
+    else
+        wget -q "$PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz"
+        do_cpython_build "$py_ver" "Python-$py_ver"
+    fi
+
+    rm -f "Python-$py_ver.tgz"
+}
+
+# Build every requested CPython version.
+# Arguments: $@ - Python versions to build, one per argument
+# get-pip.py is downloaded into the current directory for the builds and
+# removed again afterwards.
+function build_cpythons {
+    check_var "$GET_PIP_URL"
+    # -f: fail on an HTTP error instead of saving the error page as get-pip.py
+    curl -fsLO "$GET_PIP_URL"
+    local py_ver
+    for py_ver in "$@"; do
+        build_cpython "$py_ver"
+    done
+    rm -f get-pip.py
+}
+
+# Create both install roots, then build all configured versions.
+mkdir -p /opt/python /opt/_internal
+# Unquoted on purpose: the space-separated list becomes one argument per version.
+build_cpythons $CPYTHON_VERSIONS
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -0,0 +1,239 @@
+#!/bin/bash
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+CUDNN_VERSION=9.1.0.70
+
+# Download the cuSparseLt 0.4.0.7 x86_64 archive and copy its headers and
+# libraries into /usr/local/cuda.
+function install_cusparselt_040 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    local archive="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
+    # mkdir and pushd are separate statements: in `mkdir X && pushd X` a
+    # failing mkdir is ignored by `set -e` and the download would run in
+    # the wrong directory.
+    mkdir tmp_cusparselt
+    pushd tmp_cusparselt
+    wget -q "https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${archive}.tar.xz"
+    tar xf "${archive}.tar.xz"
+    cp -a "${archive}"/include/* /usr/local/cuda/include/
+    cp -a "${archive}"/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+# Download the cuSparseLt 0.5.2.1 x86_64 archive and copy its headers and
+# libraries into /usr/local/cuda.
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    local archive="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
+    # mkdir and pushd are separate statements: in `mkdir X && pushd X` a
+    # failing mkdir is ignored by `set -e` and the download would run in
+    # the wrong directory.
+    mkdir tmp_cusparselt
+    pushd tmp_cusparselt
+    wget -q "https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${archive}.tar.xz"
+    tar xf "${archive}.tar.xz"
+    cp -a "${archive}"/include/* /usr/local/cuda/include/
+    cp -a "${archive}"/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_118 {
+    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
+    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
+    # install CUDA 11.8.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+    chmod +x cuda_11.8.0_520.61.05_linux.run
+    ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
+    rm -f cuda_11.8.0_520.61.05_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_040
+
+    ldconfig
+}
+
+function install_121 {
+    echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.1 /usr/local/cuda
+    # install CUDA 12.1.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
+    chmod +x cuda_12.1.1_530.30.02_linux.run
+    ./cuda_12.1.1_530.30.02_linux.run --toolkit --silent
+    rm -f cuda_12.1.1_530.30.02_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function install_124 {
+  echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+  # install CUDA 12.4.0 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+  chmod +x cuda_12.4.0_550.54.14_linux.run
+  ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent
+  rm -f cuda_12.4.0_550.54.14_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_052
+
+  ldconfig
+}
+
+# Shrink the CUDA 11.8 install: run nvprune over the static libraries in
+# lib64 so they only keep the listed GPU architectures, then delete the
+# visual tool directories.
+# Globals: OVERRIDE_GENCODE (read, optional) - replaces the default GENCODE list
+#          NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN, CUDA_BASE (exported)
+function prune_118 {
+    echo "Pruning CUDA 11.8 and cuDNN"
+    #####################################################################################
+    # CUDA 11.8 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"
+
+    # GENCODE_CUDNN is GENCODE plus compute_37 and compute_61.
+    export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
+    # Every entry of lib64 whose name contains ".a" is pruned in place,
+    # except names containing culibos, cudart, cudnn, cublas or metis.
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuBLAS with the GENCODE_CUDNN list; no cuDNN library is pruned here
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 11.8 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-11.8/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
+}
+
+# Shrink the CUDA 12.1 install: run nvprune over the static libraries in
+# lib64 so they only keep the listed GPU architectures, then delete the
+# visual tool directories.
+# Globals: OVERRIDE_GENCODE (read, optional) - replaces the default GENCODE list
+#          NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN, CUDA_BASE (exported)
+function prune_121 {
+  echo "Pruning CUDA 12.1"
+  #####################################################################################
+  # CUDA 12.1 prune static libs
+  #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64"
+
+    # GENCODE_CUDNN is GENCODE plus compute_61.
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    # Every entry of lib64 whose name contains ".a" is pruned in place,
+    # except names containing culibos, cudart, cudnn, cublas or metis.
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuBLAS with the GENCODE_CUDNN list; no cuDNN library is pruned here
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.1 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.1/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 $CUDA_BASE/nsight-systems-2023.1.2/
+}
+
+# Shrink the CUDA 12.4 install: run nvprune over the static libraries in
+# lib64 so they only keep the listed GPU architectures, then delete the
+# visual tool directories.
+# Globals: OVERRIDE_GENCODE (read, optional) - replaces the default GENCODE list
+#          OVERRIDE_GENCODE_CUDNN (read, optional) - replaces the default GENCODE_CUDNN list
+#          NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN, CUDA_BASE (exported)
+function prune_124 {
+  echo "Pruning CUDA 12.4"
+  #####################################################################################
+  # CUDA 12.4 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+  # GENCODE_CUDNN is GENCODE plus compute_61.
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  # Every entry of lib64 whose name contains ".a" is pruned in place,
+  # except names containing culibos, cudart, cudnn, cublas or metis.
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuBLAS with the GENCODE_CUDNN list; no cuDNN library is pruned here
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.4 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.4/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    11.8) install_118; prune_118
+        ;;
+    12.1) install_121; prune_121
+        ;;
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -0,0 +1,93 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_124 {
+  echo "Installing CUDA 12.4 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+  # install CUDA 12.4.0 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
+  chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run
+  ./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent
+  rm -f cuda_12.4.0_550.54.14_linux_sbsa.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_052
+
+  ldconfig
+}
+
+# Shrink the CUDA 12.4 install: run nvprune over the static libraries in
+# lib64 so they only keep the listed GPU architectures, then delete the
+# visual tool directories.
+# Globals: OVERRIDE_GENCODE (read, optional) - replaces the default GENCODE list
+#          NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN, CUDA_BASE (exported)
+function prune_124 {
+  echo "Pruning CUDA 12.4"
+  #####################################################################################
+  # CUDA 12.4 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+  # GENCODE_CUDNN is GENCODE plus compute_61.
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  # Every entry of lib64 whose name contains ".a" is pruned in place,
+  # except names containing culibos, cudart, cudnn, cublas or metis.
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuBLAS with the GENCODE_CUDNN list; no cuDNN library is pruned here
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.4 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.4/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
--- a/.ci/docker/common/install_libpng.sh
+++ b/.ci/docker/common/install_libpng.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+# Script used only in CD pipeline
+#
+# Downloads the libpng sources into a scratch directory, builds them with
+# the default configure options, runs `make install` and removes the
+# scratch directory again.
+
+set -ex
+
+LIBPNG_VERSION=1.6.37
+
+mkdir -p libpng
+pushd libpng
+
+# Fetched over HTTPS: the tarball is built and installed without any
+# checksum verification, so it must not be downloaded over plain HTTP.
+wget "https://download.sourceforge.net/libpng/libpng-$LIBPNG_VERSION.tar.gz"
+tar -xvzf "libpng-$LIBPNG_VERSION.tar.gz"
+
+pushd "libpng-$LIBPNG_VERSION"
+
+./configure
+make
+make install
+
+popd
+
+popd
+rm -rf libpng
--- a/.ci/docker/common/install_magma.sh
+++ b/.ci/docker/common/install_magma.sh
@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+MAGMA_VERSION="2.5.2"
+
+# Download the prebuilt MAGMA package for one CUDA version from
+# anaconda.org and move its include/ and lib/ directories into
+# /usr/local/cuda-<version>/magma.
+# Arguments: $1 - CUDA version with a dot, e.g. 12.4
+# Globals:   MAGMA_VERSION (written)
+function do_install() {
+    local cuda_version=$1
+    local cuda_version_nodot=${1/./}
+
+    MAGMA_VERSION="2.6.1"
+    local magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+    local cuda_dir="/usr/local/cuda-${cuda_version}"
+    (
+        set -x
+        tmp_dir=$(mktemp -d)
+        pushd "${tmp_dir}"
+        curl -OLs "https://anaconda.org/pytorch/magma-cuda${cuda_version_nodot}/${MAGMA_VERSION}/download/linux-64/${magma_archive}"
+        tar -xvf "${magma_archive}"
+        mkdir -p "${cuda_dir}/magma"
+        mv include "${cuda_dir}/magma/include"
+        mv lib "${cuda_dir}/magma/lib"
+        popd
+        # Drop the downloaded archive and whatever else was unpacked, so the
+        # scratch directory does not stay behind in the image.
+        rm -rf "${tmp_dir}"
+    )
+}
+
+do_install "$1"
--- a/.ci/docker/common/install_miopen.sh
+++ b/.ci/docker/common/install_miopen.sh
@ -0,0 +1,134 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+ROCM_VERSION=$1
+
+if [[ -z $ROCM_VERSION ]]; then
+    echo "missing ROCM_VERSION"
+    exit 1;
+fi
+
+# To make version comparison easier, create an integer representation.
+save_IFS="$IFS"
+IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
+IFS="$save_IFS"
+if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=0
+elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
+
+# Install custom MIOpen + COMgr for ROCm >= 4.0.1
+if [[ $ROCM_INT -lt 40001 ]]; then
+    echo "ROCm version < 4.0.1; will not install custom MIOpen"
+    exit 0
+fi
+
+# Function to retry functions that sometimes timeout or have flaky failures
+# Arguments: $@ - the command to run and its arguments
+# The command is run up to five times, sleeping 1, 2, 4 and 8 seconds
+# before the second to fifth attempt; the first success ends the chain.
+retry () {
+    # "$@" keeps every argument intact; $* would re-split arguments that
+    # contain whitespace or glob characters.
+    "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+}
+
+# Build custom MIOpen to use comgr for offline compilation.
+
+## Need a sanitized ROCM_VERSION without patchlevel; patchlevel version 0 must be added to paths.
+# ROCM_DOTS is the number of dots in ROCM_VERSION: one dot means no patch
+# level was given, so ".0" is appended to form the install path.
+ROCM_DOTS=$(echo ${ROCM_VERSION} | tr -d -c '.' | wc -c)
+if [[ ${ROCM_DOTS} == 1 ]]; then
+    ROCM_VERSION_NOPATCH="${ROCM_VERSION}"
+    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}.0"
+else
+    ROCM_VERSION_NOPATCH="${ROCM_VERSION%.*}"
+    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
+fi
+
+# cmake flags used for every MIOpen build: enable COMgr, skip the driver.
+# NOTE(review): this comment used to mention MIOPEN_USE_HIP_KERNELS as a
+# COMgr workaround, but no such flag is set here -- confirm it is obsolete.
+MIOPEN_CMAKE_COMMON_FLAGS="
+-DMIOPEN_USE_COMGR=ON
+-DMIOPEN_BUILD_DRIVER=OFF
+"
+# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
+# ROCm 5.7 up to (but not including) 6.2 exits early without building;
+# 5.0 to 5.6 select a release branch; anything else is rejected.
+if [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
+    echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
+    echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
+    echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
+elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
+elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
+elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
+elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
+elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+
+# Replace the packaged MIOpen with one built from the selected branch:
+# remove the installed package, clone and build the sources, then install
+# the rpm produced by the build.
+yum remove -y miopen-hip
+
+git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
+pushd MIOpen
+# remove .git to save disk space since CI runner was running out
+rm -rf .git
+# Don't build MLIR to save docker build time
+# since we are disabling MLIR backend for MIOpen anyway
+if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    sed -i '/rocMLIR/d' requirements.txt
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    sed -i '/llvm-project-mlir/d' requirements.txt
+fi
+## MIOpen minimum requirements
+cmake -P install_deps.cmake --minimum
+
+# clean up since CI runner was running out of disk space
+rm -rf /tmp/*
+yum clean all
+rm -rf /var/cache/yum
+rm -rf /var/lib/yum/yumdb
+rm -rf /var/lib/yum/history
+
+## Build MIOpen
+mkdir -p build
+cd build
+# The two flag variables are left unquoted so that each flag becomes its
+# own cmake argument.
+PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
+    ${MIOPEN_CMAKE_COMMON_FLAGS} \
+    ${MIOPEN_CMAKE_DB_FLAGS} \
+    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
+make MIOpen -j $(nproc)
+
+# Build MIOpen package
+make -j $(nproc) package
+
+# clean up since CI runner was running out of disk space
+rm -rf /usr/local/cget
+
+# Installs the rpm(s) matching miopen-*.rpm in the build directory.
+yum install -y miopen-*.rpm
+
+# popd returns to the directory the script was in before `pushd MIOpen`.
+popd
+rm -rf MIOpen
--- a/.ci/docker/common/install_mkl.sh
+++ b/.ci/docker/common/install_mkl.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+set -ex
+
+# MKL
+MKL_VERSION=2024.2.0
+
+MKLROOT=/opt/intel
+mkdir -p ${MKLROOT}
+pushd /tmp
+
+python3 -mpip install wheel
+python3 -mpip download -d . mkl-static==${MKL_VERSION}
+python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
+python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
+mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}
+mv mkl_include-${MKL_VERSION}/mkl_include-${MKL_VERSION}.data/data/include ${MKLROOT}
--- a/.ci/docker/common/install_mnist.sh
+++ b/.ci/docker/common/install_mnist.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+mkdir -p /usr/local/mnist/
+
+cd /usr/local/mnist
+
+for img in train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz; do
+  wget -q https://ossci-datasets.s3.amazonaws.com/mnist/$img
+  gzip -d $img
+done
--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+cd /
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules
+
+
+OPENBLAS_BUILD_FLAGS="
+NUM_THREADS=128
+USE_OPENMP=1
+NO_SHARED=0
+DYNAMIC_ARCH=1
+TARGET=ARMV8
+CFLAGS=-O3
+"
+
+OPENBLAS_CHECKOUT_DIR="OpenBLAS"
+
+make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
+make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
--- a/.ci/docker/common/install_patchelf.sh
+++ b/.ci/docker/common/install_patchelf.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+# Pin the version to latest release 0.17.2, building newer commit starts
+# to fail on the current image
+git clone -b 0.17.2 --single-branch https://github.com/NixOS/patchelf
+cd patchelf
+sed -i 's/serial/parallel/g' configure.ac
+./bootstrap.sh
+./configure
+make
+make install
+cd ..
+rm -rf patchelf
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -0,0 +1,150 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+###########################
+### prereqs
+###########################
+# Install Python packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    apt-get update -y
+    apt-get install -y libpciaccess-dev pkg-config
+    apt-get clean
+    ;;
+  centos)
+    yum install -y libpciaccess-devel pkgconfig
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
+python3 -m pip install meson ninja
+
+###########################
+### clone repo
+###########################
+GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+pushd drm
+
+###########################
+### patch
+###########################
+patch -p1 <<'EOF'
+diff --git a/amdgpu/amdgpu_asic_id.c b/amdgpu/amdgpu_asic_id.c
+index a5007ffc..13fa07fc 100644
+--- a/amdgpu/amdgpu_asic_id.c
+++ b/amdgpu/amdgpu_asic_id.c
+@@ -22,6 +22,13 @@
+  *
+  */
+
+#define _XOPEN_SOURCE 700
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <ftw.h>
+#include <link.h>
+#include <limits.h>
+
+ #include <ctype.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+@@ -34,6 +41,19 @@
+ #include "amdgpu_drm.h"
+ #include "amdgpu_internal.h"
+
+static char *amdgpuids_path = NULL;
+static const char* amdgpuids_path_msg = NULL;
+
+static int check_for_location_of_amdgpuids(const char *filepath, const struct stat *info, const int typeflag, struct FTW *pathinfo)
+{
+	if (typeflag == FTW_F && strstr(filepath, "amdgpu.ids")) {
+		amdgpuids_path = strdup(filepath);
+		return 1;
+	}
+
+	return 0;
+}
+
+ static int parse_one_line(struct amdgpu_device *dev, const char *line)
+ {
+ 	char *buf, *saveptr;
+@@ -113,10 +133,46 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+ 	int line_num = 1;
+ 	int r = 0;
+
+	// attempt to find typical location for amdgpu.ids file
+ 	fp = fopen(AMDGPU_ASIC_ID_TABLE, "r");
+
+	// if it doesn't exist, search
+	if (!fp) {
+
+	char self_path[ PATH_MAX ];
+	ssize_t count;
+	ssize_t i;
+
+	count = readlink( "/proc/self/exe", self_path, PATH_MAX );
+	if (count > 0) {
+		self_path[count] = '\0';
+
+		// remove '/bin/python' from self_path
+		for (i=count; i>0; --i) {
+			if (self_path[i] == '/') break;
+			self_path[i] = '\0';
+		}
+		self_path[i] = '\0';
+		for (; i>0; --i) {
+			if (self_path[i] == '/') break;
+			self_path[i] = '\0';
+		}
+		self_path[i] = '\0';
+
+		if (1 == nftw(self_path, check_for_location_of_amdgpuids, 5, FTW_PHYS)) {
+			fp = fopen(amdgpuids_path, "r");
+			amdgpuids_path_msg = amdgpuids_path;
+		}
+	}
+
+	}
+	else {
+		amdgpuids_path_msg = AMDGPU_ASIC_ID_TABLE;
+	}
+
+	// both hard-coded location and search have failed
+ 	if (!fp) {
+-		fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
+-			strerror(errno));
+		fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+ 		return;
+ 	}
+
+@@ -132,7 +188,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+ 			continue;
+ 		}
+
+-		drmMsg("%s version: %s\n", AMDGPU_ASIC_ID_TABLE, line);
+		drmMsg("%s version: %s\n", amdgpuids_path_msg, line);
+ 		break;
+ 	}
+
+@@ -150,7 +206,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
+
+ 	if (r == -EINVAL) {
+ 		fprintf(stderr, "Invalid format: %s: line %d: %s\n",
+-			AMDGPU_ASIC_ID_TABLE, line_num, line);
+			amdgpuids_path_msg, line_num, line);
+ 	} else if (r && r != -EAGAIN) {
+ 		fprintf(stderr, "%s: Cannot parse ASIC IDs: %s\n",
+ 			__func__, strerror(-r));
+EOF
+
+###########################
+### build
+###########################
+meson builddir --prefix=/opt/amdgpu
+pushd builddir
+ninja install
+
+popd
+popd
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -1,7 +1,11 @@
 #!/bin/bash
+# Script used in CI and CD pipeline

 set -ex

+
+MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
+
 # "install" hipMAGMA into /opt/rocm/magma by copying after build
 git clone https://bitbucket.org/icl/magma.git
 pushd magma
@ -11,7 +15,10 @@ git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6

 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
 echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc
+if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
+    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
+fi
+echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
 echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
 export PATH="${PATH}:/opt/rocm/bin"
 if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
@ -25,7 +32,7 @@ done
 # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
 sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
 make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
-make testing/testing_dgemm -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
+LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
+make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
 popd
 mv magma /opt/rocm
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -1,6 +1,6 @@
 #!/bin/bash
 set -xe
-
+# Script used in CI and CD pipeline

 # Intel® software for general purpose GPU capabilities.
 # Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
@ -8,19 +8,23 @@ set -xe
 # Users should update to the latest version as it becomes available

 function install_ubuntu() {
+    . /etc/os-release
+    if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
+        echo "Ubuntu version ${VERSION_CODENAME} not supported"
+        exit
+    fi
+
    apt-get update -y
    apt-get install -y gpg-agent wget
-
-    # Set up the repository. To do this, download the key to the system keyring
+    # To add the online network package repository for the GPU Driver LTS releases
    wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
-        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
-
-    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
+        | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
-        https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
-        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
+        https://repositories.intel.com/gpu/ubuntu ${VERSION_CODENAME}/lts/2350 unified" \
+        | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
+    # To add the online network package repository for the Intel Support Packages
+    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+        | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
@ -97,6 +101,86 @@ EOF
    rm -rf /var/lib/yum/history
 }

+function install_rhel() {
+    . /etc/os-release
+    if [[ "${ID}" == "rhel" ]]; then
+        if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+            echo "RHEL version ${VERSION_ID} not supported"
+            exit
+        fi
+    elif [[ "${ID}" == "almalinux" ]]; then
+        # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
+        VERSION_ID="8.6"
+    fi
+
+    dnf install -y 'dnf-command(config-manager)'
+    # To add the online network package repository for the GPU Driver LTS releases
+    dnf config-manager --add-repo \
+        https://repositories.intel.com/gpu/rhel/${VERSION_ID}/lts/2350/unified/intel-gpu-${VERSION_ID}.repo
+    # To add the online network network package repository for the Intel Support Packages
+    tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
+[intel-for-pytorch-gpu-dev]
+name=Intel for Pytorch GPU dev repository
+baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+EOF
+
+    # The xpu-smi packages
+    dnf install -y xpu-smi
+    # Compute and Media Runtimes
+    dnf install -y \
+        intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\
+        level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
+        mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
+        mesa-libxatracker libvpl-tools intel-metrics-discovery \
+        intel-metrics-library intel-igc-core intel-igc-cm \
+        libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc
+    # Development packages
+    dnf install -y --refresh \
+        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
+        level-zero-devel
+    # Install Intel Support Packages
+    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
+
+    # Cleanup
+    dnf clean all
+    rm -rf /var/cache/yum
+    rm -rf /var/lib/yum/yumdb
+    rm -rf /var/lib/yum/history
+}
+
+# Install the Intel GPU driver packages and the Intel Support Packages with
+# zypper. VERSION_ID from /etc/os-release is rewritten from "15.4" style to
+# "15sp4" style; only the releases listed below are accepted, and the script
+# exits non-zero for any other one.
+function install_sles() {
+    . /etc/os-release
+    VERSION_SP=${VERSION_ID//./sp}
+    if [[ ! " 15sp4 15sp5 " =~ " ${VERSION_SP} " ]]; then
+        echo "SLES version ${VERSION_ID} not supported" >&2
+        # Fail the build instead of reporting success with nothing installed
+        exit 1
+    fi
+
+    # To add the online network package repository for the GPU Driver LTS releases
+    zypper addrepo -f -r \
+        https://repositories.intel.com/gpu/sles/${VERSION_SP}/lts/2350/unified/intel-gpu-${VERSION_SP}.repo
+    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
+    # To add the online network package repository for the Intel Support Packages
+    zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
+    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+
+    # The xpu-smi packages
+    zypper install -y lsb-release flex bison xpu-smi
+    # Compute and Media Runtimes
+    zypper install -y intel-level-zero-gpu level-zero intel-gsc intel-opencl intel-ocloc \
+        intel-media-driver libigfxcmrt7 libvpl2 libvpl-tools libmfxgen1 libmfx1
+    # Development packages
+    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel
+
+    # Install Intel Support Packages
+    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
+}
+

 # The installation depends on the base OS
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
@ -107,6 +191,12 @@ case "$ID" in
    centos)
        install_centos
    ;;
+    rhel|almalinux)
+        install_rhel
+    ;;
+    sles)
+        install_sles
+    ;;
    *)
        echo "Unable to determine OS..."
        exit 1
--- a/.ci/docker/conda/Dockerfile
+++ b/.ci/docker/conda/Dockerfile
@ -0,0 +1,101 @@
+ARG CUDA_VERSION=10.2
+ARG BASE_TARGET=cuda${CUDA_VERSION}
+FROM centos:7 AS base
+
+ENV LC_ALL=en_US.UTF-8 \
+    LANG=en_US.UTF-8 \
+    LANGUAGE=en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+# Point the yum repo files at vault.centos.org and use baseurl instead of mirrorlist
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+RUN yum update -y
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
+# Mark every directory as a git safe.directory, git is used in multiple places
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Repo files changed again with the SCL repo, so repeat the same rewrite
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+# Compiler toolchain from devtoolset
+RUN yum install -y \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+        devtoolset-${DEVTOOLSET_VERSION}-binutils
+# EPEL provides the cmake3 package installed next
+RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
+    rpm -ivh epel-release-latest-7.noarch.rpm && \
+    rm -f epel-release-latest-7.noarch.rpm
+# cmake3, also reachable as plain "cmake"
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+# Put the devtoolset binaries and libraries ahead of the system ones
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN yum install -y autoconf aclocal automake make sudo
+RUN rm -rf /usr/local/cuda-*
+
+# patchelf: build it and leave a copy of the binary at /patchelf
+FROM base AS patchelf
+COPY ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf
+
+# openssl
+FROM base AS openssl
+COPY ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# Anaconda
+FROM base AS conda
+COPY ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+# Common parent of the per-version CUDA stages below
+FROM base AS cuda
+ARG CUDA_VERSION=10.2
+RUN rm -rf /usr/local/cuda-*
+COPY ./common/install_cuda.sh install_cuda.sh
+ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
+# Preserve CUDA_VERSION for the builds
+ENV CUDA_VERSION=${CUDA_VERSION}
+# Make things in our path by default
+ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
+
+FROM cuda AS cuda11.8
+RUN bash ./install_cuda.sh 11.8
+ENV DESIRED_CUDA=11.8
+
+FROM cuda AS cuda12.1
+RUN bash ./install_cuda.sh 12.1
+ENV DESIRED_CUDA=12.1
+
+FROM cuda AS cuda12.4
+RUN bash ./install_cuda.sh 12.4
+ENV DESIRED_CUDA=12.4
+
+# MNIST test data
+FROM base AS mnist
+COPY ./common/install_mnist.sh install_mnist.sh
+RUN bash ./install_mnist.sh
+
+# Every CUDA toolkit side by side on top of the plain base image
+FROM base AS all_cuda
+COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
+COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
+COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
+
+# Final image: the stage selected by BASE_TARGET plus the artifacts of the helper stages
+FROM ${BASE_TARGET} AS final
+COPY --from=openssl /opt/openssl /opt/openssl
+COPY --from=patchelf /patchelf /usr/local/bin/patchelf
+COPY --from=conda /opt/conda /opt/conda
+
+# jni.h is needed for the java host build
+COPY ./common/install_jni.sh install_jni.sh
+COPY ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+ENV PATH=/opt/conda/bin:$PATH
+COPY --from=mnist /usr/local/mnist /usr/local/mnist
+RUN rm -rf /usr/local/cuda
+# Give non-owner users read/write access to /usr/local, /.condarc and /opt/conda
+RUN chmod o+rw /usr/local
+RUN touch /.condarc && \
+    chmod o+rw /.condarc && \
+    chmod -R o+rw /opt/conda
--- a/.ci/docker/conda/build.sh
+++ b/.ci/docker/conda/build.sh
@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+image="$1"
+shift
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+
+DOCKER_IMAGE_NAME="pytorch/${image}"
+
+
+export DOCKER_BUILDKIT=1
+TOPDIR=$(git rev-parse --show-toplevel)
+
+CUDA_VERSION=${CUDA_VERSION:-12.1}
+
+case ${CUDA_VERSION} in
+  cpu)
+    BASE_TARGET=base
+    DOCKER_TAG=cpu
+    ;;
+  all)
+    BASE_TARGET=all_cuda
+    DOCKER_TAG=latest
+    ;;
+  *)
+    BASE_TARGET=cuda${CUDA_VERSION}
+    DOCKER_TAG=cuda${CUDA_VERSION}
+    ;;
+esac
+
+
+(
+  set -x
+  docker build \
+    --target final \
+    --progress plain \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+    --build-arg "DEVTOOLSET_VERSION=9" \
+    -t ${DOCKER_IMAGE_NAME} \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
+    ${TOPDIR}/.ci/docker/
+)
+
+if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
+  # Test that we're using the right CUDA compiler
+  (
+    set -x
+    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  )
+fi
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  (
+    set -x
+    docker push "${DOCKER_IMAGE_NAME}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+        docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+        docker push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -0,0 +1,107 @@
+ARG BASE_TARGET=base
+ARG GPU_IMAGE=ubuntu:20.04
+FROM ${GPU_IMAGE} AS base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get clean && apt-get update
+RUN apt-get install -y curl locales g++ git-all autoconf automake make cmake wget unzip sudo
+# Mark every directory as a git safe.directory, git is used in multiple places
+RUN git config --global --add safe.directory '*'
+
+RUN locale-gen en_US.UTF-8
+
+ENV LC_ALL=en_US.UTF-8 \
+    LANG=en_US.UTF-8 \
+    LANGUAGE=en_US.UTF-8
+
+# openssl
+FROM base AS openssl
+COPY ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# python: install the build dependencies, then run install_cpython.sh
+FROM base AS python
+COPY common/install_cpython.sh install_cpython.sh
+RUN apt-get update -y && \
+    apt-get install build-essential gdb lcov libbz2-dev libffi-dev \
+        libgdbm-dev liblzma-dev libncurses5-dev libreadline6-dev \
+        libsqlite3-dev libssl-dev lzma lzma-dev tk-dev uuid-dev zlib1g-dev -y && \
+    bash ./install_cpython.sh && \
+    rm install_cpython.sh && \
+    apt-get clean
+
+# Anaconda
+FROM base AS conda
+COPY ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+# cpu: base plus conda, python and MKL
+FROM base AS cpu
+COPY --from=conda /opt/conda /opt/conda
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
+COPY ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+# Common parent of the per-version CUDA stages: only stages the install scripts
+FROM cpu AS cuda
+COPY ./common/install_cuda.sh install_cuda.sh
+COPY ./common/install_magma.sh install_magma.sh
+ENV CUDA_HOME=/usr/local/cuda
+
+# Each CUDA stage installs one toolkit plus magma and links it as /usr/local/cuda
+FROM cuda AS cuda11.8
+RUN bash ./install_cuda.sh 11.8
+RUN bash ./install_magma.sh 11.8
+RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
+
+FROM cuda AS cuda12.1
+RUN bash ./install_cuda.sh 12.1
+RUN bash ./install_magma.sh 12.1
+RUN ln -sf /usr/local/cuda-12.1 /usr/local/cuda
+
+FROM cuda AS cuda12.4
+RUN bash ./install_cuda.sh 12.4
+RUN bash ./install_magma.sh 12.4
+RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
+
+FROM cpu AS rocm
+ARG PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ENV MKLROOT=/opt/intel
+# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
+# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
+# Remove below when ROCm5.7 is not in support matrix anymore.
+ENV ROCM_PATH=/opt/rocm
+# No need to install ROCm as base docker image should have full ROCm install
+#ADD ./common/install_rocm.sh install_rocm.sh
+COPY ./common/install_rocm_drm.sh install_rocm_drm.sh
+COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
+# gfortran and python needed for building magma from source for ROCm
+RUN apt-get update -y && \
+    apt-get install gfortran -y && \
+    apt-get install python -y && \
+    apt-get clean
+
+RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
+
+# AOTriton, installed under /opt/rocm
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/aotriton_version.txt aotriton_version.txt
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
+ENV AOTRITON_INSTALLED_PREFIX=/opt/rocm/aotriton
+
+# Final image: the stage selected by BASE_TARGET plus openssl, patchelf, conda and python
+FROM ${BASE_TARGET} AS final
+COPY --from=openssl /opt/openssl /opt/openssl
+# patchelf is built directly in this stage
+COPY ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+# Anaconda
+COPY --from=conda /opt/conda /opt/conda
+# python
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+
+set -eou pipefail
+
+image="$1"
+shift
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+
+DOCKER_IMAGE="pytorch/${image}"
+
+TOPDIR=$(git rev-parse --show-toplevel)
+
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+WITH_PUSH=${WITH_PUSH:-}
+
+DOCKER=${DOCKER:-docker}
+
+case ${GPU_ARCH_TYPE} in
+    cpu)
+        BASE_TARGET=cpu
+        DOCKER_TAG=cpu
+        GPU_IMAGE=ubuntu:20.04
+        DOCKER_GPU_BUILD_ARG=""
+        ;;
+    cuda)
+        BASE_TARGET=cuda${GPU_ARCH_VERSION}
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        GPU_IMAGE=ubuntu:20.04
+        DOCKER_GPU_BUILD_ARG=""
+        ;;
+    rocm)
+        BASE_TARGET=rocm
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
+        else
+            echo "ERROR: rocm regex failed"
+            exit 1
+        fi
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
+            PYTORCH_ROCM_ARCH+=";gfx942"
+        fi
+        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
+        ;;
+    *)
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
+        exit 1
+        ;;
+esac
+
+
+(
+    set -x
+    DOCKER_BUILDKIT=1 ${DOCKER} build \
+         --target final \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+        "${TOPDIR}/.ci/docker/"
+
+)
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+  (
+    set -x
+    ${DOCKER} push "${DOCKER_IMAGE}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+        ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
+        ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -0,0 +1,203 @@
+# syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=11.8
+
+ARG GPU_IMAGE=centos:7
+FROM centos:7 AS base
+
+ENV LC_ALL=en_US.UTF-8 \
+    LANG=en_US.UTF-8 \
+    LANGUAGE=en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+# Note: this patch is required since CentOS has reached EOL,
+# otherwise any yum install step will fail
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+# Mark every directory as a git safe.directory, git is used in multiple places
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Note: after running yum-config-manager --enable rhel-server-rhscl-7-rpms
+# the patch is required once again. Somehow this step adds mirror.centos.org
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+# Compiler toolchain from devtoolset, placed ahead of the system one
+RUN yum install -y \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+        devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# EPEL repository
+RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
+    rpm -ivh epel-release-latest-7.noarch.rpm && \
+    rm -f epel-release-latest-7.noarch.rpm
+
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+RUN yum install -y autoconf aclocal automake make sudo
+
+FROM base AS openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+COPY ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# patchelf: build it and leave a copy of the binary at /patchelf
+FROM base AS patchelf
+COPY ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+RUN cp $(which patchelf) /patchelf
+
+# python: built on top of the patchelf stage by build_scripts/build.sh
+FROM patchelf AS python
+COPY manywheel/build_scripts /build_scripts
+COPY ./common/install_cpython.sh /build_scripts/install_cpython.sh
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+# CUDA toolkit for BASE_CUDA_VERSION
+FROM base AS cuda
+ARG BASE_CUDA_VERSION=10.2
+COPY ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
+
+# MKL
+FROM base AS intel
+COPY ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+# magma for BASE_CUDA_VERSION
+FROM base AS magma
+ARG BASE_CUDA_VERSION=10.2
+COPY ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+# java jni header
+FROM base AS jni
+COPY ./common/install_jni.sh install_jni.sh
+COPY ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+# libpng
+FROM base AS libpng
+COPY ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+# common: starts from GPU_IMAGE and collects the artifacts of the helper stages
+FROM ${GPU_IMAGE} AS common
+# Point the yum repo files at vault.centos.org and use baseurl instead of mirrorlist
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+ENV LC_ALL=en_US.UTF-8 \
+    LANG=en_US.UTF-8 \
+    LANGUAGE=en_US.UTF-8
+RUN yum install -y \
+        aclocal \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm
+# IUS and EPEL release packages; git is then swapped for git236-core
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+RUN yum swap -y git git236-core
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# Bring in what the earlier stages produced
+COPY --from=openssl /opt/openssl /opt/openssl
+COPY --from=python /opt/python /opt/python
+COPY --from=python /opt/_internal /opt/_internal
+COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
+COPY --from=intel /opt/intel /opt/intel
+COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
+COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
+COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
+COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
+COPY --from=libpng /usr/local/include/png* /usr/local/include/
+COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
+COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
+COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
+
+# cpu_final: common plus the devtoolset toolchain, cmake and ninja
+FROM common AS cpu_final
+ARG BASE_CUDA_VERSION=10.1
+ARG DEVTOOLSET_VERSION=9
+# Point the yum repo files at vault.centos.org and use baseurl instead of mirrorlist
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Same rewrite again after the SCL repo has been enabled
+RUN sed -i 's/mirror.centos.org/vault.centos.org/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^#.*baseurl=http/baseurl=http/g' /etc/yum.repos.d/*.repo && \
+    sed -i 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo
+RUN yum install -y \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+        devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+        devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake is already installed inside the rocm base image, so remove if present
+RUN rpm -e cmake || true
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+# ninja
+RUN yum install -y ninja-build
+
+# cuda_final: cpu_final plus the CUDA toolkit and magma, linked as /usr/local/cuda
+FROM cpu_final AS cuda_final
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
+RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+# rocm_final: cpu_final plus drm, magma, MIOpen and AOTriton for ROCm
+FROM cpu_final AS rocm_final
+ARG ROCM_VERSION=3.7
+ARG PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
+# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
+# Remove below when ROCm5.7 is not in support matrix anymore.
+ENV ROCM_PATH=/opt/rocm
+ENV MKLROOT=/opt/intel
+# No need to install ROCm as base docker image should have full ROCm install
+#ADD ./common/install_rocm.sh install_rocm.sh
+#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh
+COPY ./common/install_rocm_drm.sh install_rocm_drm.sh
+RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
+# cmake3 is needed for the MIOpen build
+RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
+COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
+COPY ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
+
+# AOTriton, installed under /opt/rocm
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/aotriton_version.txt aotriton_version.txt
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
+ENV AOTRITON_INSTALLED_PREFIX=/opt/rocm/aotriton
--- a/.ci/docker/manywheel/Dockerfile_2014
+++ b/.ci/docker/manywheel/Dockerfile_2014
@ -0,0 +1,152 @@
+# syntax = docker/dockerfile:experimental
+# Global build arguments. GPU_IMAGE is the image the `common` stage starts from.
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=10.2
+ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
+# Builder base: the manylinux2014 image, from which the stages below derive
+FROM quay.io/pypa/manylinux2014_x86_64 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+# Repoint yum at vault.centos.org: swap the host name, uncomment the baseurl
+# entries and comment out the mirrorlist entries.
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+RUN yum install -y yum-utils centos-release-scl sudo
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# devtoolset-7 compilers; PATH and LD_LIBRARY_PATH put them ahead of the system ones
+RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
+ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake: install cmake3 and expose it under the plain `cmake` name
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+# Builder stage whose /opt/openssl tree is copied into `common`
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+
+
+# remove unnecessary python versions
+# NOTE(review): `common` copies /opt/python and /opt/_internal from `base`, not
+# from this stage, so these removals do not appear to reach the final images - confirm.
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+
+# Independent builder stages. Each runs one installer script on top of `base`;
+# later stages pick the results up with COPY --from=<stage>.
+FROM base as cuda
+ARG BASE_CUDA_VERSION=10.2
+# Install CUDA
+ADD ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
+
+FROM base as intel
+# MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as magma
+ARG BASE_CUDA_VERSION=10.2
+# Install magma
+ADD ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+FROM base as jni
+# Install java jni header
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+# Install libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+# Shared base of the final images, started from the caller-selected GPU_IMAGE
+FROM ${GPU_IMAGE} as common
+# Same vault.centos.org repo rewrite as in the `base` stage
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+RUN yum install -y \
+        aclocal \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm
+# Add the IUS and EPEL repositories, then replace the stock git with git236-core
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+RUN yum swap -y git git236-core
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# Collect the artifacts produced by the builder stages
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=base               /opt/python                           /opt/python
+COPY --from=base               /opt/_internal                        /opt/_internal
+COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=base               /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+
+# CPU image: `common` plus the devtoolset-7 compilers, cmake and ninja
+FROM common as cpu_final
+ARG BASE_CUDA_VERSION=10.2
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
+ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+# ninja
+# NOTE(review): the repository RPM below is fetched over plain http - confirm
+# this source is still intended.
+RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
+RUN yum install -y ninja-build
+
+# CUDA image: cpu_final with the toolkit tree from `cuda` overlaid by the tree
+# from `magma` (both are copied to the same destination directory).
+FROM cpu_final as cuda_final
+# Start from an empty toolkit directory
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+
+# ROCm image, built directly on `common` rather than on cpu_final
+FROM common as rocm_final
+ARG ROCM_VERSION=3.7
+# Install ROCm
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
+# `rm -f` clears any existing /usr/bin/cmake so that the symlink can be created.
+RUN yum install -y cmake3 && \
+    rm -f /usr/bin/cmake && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -0,0 +1,153 @@
+# syntax = docker/dockerfile:experimental
+# Global build arguments. GPU_IMAGE is the image the `common` stage starts from.
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=11.8
+ARG GPU_IMAGE=amd64/almalinux:8
+# Builder base: the manylinux_2_28 image, from which the stages below derive
+FROM quay.io/pypa/manylinux_2_28_x86_64 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+# gcc-toolset version to install; PATH and LD_LIBRARY_PATH put it first
+ARG DEVTOOLSET_VERSION=11
+RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake-3.18.4 from pip
+# The pip-installed binary is additionally exposed under the name cmake3
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake3
+
+# Builder stage whose /opt/openssl tree is copied into `common`
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+
+# remove unnecessary python versions
+# NOTE(review): `common` copies /opt/python and /opt/_internal from `base`, not
+# from this stage, so these removals do not appear to reach the final images - confirm.
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+
+# Builder stage: runs the CUDA installer for BASE_CUDA_VERSION on top of `base`
+FROM base as cuda
+ARG BASE_CUDA_VERSION=11.8
+# Install CUDA
+ADD ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
+
+# Builder stage: its /opt/intel tree is copied into `common`
+FROM base as intel
+# MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+# Builder stage: runs the magma installer for BASE_CUDA_VERSION on top of `base`
+FROM base as magma
+# The default matches the 11.8 default of the global ARG and of the `cuda` and
+# `cpu_final` stages in this Dockerfile, so that cuda_final finds
+# /usr/local/cuda-${BASE_CUDA_VERSION} in this stage even when no
+# --build-arg BASE_CUDA_VERSION is supplied.
+ARG BASE_CUDA_VERSION=11.8
+# Install magma
+ADD ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+# Builder stage: `common` copies /usr/local/include/jni.h from here
+FROM base as jni
+# Install java jni header
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+# Builder stage: `common` copies the png/libpng files under /usr/local from here
+FROM base as libpng
+# Install libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+# Shared base of the final images, started from the caller-selected GPU_IMAGE
+FROM ${GPU_IMAGE} as common
+ARG DEVTOOLSET_VERSION=11
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \
+        glibc-langpack-en
+
+# Add the IUS and EPEL repositories, then replace the stock git with git236-core
+# NOTE(review): these are the el7 release RPMs while the default GPU_IMAGE is
+# almalinux:8 - confirm this pairing is intended.
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+RUN yum swap -y git git236-core
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# Collect the artifacts produced by the builder stages
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=base               /opt/python                           /opt/python
+COPY --from=base               /opt/_internal                        /opt/_internal
+COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=base               /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+
+# CPU image: `common` with the gcc-toolset on PATH and cmake installed from pip
+FROM common as cpu_final
+ARG BASE_CUDA_VERSION=11.8
+ARG DEVTOOLSET_VERSION=11
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake-3.18.4 from pip
+# The pip-installed binary is additionally exposed under the name cmake3
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake3
+
+# CUDA image: cpu_final with the toolkit tree from `cuda` overlaid by the tree
+# from `magma` (both are copied to the same destination directory).
+FROM cpu_final as cuda_final
+# Start from an empty toolkit directory
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+
+# ROCm image, built directly on `common` rather than on cpu_final
+FROM common as rocm_final
+ARG ROCM_VERSION=3.7
+# Install ROCm
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
+# `rm -f` clears any existing /usr/bin/cmake so that the symlink can be created.
+RUN yum install -y cmake3 && \
+    rm -f /usr/bin/cmake && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
+
+# XPU image: cpu_final with a newer pip-installed cmake and the XPU installer run
+FROM cpu_final as xpu_final
+# cmake-3.28.4 from pip
+RUN python3 -m pip install --upgrade pip && \
+    python3 -mpip install cmake==3.28.4
+ADD ./common/install_xpu.sh install_xpu.sh
+RUN bash ./install_xpu.sh && rm install_xpu.sh
+# Unpack the static-libs archive in place under /opt/_internal
+# (pushd/popd require the RUN shell to be bash)
+RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -0,0 +1,57 @@
+# Base stage: manylinux_2_28 aarch64 image plus OS build tools and a gcc-toolset
+FROM quay.io/pypa/manylinux_2_28_aarch64 as base
+
+# Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8.
+# NOTE(review): the toolset installed below defaults to version 11, not 12 -
+# confirm the statement about the default compiler.
+ARG GCCTOOLSET_VERSION=11
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Installed needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  less \
+  libffi-devel \
+  libgomp \
+  make \
+  openssl-devel \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  zstd \
+  sudo \
+  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+# Final image: `base` with the listed old CPython directories removed
+FROM base as final
+
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
--- a/.ci/docker/manywheel/Dockerfile_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@ -0,0 +1,94 @@
+# Base stage: manylinux2014 aarch64 image plus OS build tools, a devtoolset
+# compiler, a replacement libgfortran.a and cmake
+FROM quay.io/pypa/manylinux2014_aarch64 as base
+
+
+# Graviton needs GCC 10 for the build
+ARG DEVTOOLSET_VERSION=10
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Installed needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+  devtoolset-${DEVTOOLSET_VERSION}-binutils
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+###############################################################################
+# libgfortran.a hack
+#
+# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
+# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. To solve, get
+# ubuntu's libgfortran.a which is compiled with -fPIC
+# NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vendor, or changed
+# NOTE(review): the destination path below hard-codes devtoolset-10 and GCC 10
+# rather than using DEVTOOLSET_VERSION, and the downloaded/extracted files are
+# left in ~/ - confirm both are acceptable.
+###############################################################################
+RUN cd ~/ \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \
+  && ar x ~/libgfortran-10-dev.deb \
+  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
+  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
+
+# install cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+# Stage that the `final` image is built from
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# Builder stage: `final` copies /opt/OpenBLAS/ from here
+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
+# Final image: the `openssl` stage minus old CPython directories, plus OpenBLAS
+FROM openssl as final
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
+# Put the copied OpenBLAS libraries on the library search path
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@ -0,0 +1,91 @@
+# Base stage: manylinux_2_28 aarch64 image plus OS build tools and a gcc-toolset
+FROM quay.io/pypa/manylinux_2_28_aarch64 as base
+
+# Cuda ARM build needs gcc 11
+ARG DEVTOOLSET_VERSION=11
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Installed needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+# Stage that `final` is built from
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# `openssl` stage minus old CPython directories; parent of cuda_final
+FROM openssl as final
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+
+# Independent builder stages; cuda_final copies their output directories.
+# BASE_CUDA_VERSION has no default here and must be supplied via --build-arg.
+FROM base as cuda
+ARG BASE_CUDA_VERSION
+# Install CUDA
+ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
+RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh
+
+FROM base as magma
+ARG BASE_CUDA_VERSION
+# Install magma
+ADD ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
+# CUDA image: `final` plus the CUDA, magma and OpenBLAS trees from the builder stages
+FROM final as cuda_final
+ARG BASE_CUDA_VERSION
+# Start from an empty toolkit directory
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
+# Point the unversioned /usr/local/cuda path at the versioned install
+RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_cxx11-abi
+++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi
@ -0,0 +1,71 @@
+# Base stage: CentOS 8 with vault repositories, common tools and the gcc-toolset-11 C++ compiler
+FROM centos:8 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+# PATH is set outright (not prepended); the gcc-toolset-11 directory comes first
+ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+
+# change to a valid repo
+RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo
+# enable to install ninja-build
+RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo
+
+RUN yum -y update
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo
+RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++
+
+
+# Independent builder stages; `final` copies their outputs with COPY --from.
+FROM base as openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# Install python
+FROM base as python
+RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel
+ADD common/install_cpython.sh install_cpython.sh
+RUN bash ./install_cpython.sh && rm install_cpython.sh
+
+# Install conda, then cmake through conda
+FROM base as conda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+RUN /opt/conda/bin/conda install -y cmake
+
+FROM base as intel
+# Install MKL
+# The python and conda trees are copied in and conda is put on PATH before the
+# MKL installer runs.
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=conda              /opt/conda                            /opt/conda
+ENV PATH=/opt/conda/bin:$PATH
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+# Also place a copy of the binary at /patchelf
+RUN cp $(which patchelf) /patchelf
+
+FROM base as jni
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+# Final image: `base` plus the artifacts of every builder stage, plus ninja
+FROM base as final
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=conda              /opt/conda                            /opt/conda
+COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+
+# The PowerTools repository was enabled in `base` so that this package can be installed
+RUN yum install -y ninja-build
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -0,0 +1,73 @@
+# Base stage: Ubuntu 24.04 for s390x with the toolchain and Python/BLAS packages
+FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base
+
+# Language variables
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+ENV LANGUAGE=C.UTF-8
+
+# Installed needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+# `&&` (not `;`) so that a failed index refresh fails the build instead of
+# silently continuing with stale package lists.
+RUN apt update && apt upgrade -y
+RUN apt install -y \
+  build-essential \
+  autoconf \
+  automake \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz-utils \
+  less \
+  zstd \
+  cmake \
+  python3 \
+  python3-dev \
+  python3-setuptools \
+  python3-yaml \
+  python3-typing-extensions \
+  libblas-dev \
+  libopenblas-dev \
+  liblapack-dev \
+  libatlas-base-dev
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+# Stage that the `final` image is built from
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# Builder stage for patchelf; also the parent of the `python` stage
+FROM base as patchelf
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+# Also place a copy of the binary at /patchelf
+RUN cp $(which patchelf) /patchelf
+
+FROM patchelf as python
+# build python
+# The build scripts are copied to /build_scripts, run, and removed in the same layer
+COPY manywheel/build_scripts /build_scripts
+ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+# Final image: the `openssl` stage plus the Python trees, auditwheel and patchelf
+FROM openssl as final
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+# auditwheel is taken from the cp39 interpreter's bin directory
+COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
+COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# Script used only in CD pipeline
+#
+# Usage: build.sh IMAGE [extra `docker build` arguments...]
+#
+# Builds pytorch/IMAGE from one of the manywheel Dockerfiles. Optional
+# environment variables read below: DOCKER_REGISTRY, GPU_ARCH_TYPE,
+# GPU_ARCH_VERSION, MANY_LINUX_VERSION, DOCKERFILE_SUFFIX, WITH_PUSH.
+
+set -euo pipefail
+
+TOPDIR=$(git rev-parse --show-toplevel)
+
+# Read $1 with an empty default so that a missing argument reaches the usage
+# message below instead of tripping `set -u` with "unbound variable".
+image="${1:-}"
+
+if [ -z "${image}" ]; then
+  echo "Usage: $0 IMAGE"
+  exit 1
+fi
+# Only shift once IMAGE is known to be present; the remaining arguments are
+# forwarded to `docker build`.
+shift
+
+DOCKER_IMAGE="pytorch/${image}"
+
+DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"
+
+# Optional settings; each defaults to empty except GPU_ARCH_TYPE (cpu)
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
+DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
+WITH_PUSH=${WITH_PUSH:-}
+
+case ${GPU_ARCH_TYPE} in
+    cpu)
+        TARGET=cpu_final
+        DOCKER_TAG=cpu
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cpu-manylinux_2_28)
+        TARGET=cpu_final
+        DOCKER_TAG=cpu
+        GPU_IMAGE=amd64/almalinux:8
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        MANY_LINUX_VERSION="2_28"
+        ;;
+    cpu-aarch64)
+        TARGET=final
+        DOCKER_TAG=cpu-aarch64
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
+        MANY_LINUX_VERSION="aarch64"
+        ;;
+    cpu-aarch64-2_28)
+        TARGET=final
+        DOCKER_TAG=cpu-aarch64
+        GPU_IMAGE=arm64v8/almalinux:8
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        MANY_LINUX_VERSION="2_28_aarch64"
+        ;;
+    cpu-cxx11-abi)
+        TARGET=final
+        DOCKER_TAG=cpu-cxx11-abi
+        GPU_IMAGE=""
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        MANY_LINUX_VERSION="cxx11-abi"
+        ;;
+    cpu-s390x)
+        TARGET=final
+        DOCKER_TAG=cpu-s390x
+        GPU_IMAGE=redhat/ubi9
+        DOCKER_GPU_BUILD_ARG=""
+        MANY_LINUX_VERSION="s390x"
+        ;;
+    cuda)
+        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        # Keep this up to date with the minimum version of CUDA we currently support
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cuda-manylinux_2_28)
+        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        GPU_IMAGE=amd64/almalinux:8
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
+        MANY_LINUX_VERSION="2_28"
+        ;;
+    cuda-aarch64)
+        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
+        MANY_LINUX_VERSION="aarch64"
+        DOCKERFILE_SUFFIX="_cuda_aarch64"
+        ;;
+    rocm)
+        TARGET=rocm_final
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
+        else
+            echo "ERROR: rocm regex failed"
+            exit 1
+        fi
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
+            PYTORCH_ROCM_ARCH+=";gfx942"
+        fi
+        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    xpu)
+        TARGET=xpu_final
+        DOCKER_TAG=xpu
+        GPU_IMAGE=amd64/almalinux:8
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        MANY_LINUX_VERSION="2_28"
+        ;;
+    *)
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
+        exit 1
+        ;;
+esac
+
+IMAGES=''
+
+# An explicitly set DOCKERFILE_SUFFIX wins; otherwise derive it from
+# MANY_LINUX_VERSION. With neither set, the plain `Dockerfile` is used.
+if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
+    DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
+fi
+(
+    set -x
+    # DOCKER_GPU_BUILD_ARG is a flat string of flags and is word-split on
+    # purpose. The caller's extra arguments are forwarded with "$@" so that
+    # arguments containing spaces are not re-split.
+    # shellcheck disable=SC2086
+    DOCKER_BUILDKIT=1 docker build \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --target "${TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        "$@" \
+        -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
+        "${TOPDIR}/.ci/docker/"
+)
+
+# When GITHUB_REF is not provided, fall back to the current branch ref or an
+# exact tag. `|| true` keeps errexit from aborting the script when HEAD is
+# detached and untagged; GITHUB_REF is then empty and the branch/SHA tags
+# below are skipped by the -n check.
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match || true)}
+# Last path component of the ref, e.g. refs/heads/main -> main
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+# Push only when explicitly requested with WITH_PUSH=true
+if [[ "${WITH_PUSH}" == true ]]; then
+    (
+        set -x
+        docker push "${DOCKER_IMAGE}"
+        if [[ -n ${GITHUB_REF} ]]; then
+            docker tag "${DOCKER_IMAGE}" "${DOCKER_IMAGE_BRANCH_TAG}"
+            docker tag "${DOCKER_IMAGE}" "${DOCKER_IMAGE_SHA_TAG}"
+            docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+            docker push "${DOCKER_IMAGE_SHA_TAG}"
+        fi
+    )
+fi
--- a/.ci/docker/manywheel/build_scripts/build.sh
+++ b/.ci/docker/manywheel/build_scripts/build.sh
@ -0,0 +1,131 @@
+#!/bin/bash
+# Top-level build script called from Dockerfile
+# Script used only in CD pipeline
+
+# Stop at any error, show all commands
+set -ex
+
+# openssl version to build, with expected sha256 hash of .tar.gz
+# archive
+OPENSSL_ROOT=openssl-1.1.1l
+OPENSSL_HASH=0b7a3e5e59c34827fe0c3a74b7ec8baef302b98fa80088d7f9153aa16fa76bd1
+DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
+PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
+CURL_ROOT=curl-7.73.0
+CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131
+AUTOCONF_ROOT=autoconf-2.69
+AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
+
+# Get build utilities
+MY_DIR=$(dirname "${BASH_SOURCE[0]}")
+source $MY_DIR/build_utils.sh
+
+# Install the toolchain and the CPython build dependencies. The s390x branch
+# uses apt; every other architecture uses yum.
+if [ "$(uname -m)" = "s390x" ] ; then
+    # Packages needed only while compiling Python; they are removed from the
+    # final image once Python has been built
+    PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev"
+
+    # Libraries the manylinux1 profile allows wheels to depend on
+    MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev"
+
+    # Development tools and libraries
+    apt install -y bzip2 make git patch unzip diffutils \
+        automake which file cmake \
+        linux-headers-virtual \
+        ${PYTHON_COMPILE_DEPS}
+else
+    # Packages needed only while compiling Python; they are removed from the
+    # final image once Python has been built
+    PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
+
+    # Libraries the manylinux1 profile allows wheels to depend on
+    MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
+
+    # Development tools and libraries (kernel headers matching the running kernel)
+    yum -y install bzip2 make git patch unzip bison yasm diffutils \
+        automake which file cmake28 \
+        kernel-devel-$(uname -r) \
+        ${PYTHON_COMPILE_DEPS}
+fi
+
+# Install newest autoconf
+# (built from the pinned AUTOCONF_ROOT tarball, verified against AUTOCONF_HASH)
+build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
+autoconf --version
+
+# Compile the latest Python releases.
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+build_openssl $OPENSSL_ROOT $OPENSSL_HASH
+/build_scripts/install_cpython.sh
+
+# The CPython 3.9 install is used below to run pip for the helper tools.
+PY39_BIN=/opt/python/cp39-cp39/bin
+
+# Our openssl doesn't know how to find the system CA trust store
+#   (https://github.com/pypa/manylinux/issues/53)
+# And it's not clear how up-to-date that is anyway
+# So let's just use the same one pip and everyone uses
+$PY39_BIN/pip install certifi
+# Expose certifi's CA bundle at a fixed path.
+ln -s $($PY39_BIN/python -c 'import certifi; print(certifi.where())') \
+      /opt/_internal/certs.pem
+# If you modify this line you also have to modify the versions in the
+# Dockerfiles:
+export SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# Install newest curl
+build_curl $CURL_ROOT $CURL_HASH
+# Only the curl executable is wanted: drop the installed headers, libraries
+# and pkg-config file again.
+rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc
+# Forget cached command locations so the `curl` calls below are looked up afresh.
+hash -r
+curl --version
+curl-config --features
+
+# Install patchelf 0.10 from the upstream release tarball
+# NOTE(review): `-k` disables TLS certificate verification and the checksum
+# check below is commented out (it also names a different tarball), so this
+# download is not integrity-checked. Confirm the correct sha256 for
+# patchelf-0.10.tar.gz and re-enable verification.
+curl -sLOk https://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.gz
+# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
+tar -xzf patchelf-0.10.tar.gz
+(cd patchelf-0.10 && ./configure && make && make install)
+rm -rf patchelf-0.10.tar.gz patchelf-0.10
+
+# Install latest pypi release of auditwheel
+$PY39_BIN/pip install auditwheel
+# Make auditwheel reachable as /usr/local/bin/auditwheel.
+ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel
+
+# Clean up development headers and other unnecessary stuff for
+# final image
+if [ "$(uname -m)" != "s390x" ] ; then
+    yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
+        avahi freetype bitstream-vera-fonts \
+        ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
+    yum -y install ${MANYLINUX1_DEPS}
+    yum -y clean all > /dev/null 2>&1
+    yum list installed
+else
+    apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
+fi
+# we don't need libpython*.a, and they're many megabytes
+find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
+# Strip what we can -- and ignore errors, because this just attempts to strip
+# *everything*, including non-ELF files:
+find /opt/_internal -type f -print0 \
+    | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
+# We do not need the Python test suites, or indeed the precompiled .pyc and
+# .pyo files. Partially cribbed from:
+#    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
+#
+# The two alternatives are wrapped in one outer group so that -print0 applies
+# to both of them; without the group, the implicit "and" binds tighter than
+# -o and only the .pyc/.pyo branch was ever printed. -depth lists a
+# directory after its contents, so deleting it cannot disturb the traversal,
+# and `rm -rf` is required because the matches now include directories.
+find /opt/_internal -depth \
+     \( \
+        \( -type d -a -name test -o -name tests \) \
+     -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
+     \) -print0 | xargs -0 rm -rf
+
+# Run both check scripts with every installed interpreter. `set -e` at the
+# top of this script makes the first failing check abort the image build.
+for PYTHON in /opt/python/*/bin/python; do
+    # Smoke test to make sure that our Pythons work, and do indeed detect as
+    # being manylinux compatible:
+    # (expansions are quoted so unusual characters in a path cannot split
+    # the command line)
+    "$PYTHON" "$MY_DIR/manylinux1-check.py"
+    # Make sure that SSL cert checking works
+    "$PYTHON" "$MY_DIR/ssl-check.py"
+done
+
+# Fix libc headers to remain compatible with C99 compilers.
+# Rewrites every `extern inline` spelling under /usr/include in place so the
+# declarations carry the gnu_inline attribute.
+find /usr/include/ -type f -exec sed -i 's/\bextern _*inline_*\b/extern __inline __attribute__ ((__gnu_inline__))/g' {} +
+
+# Now we can delete our built SSL
+rm -rf /usr/local/ssl
--- a/.ci/docker/manywheel/build_scripts/build_utils.sh
+++ b/.ci/docker/manywheel/build_scripts/build_utils.sh
@ -0,0 +1,91 @@
+#!/bin/bash
+# Helper utilities for build
+# Script used only in CD pipeline
+
+OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
+CURL_DOWNLOAD_URL=https://curl.askapache.com/download
+
+AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf
+
+
+function check_var {
+    if [ -z "$1" ]; then
+        echo "required variable not defined"
+        exit 1
+    fi
+}
+
+
+function do_openssl_build {
+    ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+
+function check_sha256sum {
+    local fname=$1
+    check_var ${fname}
+    local sha256=$2
+    check_var ${sha256}
+
+    echo "${sha256}  ${fname}" > ${fname}.sha256
+    sha256sum -c ${fname}.sha256
+    rm -f ${fname}.sha256
+}
+
+
+function build_openssl {
+    local openssl_fname=$1
+    check_var ${openssl_fname}
+    local openssl_sha256=$2
+    check_var ${openssl_sha256}
+    check_var ${OPENSSL_DOWNLOAD_URL}
+    curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz
+    check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256}
+    tar -xzf ${openssl_fname}.tar.gz
+    (cd ${openssl_fname} && do_openssl_build)
+    rm -rf ${openssl_fname} ${openssl_fname}.tar.gz
+}
+
+
+function do_curl_build {
+    LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+
+function build_curl {
+    local curl_fname=$1
+    check_var ${curl_fname}
+    local curl_sha256=$2
+    check_var ${curl_sha256}
+    check_var ${CURL_DOWNLOAD_URL}
+    curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2
+    check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256}
+    tar -jxf ${curl_fname}.tar.bz2
+    (cd ${curl_fname} && do_curl_build)
+    rm -rf ${curl_fname} ${curl_fname}.tar.bz2
+}
+
+
+function do_standard_install {
+    ./configure > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+
+function build_autoconf {
+    local autoconf_fname=$1
+    check_var ${autoconf_fname}
+    local autoconf_sha256=$2
+    check_var ${autoconf_sha256}
+    check_var ${AUTOCONF_DOWNLOAD_URL}
+    curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz
+    check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256}
+    tar -zxf ${autoconf_fname}.tar.gz
+    (cd ${autoconf_fname} && do_standard_install)
+    rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz
+}
--- a/.ci/docker/manywheel/build_scripts/manylinux1-check.py
+++ b/.ci/docker/manywheel/build_scripts/manylinux1-check.py
@ -0,0 +1,60 @@
+# Logic copied from PEP 513
+
+
+def is_manylinux1_compatible():
+    # Only Linux, and only x86-64 / i686
+    from distutils.util import get_platform
+
+    if get_platform() not in ["linux-x86_64", "linux-i686", "linux-s390x"]:
+        return False
+
+    # Check for presence of _manylinux module
+    try:
+        import _manylinux
+
+        return bool(_manylinux.manylinux1_compatible)
+    except (ImportError, AttributeError):
+        # Fall through to heuristic check below
+        pass
+
+    # Check glibc version. CentOS 5 uses glibc 2.5.
+    return have_compatible_glibc(2, 5)
+
+
+def have_compatible_glibc(major, minimum_minor):
+    import ctypes
+
+    process_namespace = ctypes.CDLL(None)
+    try:
+        gnu_get_libc_version = process_namespace.gnu_get_libc_version
+    except AttributeError:
+        # Symbol doesn't exist -> therefore, we are not linked to
+        # glibc.
+        return False
+
+    # Call gnu_get_libc_version, which returns a string like "2.5".
+    gnu_get_libc_version.restype = ctypes.c_char_p
+    version_str = gnu_get_libc_version()
+    # py2 / py3 compatibility:
+    if not isinstance(version_str, str):
+        version_str = version_str.decode("ascii")
+
+    # Parse string and check against requested version.
+    version = [int(piece) for piece in version_str.split(".")]
+    assert len(version) == 2
+    if major != version[0]:
+        return False
+    if minimum_minor > version[1]:
+        return False
+    return True
+
+
+import sys
+
+
+if is_manylinux1_compatible():
+    print(f"{sys.executable} is manylinux1 compatible")
+    sys.exit(0)
+else:
+    print(f"{sys.executable} is NOT manylinux1 compatible")
+    sys.exit(1)
--- a/.ci/docker/manywheel/build_scripts/ssl-check.py
+++ b/.ci/docker/manywheel/build_scripts/ssl-check.py
@ -0,0 +1,35 @@
+# cf. https://github.com/pypa/manylinux/issues/53
+
+GOOD_SSL = "https://google.com"
+BAD_SSL = "https://self-signed.badssl.com"
+
+import sys
+
+
+print("Testing SSL certificate checking for Python:", sys.version)
+
+if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
+    print("This version never checks SSL certs; skipping tests")
+    sys.exit(0)
+
+if sys.version_info[0] >= 3:
+    from urllib.request import urlopen
+
+    EXC = OSError
+else:
+    from urllib import urlopen
+
+    EXC = IOError
+
+print(f"Connecting to {GOOD_SSL} should work")
+urlopen(GOOD_SSL)
+print("...it did, yay.")
+
+print(f"Connecting to {BAD_SSL} should fail")
+try:
+    urlopen(BAD_SSL)
+    # If we get here then we failed:
+    print("...it DIDN'T!!!!!11!!1one!")
+    sys.exit(1)
+except EXC:
+    print("...it did, yay.")
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -9,7 +9,9 @@ self-hosted-runner:
    - linux.large
    - linux.2xlarge
    - linux.4xlarge
+    - linux.9xlarge.ephemeral
    - linux.12xlarge
+    - linux.12xlarge.ephemeral
    - linux.24xlarge
    - linux.arm64.2xlarge
    - linux.4xlarge.nvidia.gpu
--- a/.github/workflows/build-conda-images.yml
+++ b/.github/workflows/build-conda-images.yml
@ -0,0 +1,64 @@
+name: Build conda docker images
+
+# Triggers: manual dispatch, pushes to main / release branches / RC tags, and
+# pull requests. Branch pushes and pull requests are limited to changes that
+# can affect the conda builder image.
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+    paths:
+      # Paths are relative to the repository root. The image sources live
+      # under .ci/docker/ (see docker-build-dir and build.sh in the job below).
+      - '.ci/docker/conda/*'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-conda-images.yml
+  pull_request:
+    paths:
+      - '.ci/docker/conda/*'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-conda-images.yml
+
+# Workflow-wide environment shared by every job.
+env:
+  DOCKER_REGISTRY: "docker.io"
+  DOCKER_BUILDKIT: 1
+  # Credentials used by the `docker login` step.
+  DOCKER_ID: ${{ secrets.DOCKER_ID }}
+  DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+  # Evaluates to 'true' only for push events on main or a ref starting with
+  # refs/heads/release; pull requests, tag pushes and manual runs get 'false'.
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+
+# One group per workflow and per pull request (or per commit sha when there
+# is no pull request), kept separate for manually dispatched runs. A newer
+# run in the same group cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  # Builds the conda builder image once per matrix entry (three CUDA versions
+  # plus a CPU-only variant). Exactly one of two paths runs, selected by
+  # WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/conda/build.sh
+  build-docker:
+    runs-on: linux.9xlarge.ephemeral
+    strategy:
+      matrix:
+        cuda_version: ["11.8", "12.1", "12.4", "cpu"]
+    env:
+      CUDA_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            # Resolves to conda-builder-cpu or conda-builder-cuda<version>.
+            docker-image-name: conda-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
+            docker-build-dir:  .ci/docker/conda
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      # The argument resolves to conda-builder:cpu or conda-builder:cuda<version>.
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/conda/build.sh conda-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -0,0 +1,120 @@
+name: Build libtorch docker images
+
+# Triggers: manual dispatch, pushes to main / release branches / RC tags, and
+# pull requests. Branch pushes and pull requests are limited to changes that
+# can affect the libtorch builder images.
+on:
+  # Allows manual runs; the concurrency group of this workflow already
+  # distinguishes workflow_dispatch events.
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+    paths:
+      - '.ci/docker/libtorch/*'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-libtorch-images.yml
+  pull_request:
+    paths:
+      - '.ci/docker/libtorch/*'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-libtorch-images.yml
+
+# Workflow-wide environment shared by every job.
+env:
+  DOCKER_REGISTRY: "docker.io"
+  DOCKER_BUILDKIT: 1
+  # Credentials used by the `docker login` steps.
+  DOCKER_ID: ${{ secrets.DOCKER_ID }}
+  DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+  # Evaluates to 'true' only for push events on main or a ref starting with
+  # refs/heads/release; pull requests and tag pushes get 'false'.
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+
+# One group per workflow and per pull request (or per commit sha when there
+# is no pull request). A newer run in the same group cancels the one still
+# in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  # libtorch builder image for each CUDA version in the matrix. Exactly one
+  # of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/libtorch/build.sh
+  build-docker-cuda:
+    runs-on: linux.9xlarge.ephemeral
+    strategy:
+      matrix:
+        cuda_version: ["12.4", "12.1", "11.8"]
+    env:
+      GPU_ARCH_TYPE: cuda
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}}
+            docker-build-dir:  .ci/docker/libtorch
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
+  # libtorch builder image for each ROCm version in the matrix. Exactly one
+  # of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/libtorch/build.sh
+  build-docker-rocm:
+    runs-on: linux.9xlarge.ephemeral
+    strategy:
+      matrix:
+        rocm_version: ["6.0", "6.1"]
+    env:
+      GPU_ARCH_TYPE: rocm
+      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}}
+            docker-build-dir:  .ci/docker/libtorch
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
+  # CPU-only libtorch builder image. No GPU_ARCH_TYPE is set for this job.
+  # NOTE(review): presumably build.sh defaults to the CPU flavour - confirm.
+  # Exactly one of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/libtorch/build.sh
+  build-docker-cpu:
+    runs-on: linux.9xlarge.ephemeral
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: libtorch-cxx11-builder-cpu
+            docker-build-dir:  .ci/docker/libtorch
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -0,0 +1,322 @@
+name: Build manywheel docker images
+
+# Triggers: manual dispatch, pushes to main / release branches / RC tags, and
+# pull requests. Branch pushes and pull requests are limited to changes that
+# can affect the manywheel builder images.
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+    paths:
+      # `**` rather than `*`: a single `*` does not cross "/", so files in
+      # subdirectories such as .ci/docker/manywheel/build_scripts/ would not
+      # trigger a rebuild.
+      - '.ci/docker/manywheel/**'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-manywheel-images.yml
+  pull_request:
+    paths:
+      - '.ci/docker/manywheel/**'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-manywheel-images.yml
+
+
+# Workflow-wide environment shared by every job.
+env:
+  DOCKER_REGISTRY: "docker.io"
+  DOCKER_BUILDKIT: 1
+  # Credentials used by the `docker login` steps.
+  DOCKER_ID: ${{ secrets.DOCKER_ID }}
+  DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+  # Evaluates to 'true' only for push events on main or a ref starting with
+  # refs/heads/release; pull requests, tag pushes and manual runs get 'false'.
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+
+# One group per workflow and per pull request (or per commit sha when there
+# is no pull request), kept separate for manually dispatched runs. A newer
+# run in the same group cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  # manylinux builder image for each CUDA version in the matrix. Exactly one
+  # of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cuda:
+    runs-on: linux.9xlarge.ephemeral
+    strategy:
+      matrix:
+        cuda_version: ["12.4", "12.1", "11.8"]
+    env:
+      GPU_ARCH_TYPE: cuda
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Purge tools folder (free space for build)
+        run: rm -rf /opt/hostedtoolcache
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux-builder-cuda${{matrix.cuda_version}}
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux-builder:cuda${{matrix.cuda_version}}
+  # NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
+  # manylinux_2_28 builder image for each CUDA version in the matrix. Exactly
+  # one of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cuda-manylinux_2_28:
+    runs-on: linux.9xlarge.ephemeral
+    strategy:
+      matrix:
+        cuda_version: ["12.4", "12.1", "11.8"]
+    env:
+      GPU_ARCH_TYPE: cuda-manylinux_2_28
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Purge tools folder (free space for build)
+        run: rm -rf /opt/hostedtoolcache
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}}
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
+  # aarch64 manylinux builder image with CUDA, built on an arm64 runner.
+  # Exactly one of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cuda-aarch64:
+    runs-on: linux.arm64.2xlarge
+    strategy:
+      matrix:
+        cuda_version: ["12.4"]
+    env:
+      GPU_ARCH_TYPE: cuda-aarch64
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      # Same checkout action and options as every other job in this workflow,
+      # including the CPU aarch64 jobs on the same runner type.
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}}
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
+  # manylinux builder image for each ROCm version in the matrix. Exactly one
+  # of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-rocm:
+    runs-on: linux.9xlarge.ephemeral
+    strategy:
+      matrix:
+        rocm_version: ["6.0", "6.1"]
+    env:
+      GPU_ARCH_TYPE: rocm
+      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux-builder-rocm${{matrix.rocm_version}}
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
+  # CPU-only manylinux builder image. No GPU_ARCH_TYPE is set for this job.
+  # NOTE(review): presumably build.sh defaults to the CPU flavour - confirm.
+  # Exactly one of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cpu:
+    runs-on: linux.9xlarge.ephemeral
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux-builder-cpu
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux-builder:cpu
+  # CPU-only manylinux_2_28 builder image. Exactly one of two paths runs,
+  # selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cpu-manylinux_2_28:
+    runs-on: linux.9xlarge.ephemeral
+    env:
+      GPU_ARCH_TYPE: cpu-manylinux_2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux2_28-builder-cpu
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
+  # CPU-only aarch64 manylinux builder image, built on an arm64 runner.
+  # Exactly one of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cpu-aarch64:
+    runs-on: linux.arm64.2xlarge
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinuxaarch64-builder-cpu-aarch64
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
+  # CPU-only aarch64 manylinux_2_28 builder image, built on an arm64 runner.
+  # Exactly one of two paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cpu-aarch64-2_28:
+    runs-on: linux.arm64.2xlarge
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64-2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
+  # CPU-only builder image for the cxx11-abi flavour. Exactly one of two
+  # paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-cpu-cxx11-abi:
+    runs-on: linux.9xlarge.ephemeral
+    env:
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinuxcxx11-abi-builder-cpu-cxx11-abi
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
+  # manylinux_2_28 builder image for the xpu flavour. Exactly one of two
+  # paths runs, selected by WITH_PUSH:
+  #   'false' -> build and push through the calculate-docker-image action
+  #   'true'  -> docker login, then .ci/docker/manywheel/build.sh
+  build-docker-xpu:
+    runs-on: linux.9xlarge.ephemeral
+    env:
+      GPU_ARCH_TYPE: xpu
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+            docker-image-name: manylinux2_28-builder-xpu
+            docker-build-dir:  .ci/docker/manywheel
+            always-rebuild: true
+            push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        run: |
+          .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -417,6 +417,7 @@ exclude_patterns = [
    'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
    'test/cpp/jit/upgrader_models/*.ptl',
    'test/cpp/jit/upgrader_models/*.ptl.ff',
+    '.ci/docker/common/install_rocm_drm.sh',
    '.lintrunner.toml',
 ]
 command = [