Upgrade ROCm wheels to manylinux2_28 - 1 of 2 (docker images) (#140681)

Fixes #140631

Highlights:
* Use `cpu_final` base for ROCm in `.ci/docker/manywheel/Dockerfile_2_28`
* Clean up `install_miopen.sh` to remove old ROCm references
* Install `gcc-gfortran` package to build magma for ROCm on almalinux

Needs builder PR https://github.com/pytorch/builder/pull/2043 (merged) so that the expected GCC_ABI value is updated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140681
Approved by: https://github.com/jeffdaily
This commit is contained in:
Jithun Nair 2024-11-26 00:10:39 +00:00 committed by PyTorch MergeBot
parent 8f5ce865a4
commit 9ccbd84316
6 changed files with 53 additions and 82 deletions

View File

@ -16,7 +16,7 @@ case "$ID" in
ubuntu) ubuntu)
IS_UBUNTU=1 IS_UBUNTU=1
;; ;;
centos) centos|almalinux)
IS_UBUNTU=0 IS_UBUNTU=0
;; ;;
*) *)
@ -43,12 +43,6 @@ else
fi fi
ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH)) ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
# Install custom MIOpen + COMgr for ROCm >= 4.0.1
if [[ $ROCM_INT -lt 40001 ]]; then
echo "ROCm version < 4.0.1; will not install custom MIOpen"
exit 0
fi
# Function to retry functions that sometimes timeout or have flaky failures # Function to retry functions that sometimes timeout or have flaky failures
retry () { retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
@ -66,58 +60,27 @@ else
ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}" ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
fi fi
# MIOPEN_USE_HIP_KERNELS is a Workaround for COMgr issues
MIOPEN_CMAKE_COMMON_FLAGS=" MIOPEN_CMAKE_COMMON_FLAGS="
-DMIOPEN_USE_COMGR=ON -DMIOPEN_USE_COMGR=ON
-DMIOPEN_BUILD_DRIVER=OFF -DMIOPEN_BUILD_DRIVER=OFF
" "
# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then
if [[ $ROCM_INT -ge 60300 ]]; then
echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 60204 ]] && [[ $ROCM_INT -lt 60300 ]]; then
echo "ROCm 6.2.4+ MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then
MIOPEN_BRANCH="release/rocm-rel-6.2-staging" MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
else else
echo "Unhandled ROCM_VERSION ${ROCM_VERSION}" echo "ROCm ${ROCM_VERSION} does not need any patches, do not build from source"
exit 1 exit 0
fi fi
if [[ ${IS_UBUNTU} == 1 ]]; then if [[ ${IS_UBUNTU} == 1 ]]; then
apt-get remove -y miopen-hip apt-get remove -y miopen-hip
else else
yum remove -y miopen-hip # Workaround since almalinux manylinux image already has this and cget doesn't like that
rm -rf /usr/local/lib/pkgconfig/sqlite3.pc
# Versioned package name needs regex match
# Use --noautoremove to prevent other rocm packages from being uninstalled
yum remove -y miopen-hip* --noautoremove
fi fi
git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH} git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
@ -125,16 +88,7 @@ pushd MIOpen
# remove .git to save disk space since CI runner was running out # remove .git to save disk space since CI runner was running out
rm -rf .git rm -rf .git
# Don't build CK to save docker build time # Don't build CK to save docker build time
if [[ $ROCM_INT -ge 60200 ]]; then sed -i '/composable_kernel/d' requirements.txt
sed -i '/composable_kernel/d' requirements.txt
fi
# Don't build MLIR to save docker build time
# since we are disabling MLIR backend for MIOpen anyway
if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
sed -i '/rocMLIR/d' requirements.txt
elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
sed -i '/llvm-project-mlir/d' requirements.txt
fi
## MIOpen minimum requirements ## MIOpen minimum requirements
cmake -P install_deps.cmake --minimum cmake -P install_deps.cmake --minimum
@ -156,7 +110,7 @@ cd build
PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \ PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
${MIOPEN_CMAKE_COMMON_FLAGS} \ ${MIOPEN_CMAKE_COMMON_FLAGS} \
${MIOPEN_CMAKE_DB_FLAGS} \ ${MIOPEN_CMAKE_DB_FLAGS} \
-DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}" -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}"
make MIOpen -j $(nproc) make MIOpen -j $(nproc)
# Build MIOpen package # Build MIOpen package

View File

@ -12,7 +12,7 @@ case "$ID" in
apt-get install -y libpciaccess-dev pkg-config apt-get install -y libpciaccess-dev pkg-config
apt-get clean apt-get clean
;; ;;
centos) centos|almalinux)
yum install -y libpciaccess-devel pkgconfig yum install -y libpciaccess-devel pkgconfig
;; ;;
*) *)

View File

@ -3,6 +3,18 @@
set -ex set -ex
# Magma build scripts need `python`
ln -sf /usr/bin/python3 /usr/bin/python
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
almalinux)
yum install -y gcc-gfortran
;;
*)
echo "No preinstalls to build magma..."
;;
esac
MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}

View File

@ -1,5 +1,4 @@
# syntax = docker/dockerfile:experimental # syntax = docker/dockerfile:experimental
ARG ROCM_VERSION=3.7
ARG BASE_CUDA_VERSION=11.8 ARG BASE_CUDA_VERSION=11.8
ARG GPU_IMAGE=amd64/almalinux:8 ARG GPU_IMAGE=amd64/almalinux:8
FROM quay.io/pypa/manylinux_2_28_x86_64 as base FROM quay.io/pypa/manylinux_2_28_x86_64 as base
@ -130,10 +129,10 @@ RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
done; done;
# cmake-3.18.4 from pip # cmake-3.18.4 from pip; force in case cmake3 already exists
RUN yum install -y python3-pip && \ RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \ python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake3 ln -sf /usr/local/bin/cmake /usr/bin/cmake3
FROM cpu_final as cuda_final FROM cpu_final as cuda_final
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
@ -142,17 +141,22 @@ COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH ENV PATH=/usr/local/cuda/bin:$PATH
FROM cpu_final as rocm_final
FROM common as rocm_final ARG ROCM_VERSION=6.0
ARG ROCM_VERSION=3.7 ARG PYTORCH_ROCM_ARCH
# Install ROCm ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ADD ./common/install_rocm.sh install_rocm.sh # Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh # below workaround helps avoid error
# cmake is already installed inside the rocm base image, but both 2 and 3 exist ENV ROCM_PATH /opt/rocm
# cmake3 is needed for the later MIOpen custom build, so that step is last. # cmake-3.28.4 from pip to get enable_language(HIP)
RUN yum install -y cmake3 && \ # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
rm -f /usr/bin/cmake && \ RUN python3 -m pip install --upgrade pip && \
ln -s /usr/bin/cmake3 /usr/bin/cmake python3 -mpip install cmake==3.28.4
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
ENV MKLROOT /opt/intel
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

View File

@ -87,11 +87,15 @@ case ${GPU_ARCH_TYPE} in
MANY_LINUX_VERSION="aarch64" MANY_LINUX_VERSION="aarch64"
DOCKERFILE_SUFFIX="_cuda_aarch64" DOCKERFILE_SUFFIX="_cuda_aarch64"
;; ;;
rocm) rocm|rocm-manylinux_2_28)
TARGET=rocm_final TARGET=rocm_final
DOCKER_TAG=rocm${GPU_ARCH_VERSION} DOCKER_TAG=rocm${GPU_ARCH_VERSION}
GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
MANY_LINUX_VERSION="2_28"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100"
ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)" ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0})) ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
@ -99,9 +103,6 @@ case ${GPU_ARCH_TYPE} in
echo "ERROR: rocm regex failed" echo "ERROR: rocm regex failed"
exit 1 exit 1
fi fi
if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
PYTORCH_ROCM_ARCH+=";gfx942"
fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9" DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9"
;; ;;
xpu) xpu)

View File

@ -172,7 +172,7 @@ jobs:
retry_wait_seconds: 90 retry_wait_seconds: 90
command: | command: |
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}} .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
build-docker-rocm: build-docker-rocm-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
@ -180,7 +180,7 @@ jobs:
matrix: matrix:
rocm_version: ["6.1", "6.2.4"] rocm_version: ["6.1", "6.2.4"]
env: env:
GPU_ARCH_TYPE: rocm GPU_ARCH_TYPE: rocm-manylinux_2_28
GPU_ARCH_VERSION: ${{ matrix.rocm_version }} GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
steps: steps:
- name: Checkout PyTorch - name: Checkout PyTorch
@ -191,7 +191,7 @@ jobs:
if: env.WITH_PUSH == 'false' if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with: with:
docker-image-name: manylinux-builder-rocm${{matrix.rocm_version}} docker-image-name: manylinux2_28-builder-rocm${{matrix.rocm_version}}
docker-build-dir: .ci/docker/manywheel docker-build-dir: .ci/docker/manywheel
always-rebuild: true always-rebuild: true
push: true push: true
@ -213,7 +213,7 @@ jobs:
max_attempts: 3 max_attempts: 3
retry_wait_seconds: 90 retry_wait_seconds: 90
command: | command: |
.ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}} .ci/docker/manywheel/build.sh manylinux2_28-builder:rocm${{matrix.rocm_version}}
build-docker-cpu: build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type needs: get-label-type