diff --git a/.ci/docker/common/install_miopen.sh b/.ci/docker/common/install_miopen.sh index f531b03f8ca..3dbc67b90ab 100644 --- a/.ci/docker/common/install_miopen.sh +++ b/.ci/docker/common/install_miopen.sh @@ -16,7 +16,7 @@ case "$ID" in ubuntu) IS_UBUNTU=1 ;; - centos) + centos|almalinux) IS_UBUNTU=0 ;; *) @@ -43,12 +43,6 @@ else fi ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH)) -# Install custom MIOpen + COMgr for ROCm >= 4.0.1 -if [[ $ROCM_INT -lt 40001 ]]; then - echo "ROCm version < 4.0.1; will not install custom MIOpen" - exit 0 -fi - # Function to retry functions that sometimes timeout or have flaky failures retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) @@ -66,58 +60,27 @@ else ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}" fi -# MIOPEN_USE_HIP_KERNELS is a Workaround for COMgr issues MIOPEN_CMAKE_COMMON_FLAGS=" -DMIOPEN_USE_COMGR=ON -DMIOPEN_BUILD_DRIVER=OFF " -# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version -if [[ $ROCM_INT -ge 60300 ]]; then - echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source" - exit 0 -elif [[ $ROCM_INT -ge 60204 ]] && [[ $ROCM_INT -lt 60300 ]]; then - echo "ROCm 6.2.4+ MIOpen does not need any patches, do not build from source" - exit 0 -elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then +if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then MIOPEN_BRANCH="release/rocm-rel-6.2-staging" -elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then - echo "ROCm 6.1 MIOpen does not need any patches, do not build from source" - exit 0 -elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then - echo "ROCm 6.0 MIOpen does not need any patches, do not build from source" - exit 0 -elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then - echo "ROCm 5.7 MIOpen does not need any patches, do not build from source" - exit 0 -elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then - MIOPEN_BRANCH="release/rocm-rel-5.6-staging" -elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then - MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11" -elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off" - MIOPEN_BRANCH="release/rocm-rel-5.4-staging" -elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off" - MIOPEN_BRANCH="release/rocm-rel-5.3-staging" -elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off" - MIOPEN_BRANCH="release/rocm-rel-5.2-staging" -elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36" - MIOPEN_BRANCH="release/rocm-rel-5.1-staging" -elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36" - MIOPEN_BRANCH="release/rocm-rel-5.0-staging" else - echo "Unhandled ROCM_VERSION ${ROCM_VERSION}" - exit 1 + echo "ROCm ${ROCM_VERSION} does not need any patches, do not build from source" + exit 0 fi if [[ ${IS_UBUNTU} == 1 ]]; then apt-get remove -y miopen-hip else - yum remove -y miopen-hip + # Workaround since almalinux manylinux image already has this and cget doesn't like that + rm -rf /usr/local/lib/pkgconfig/sqlite3.pc + + # Versioned package name needs regex match + # Use --noautoremove to prevent other rocm packages from being uninstalled + yum remove -y miopen-hip* --noautoremove fi git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH} @@ -125,16 +88,7 @@ pushd MIOpen # remove .git to save disk space since CI runner was running out rm -rf .git # Don't build CK to save docker build time -if [[ $ROCM_INT -ge 60200 ]]; then - sed -i '/composable_kernel/d' requirements.txt -fi -# Don't build MLIR to save docker build time -# since we are disabling MLIR backend for MIOpen anyway -if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then - sed -i '/rocMLIR/d' requirements.txt -elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then - sed -i '/llvm-project-mlir/d' requirements.txt -fi +sed -i '/composable_kernel/d' requirements.txt ## MIOpen minimum requirements cmake -P install_deps.cmake --minimum @@ -156,7 +110,7 @@ cd build PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \ ${MIOPEN_CMAKE_COMMON_FLAGS} \ ${MIOPEN_CMAKE_DB_FLAGS} \ - -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}" + -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}" make MIOpen -j $(nproc) # Build MIOpen package diff --git a/.ci/docker/common/install_rocm_drm.sh b/.ci/docker/common/install_rocm_drm.sh index a6c73560c1a..94cb9860779 100644 --- a/.ci/docker/common/install_rocm_drm.sh +++ b/.ci/docker/common/install_rocm_drm.sh @@ -12,7 +12,7 @@ case "$ID" in apt-get install -y libpciaccess-dev pkg-config apt-get clean ;; - centos) + centos|almalinux) yum install -y libpciaccess-devel pkgconfig ;; *) diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index fc3d49a309b..5ab15a56352 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -3,6 +3,18 @@ set -ex +# Magma build scripts need `python` +ln -sf /usr/bin/python3 /usr/bin/python + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +case "$ID" in + almalinux) + yum install -y gcc-gfortran + ;; + *) + echo "No preinstalls to build magma..." + ;; +esac MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index 6903bdc0c74..70b8e07ed31 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -1,5 +1,4 @@ # syntax = docker/dockerfile:experimental -ARG ROCM_VERSION=3.7 ARG BASE_CUDA_VERSION=11.8 ARG GPU_IMAGE=amd64/almalinux:8 FROM quay.io/pypa/manylinux_2_28_x86_64 as base @@ -130,10 +129,10 @@ RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ done; -# cmake-3.18.4 from pip +# cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ python3 -mpip install cmake==3.18.4 && \ - ln -s /usr/local/bin/cmake /usr/bin/cmake3 + ln -sf /usr/local/bin/cmake /usr/bin/cmake3 FROM cpu_final as cuda_final RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} @@ -142,17 +141,22 @@ COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH - -FROM common as rocm_final -ARG ROCM_VERSION=3.7 -# Install ROCm -ADD ./common/install_rocm.sh install_rocm.sh -RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh -# cmake is already installed inside the rocm base image, but both 2 and 3 exist -# cmake3 is needed for the later MIOpen custom build, so that step is last. -RUN yum install -y cmake3 && \ - rm -f /usr/bin/cmake && \ - ln -s /usr/bin/cmake3 /usr/bin/cmake +FROM cpu_final as rocm_final +ARG ROCM_VERSION=6.0 +ARG PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} +# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path, +# below workaround helps avoid error +ENV ROCM_PATH /opt/rocm +# cmake-3.28.4 from pip to get enable_language(HIP) +# and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker +RUN python3 -m pip install --upgrade pip && \ + python3 -mpip install cmake==3.28.4 +ADD ./common/install_rocm_drm.sh install_rocm_drm.sh +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +ENV MKLROOT /opt/intel +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh ADD ./common/install_miopen.sh install_miopen.sh RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 8ee547344dd..b75a809787e 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -87,11 +87,15 @@ case ${GPU_ARCH_TYPE} in MANY_LINUX_VERSION="aarch64" DOCKERFILE_SUFFIX="_cuda_aarch64" ;; - rocm) + rocm|rocm-manylinux_2_28) TARGET=rocm_final DOCKER_TAG=rocm${GPU_ARCH_VERSION} GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" + if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then + MANY_LINUX_VERSION="2_28" + GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete + fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100" ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)" if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0})) @@ -99,9 +103,6 @@ case ${GPU_ARCH_TYPE} in echo "ERROR: rocm regex failed" exit 1 fi - if [[ $ROCM_VERSION_INT -ge 60000 ]]; then - PYTORCH_ROCM_ARCH+=";gfx942" - fi DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9" ;; xpu) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index e5176342516..d1b72f2fb24 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -172,7 +172,7 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}} - build-docker-rocm: + build-docker-rocm-manylinux_2_28: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" @@ -180,7 +180,7 @@ jobs: matrix: rocm_version: ["6.1", "6.2.4"] env: - GPU_ARCH_TYPE: rocm + GPU_ARCH_TYPE: rocm-manylinux_2_28 GPU_ARCH_VERSION: ${{ matrix.rocm_version }} steps: - name: Checkout PyTorch @@ -191,7 +191,7 @@ jobs: if: env.WITH_PUSH == 'false' uses: pytorch/test-infra/.github/actions/calculate-docker-image@main with: - docker-image-name: manylinux-builder-rocm${{matrix.rocm_version}} + docker-image-name: manylinux2_28-builder-rocm${{matrix.rocm_version}} docker-build-dir: .ci/docker/manywheel always-rebuild: true push: true @@ -213,7 +213,7 @@ jobs: max_attempts: 3 retry_wait_seconds: 90 command: | - .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}} + .ci/docker/manywheel/build.sh manylinux2_28-builder:rocm${{matrix.rocm_version}} build-docker-cpu: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type