[ROCm] Upgrade ROCm CI to ROCm6.4 (#151368)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151368
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
This commit is contained in:
Jithun Nair 2025-05-02 17:21:18 +00:00 committed by PyTorch MergeBot
parent f65fb0a23d
commit 844842dfbf
23 changed files with 195 additions and 120 deletions

View File

@ -241,11 +241,11 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-rocm-n-1-py3)
pytorch-linux-jammy-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.2.4
ROCM_VERSION=6.3
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -254,11 +254,11 @@ case "$image" in
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-rocm-n-py3)
pytorch-linux-jammy-rocm-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.3
ROCM_VERSION=6.4
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes

View File

@ -66,17 +66,25 @@ EOF
done
# ROCm 6.3 had a regression where initializing static code objects had significant overhead
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]] || [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
HIP_BRANCH=rocm-6.3.x
VER_STR=6.3
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
VER_STR=6.4
fi
# clr build needs CppHeaderParser but can only find it using conda's python
/opt/conda/bin/python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b rocm-6.3.x
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}-statco-hotfix
mkdir -p clr/build
pushd clr/build
cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
make -j
cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
popd
rm -rf HIP clr
fi

View File

@ -5,6 +5,12 @@ description: Set up ROCm host for CI
runs:
using: composite
steps:
- name: Runner ROCm version
if: always()
shell: bash
run: |
dpkg -l | grep -E " rocm"
- name: Stop all running docker containers
if: always()
shell: bash

View File

@ -61,8 +61,8 @@ jobs:
pytorch-linux-focal-py3.11-clang10,
pytorch-linux-focal-py3.12-clang10,
pytorch-linux-focal-py3.13-clang10,
pytorch-linux-focal-rocm-n-1-py3,
pytorch-linux-focal-rocm-n-py3,
pytorch-linux-jammy-rocm-n-1-py3,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,

View File

@ -41,7 +41,7 @@ jobs:
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: pytorch-linux-focal-rocm-n-py3
docker-image-name: pytorch-linux-jammy-rocm-n-py3
push: false
- name: Pull docker image

View File

@ -78,13 +78,13 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm-py3_10-inductor-benchmark-build:
linux-jammy-rocm-py3_10-inductor-benchmark-build:
if: github.repository_owner == 'pytorch'
name: rocm-py3_10-inductor-benchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm-py3_10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3_10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
@ -102,18 +102,18 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-inductor-benchmark-test:
linux-jammy-rocm-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
name: rocm-py3_10-inductor-benchmark-test
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm-py3_10-inductor-benchmark-build
needs: linux-jammy-rocm-py3_10-inductor-benchmark-build
with:
build-environment: linux-focal-rocm-py3_10
build-environment: linux-jammy-rocm-py3_10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.linux-focal-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true

View File

@ -67,13 +67,13 @@ jobs:
test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-focal-rocm-py3_10-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm-py3_10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3_10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
@ -95,17 +95,17 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm-py3_10-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-focal-rocm-py3_10
docker-image: ${{ needs.linux-focal-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp:

View File

@ -36,14 +36,14 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm-py3_10-inductor-build:
linux-jammy-rocm-py3_10-inductor-build:
name: rocm-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
@ -51,15 +51,15 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-inductor-test:
linux-jammy-rocm-py3_10-inductor-test:
permissions:
id-token: write
contents: read
name: rocm-py3.10-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm-py3_10-inductor-build
needs: linux-jammy-rocm-py3_10-inductor-build
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-inductor-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -29,14 +29,14 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm-py3_10-inductor-build:
linux-jammy-rocm-py3_10-inductor-build:
name: rocm-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
@ -44,15 +44,15 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-inductor-test:
linux-jammy-rocm-py3_10-inductor-test:
permissions:
id-token: write
contents: read
name: rocm-py3.10-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm-py3_10-inductor-build
needs: linux-jammy-rocm-py3_10-inductor-build
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-inductor-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -49,14 +49,14 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm-py3_10-build:
name: linux-focal-rocm-py3.10
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
@ -65,17 +65,17 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-test:
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm-py3_10-build
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -140,14 +140,14 @@ jobs:
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-focal-rocm-py3_10-build:
name: linux-focal-rocm-py3.10
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
@ -156,19 +156,19 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-test:
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm-py3_10-build
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-focal-cuda12_6-py3-gcc11-slow-gradcheck-build:

View File

@ -413,16 +413,16 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-build:
linux-jammy-rocm-py3_10-build:
# don't run build twice on main
if: github.event_name == 'pull_request'
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [

View File

@ -36,15 +36,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm-py3_10-build:
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
@ -57,17 +57,17 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-test:
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm-py3_10-build
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -26,13 +26,13 @@ jobs:
id-token: write
contents: read
linux-focal-rocm-py3_10-build:
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
@ -45,17 +45,17 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-test:
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm-py3_10-build
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -103,14 +103,14 @@ jobs:
test-matrix: ${{ needs.linux-focal-py3_9-clang10-build.outputs.test-matrix }}
secrets: inherit
linux-focal-rocm-py3_10-build:
name: linux-focal-rocm-py3.10
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
@ -118,19 +118,19 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-test:
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm-py3_10-build
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang15-asan-build:

View File

@ -165,15 +165,15 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-focal-rocm-py3_10-build:
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
build-environment: linux-jammy-rocm-py3.10
docker-image-name: pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
@ -183,20 +183,20 @@ jobs:
]}
secrets: inherit
linux-focal-rocm-py3_10-test:
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-focal-rocm-py3.10
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm-py3_10-build
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm-py3.10
docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit

View File

@ -378,6 +378,15 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
template <typename Dtype, typename C_Dtype = Dtype>
static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// regression in ROCm 6.4, planned to be fixed in 6.4.1, hipblaslt TT fp32 calculation errors
// best to disallow hipblaslt for this specific case
if constexpr (std::is_same_v<Dtype, float>) {
if (_cublasOpFromChar(transa) == CUBLAS_OP_T && _cublasOpFromChar(transb) == CUBLAS_OP_T) {
return false;
}
}
#endif
cudaDataType_t abType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;

View File

@ -613,6 +613,14 @@ auto GetHipBlasLtTypeStringAndOps() {
auto b_datatype = HipDataTypeFor<BT>();
auto in_out_datatype = HipDataTypeFor<CT>();
std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;
#if ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F)
&& (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) {
std::vector<std::pair<std::string, std::unique_ptr<Callable<ParamsT>>>> ignore;
return ignore;
}
#endif
hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
if (at::globalContext().allowTF32CuBLAS()) {

View File

@ -346,7 +346,15 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
disable_addmm_cuda_lt |= disable_addmm_cuda_lt_override;
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
cublasCommonArgs _args(mat1, mat2, result);
if (_args.transa == 't' && _args.transb == 't') {
disable_addmm_cuda_lt_final = true;
}
#endif
at::ScalarType scalar_type = mat1.scalar_type();
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
c10::MaybeOwned<Tensor> self_;
@ -360,7 +368,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
// the last two conditions is to skip 16b transA and non-trans-B having
// leading dim >> rows when they are sliced from a large tensor
// see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul
if (!disable_addmm_cuda_lt) {
if (!disable_addmm_cuda_lt_final) {
useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
self.is_contiguous() && result.is_contiguous() &&

View File

@ -26,7 +26,7 @@ from torch.testing._internal.common_fsdp import (
patch_reduce_scatter,
reduce_scatter_with_assert,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, skipIfRocm
class TestFullyShardMixedPrecisionTraining(FSDPTest):
@ -81,6 +81,7 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
use_shard_placement_fn_vals.append(True)
return use_shard_placement_fn_vals
@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
def test_compute_dtype(self):
@ -160,6 +161,7 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
self.assertEqual(fsdp_loss, ref_loss)
check_sharded_parity(self, ref_model, model)
@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
def test_reduce_dtype(self):

View File

@ -367,7 +367,9 @@ class TestInductorDynamic(TestCase):
@torch._dynamo.config.patch(capture_scalar_outputs=True)
@torch._inductor.config.patch(implicit_fallbacks=True)
def test_item_to_inputs_kernel_nobreak(self, device):
@torch.library.custom_op("test::foo", mutates_args=())
@torch.library.custom_op(
"test_item_to_inputs_kernel_nobreak::foo", mutates_args=()
)
def foo(x: torch.Tensor, y: int) -> torch.Tensor:
return x.clone()
@ -378,7 +380,7 @@ class TestInductorDynamic(TestCase):
@torch.compile(fullgraph=True)
def f(x, r):
y = x.item()
return torch.ops.test.foo(r, y)
return torch.ops.test_item_to_inputs_kernel_nobreak.foo(r, y)
f(torch.tensor([3], device=device), torch.randn(10, device=device))
@ -591,7 +593,9 @@ class TestInductorDynamic(TestCase):
)
@torch._inductor.config.patch(implicit_fallbacks=True)
def test_multi_output_unbacked_custom_op(self, device):
@torch.library.custom_op("test::foo", mutates_args=())
@torch.library.custom_op(
"test_multi_output_unbacked_custom_op::foo", mutates_args=()
)
def foo(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
return torch.empty(2, device=x.device), torch.empty(3, device=x.device)
@ -603,7 +607,7 @@ class TestInductorDynamic(TestCase):
@torch.compile(fullgraph=True)
def f(x):
a, b = torch.ops.test.foo(x)
a, b = torch.ops.test_multi_output_unbacked_custom_op.foo(x)
return a.sum() + b.sum()
f(torch.tensor([3], device=device))

View File

@ -14123,6 +14123,9 @@ op_db: list[OpInfo] = [
check_batched_forward_grad=False,
supports_out=False,
skips=(
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
)),
OpInfo('masked_select',
dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
@ -15567,6 +15570,9 @@ op_db: list[OpInfo] = [
# NOTE: this failure may not reproduce consistently on different systems
# false INTERNAL ASSERT FAILED at "...torch/csrc/jit/passes/utils/check_alias_annotation.cpp":185
DecorateInfo(unittest.skip("Internal assert failed!"), 'TestJit', 'test_variant_consistency_jit'),
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
)),
OpInfo('nn.functional.interpolate',
aten_name="interpolate",
@ -18633,7 +18639,12 @@ op_db: list[OpInfo] = [
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
sample_inputs_func=sample_inputs_scatter,
error_inputs_func=error_inputs_scatter_and_scatter_add),
error_inputs_func=error_inputs_scatter_and_scatter_add,
skips=(
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
)),
UnaryUfuncInfo(
'bfloat16',
op=lambda x, *args, **kwargs: x.bfloat16(*args, **kwargs),
@ -19422,7 +19433,11 @@ op_db: list[OpInfo] = [
error_inputs_func=error_inputs_scatter_and_scatter_add,
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
),
skips=(
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
)),
OpInfo('stack',
dtypes=all_types_and_complex_and(torch.complex32, torch.bool, torch.float16, torch.bfloat16),
sample_inputs_func=sample_inputs_stack,
@ -19433,8 +19448,7 @@ op_db: list[OpInfo] = [
# https://github.com/pytorch/pytorch/issues/77046
DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
),
),
)),
OpInfo('_chunk_cat',
dtypes=all_types_and_complex_and(torch.complex32, torch.bool, torch.float16, torch.bfloat16),
sample_inputs_func=sample_inputs_chunk_cat,
@ -20014,13 +20028,23 @@ op_db: list[OpInfo] = [
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
error_inputs_func=error_inputs_tril_triu,
sample_inputs_func=sample_inputs_tril_triu),
sample_inputs_func=sample_inputs_tril_triu,
skips=(
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
)),
OpInfo('triu',
dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
error_inputs_func=error_inputs_tril_triu,
sample_inputs_func=sample_inputs_tril_triu),
sample_inputs_func=sample_inputs_tril_triu,
skips=(
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
)),
OpInfo('triu_indices',
dtypes=_dispatch_dtypes((torch.int32, torch.int64)),
sample_inputs_func=sample_inputs_trilu_indices,
@ -21598,6 +21622,11 @@ op_db: list[OpInfo] = [
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
sample_inputs_func=sample_inputs_scatter_reduce,
skips=(
# Compiler issue on ROCm. Regression started in ROCm 6.4.
DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_non_standard_bool_values',
dtypes=[torch.bool], active_if=TEST_WITH_ROCM),
),
),
OpInfo(
'scatter_reduce',

View File

@ -1442,6 +1442,7 @@ op_db: list[OpInfo] = [
device_type="cpu",
dtypes=(torch.complex128,),
),
skipCUDAIfRocm, # regression in ROCm 6.4
],
),
OpInfo(