PR #32960: [ROCm] Refactor testing scripts

Imported from GitHub PR https://github.com/openxla/xla/pull/32960 📝 Summary of Changes (Partially) upstreaming changes from: https://github.com/ROCm/xla/pull/323, 9d358b9b26, and https://github.com/ROCm/xla/pull/385. It skips some asan/tsan changes for now. 🎯 Justification These changes are ROCm-specific and help with ROCm internal CI validation pipelines. 🚀 Kind of Contribution 🐛 Bug Fix, ♻️ Cleanup, 🧪 Tests 📊 Benchmark (for Performance Improvements) / 🧪 Unit Tests: / 🧪 Execution Tests: / Copybara import of the project: -- 804ff1b6a6fbba86a3e0a09d739179a4eb4f197d by Milica Makevic <Milica.Makevic@amd.com>: Add missing cuda-only tag to cuda test -- 44ce7a2d56c9f0c80405447f431ae1e5a33f42e1 by Milica Makevic <Milica.Makevic@amd.com>: Refactor test scripts -- fb783c968e9d2ff5d92357908d99e4952235c2bc by Milica Makevic <Milica.Makevic@amd.com>: Cover more mgpu tests -- 1f53712274f76202241bd3631dbf065826c0b960 by Milica Makevic <Milica.Makevic@amd.com>: Switch from rocm_gcc to rocm_ci for sgpu tests -- 00e0c8ee2a763680f5a3665dab62202ab230731d by Milica Makevic <Milica.Makevic@amd.com>: Changing file permissions -- 003c062a8900c12b73c0972e8d406f2661a27aba by Milica Makevic <Milica.Makevic@amd.com>: Remove unnecessary import -- 214599355f40f1b65e0540daf0b9829d2c950115 by Harsha HS <Harsha.HavanurShamsundara@amd.com>: Add license header Merging this change closes #32960 PiperOrigin-RevId: 822245565
2025-12-06 12:20:11 +01:00 · 2025-10-21 13:06:57 -07:00 · 2025-10-21 13:06:57 -07:00 · 47cd01d4a5
commit 47cd01d4a5
parent 7a107e3571
6 changed files with 126 additions and 61 deletions
--- a/third_party/xla/build_tools/rocm/rocm_tag_filters.sh
+++ b/third_party/xla/build_tools/rocm/rocm_tag_filters.sh
@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+# Negative tag filters shared by the ROCm test scripts; every entry starts with `-`, so targets carrying that tag are excluded.
+TAG_FILTERS=(
+    -no_gpu
+    -requires-gpu-intel
+    -requires-gpu-nvidia
+    -cuda-only
+    -oneapi-only
+    -requires-gpu-sm60
+    -requires-gpu-sm60-only
+    -requires-gpu-sm70
+    -requires-gpu-sm70-only
+    -requires-gpu-sm80
+    -requires-gpu-sm80-only
+    -requires-gpu-sm86
+    -requires-gpu-sm86-only
+    -requires-gpu-sm89
+    -requires-gpu-sm89-only
+    -requires-gpu-sm90
+    -requires-gpu-sm90-only
+)
+# Print the list as a single comma-separated line; IFS is changed only inside the subshell.
+(IFS=,; printf '%s\n' "${TAG_FILTERS[*]}")
--- a/third_party/xla/build_tools/rocm/rocm_xla.bazelrc
+++ b/third_party/xla/build_tools/rocm/rocm_xla.bazelrc
@ -1,5 +1,4 @@
 # Test-related settings.
-try-import /usertools/rocm.bazelrc

 build:rocm_dev --remote_upload_local_results=false
 build:rocm_dev --remote_cache="https://wardite.cluster.engflow.com"
@ -30,3 +29,48 @@ build:tsan --//build_tools/rocm:sanitizer=tsan
 build:asan --test_env=ASAN_OPTIONS=suppressions=build_tools/rocm/asan_ignore_list.txt:use_sigaltstack=0
 build:asan --test_env=LSAN_OPTIONS=suppressions=build_tools/rocm/lsan_ignore_list.txt:use_sigaltstack=0
 build:asan --//build_tools/rocm:sanitizer=asan
+# Target set selected by run_xla.sh via --config=xla_sgpu: all of //xla/... minus the targets below, each of which is listed under test:xla_mgpu.
+test:xla_sgpu -- \
+//xla/... \
+-//xla/backends/gpu/collectives:gpu_clique_key_test \
+-//xla/backends/gpu/collectives:nccl_communicator_test \
+-//xla/service:collective_ops_utils_test \
+-//xla/service:collective_pipeliner_test \
+-//xla/service:collective_permute_cycle_test \
+-//xla/service:batched_gather_scatter_normalizer_test \
+-//xla/service:all_reduce_simplifier_test \
+-//xla/service:all_gather_simplifier_test \
+-//xla/service:reduce_scatter_decomposer_test \
+-//xla/service:reduce_scatter_reassociate_test \
+-//xla/service:reduce_scatter_combiner_test \
+-//xla/service:scatter_simplifier_test \
+-//xla/service:sharding_propagation_test \
+-//xla/service:sharding_remover_test \
+-//xla/service:p2p_schedule_preparation_test \
+-//xla/pjrt/distributed:topology_util_test \
+-//xla/pjrt/distributed:client_server_test
+# Target set selected by run_xla_multi_gpu.sh via --config=xla_mgpu; includes every target that test:xla_sgpu excludes.
+test:xla_mgpu -- \
+//xla/tests:collective_ops_e2e_test \
+//xla/tests:collective_ops_test \
+//xla/tests:collective_pipeline_parallelism_test \
+//xla/tests:replicated_io_feed_test \
+//xla/backends/gpu/collectives:gpu_clique_key_test \
+//xla/backends/gpu/collectives:nccl_communicator_test \
+//xla/backends/gpu/runtime:all_reduce_test \
+//xla/service:collective_ops_utils_test \
+//xla/service:collective_pipeliner_test \
+//xla/service:collective_permute_cycle_test \
+//xla/service:batched_gather_scatter_normalizer_test \
+//xla/service:all_reduce_simplifier_test \
+//xla/service:all_gather_simplifier_test \
+//xla/service:reduce_scatter_decomposer_test \
+//xla/service:reduce_scatter_reassociate_test \
+//xla/service:reduce_scatter_combiner_test \
+//xla/service:scatter_simplifier_test \
+//xla/service:sharding_propagation_test \
+//xla/service:sharding_remover_test \
+//xla/service:p2p_schedule_preparation_test \
+//xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
+//xla/pjrt/distributed:topology_util_test \
+//xla/pjrt/distributed:client_server_test 
--- a/third_party/xla/build_tools/rocm/run_xla.sh
+++ b/third_party/xla/build_tools/rocm/run_xla.sh
@ -37,30 +37,23 @@ echo ""
 echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
 echo ""

-# First positional argument (if any) specifies the ROCM_INSTALL_DIR
-if [[ -n $1 ]]; then
-    ROCM_INSTALL_DIR=$1
-else
-    if [[ -z "${ROCM_PATH}" ]]; then
-        ROCM_INSTALL_DIR=/opt/rocm/
-    else
-        ROCM_INSTALL_DIR=$ROCM_PATH
-    fi
-fi
-
 export PYTHON_BIN_PATH=`which python3`
 export TF_NEED_ROCM=1
-export ROCM_PATH=$ROCM_INSTALL_DIR
-TAGS_FILTER="gpu,requires-gpu-amd,-multi_gpu,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-cuda-only,-oneapi-only"
-UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
-TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
+export ROCM_PATH=/opt/rocm

-bazel \
-    test \
-    --define xnn_enable_avxvnniint8=false --define xnn_enable_avx512fp16=false \
-    --config=rocm_gcc \
-    --build_tag_filters=${TAGS_FILTER} \
-    --test_tag_filters=${TAGS_FILTER} \
+# Tag filters: shared exclusions printed by rocm_tag_filters.sh, followed by this script's own selection.
+SCRIPT_DIR=$(realpath "$(dirname "$0")")
+TAG_FILTERS="$("$SCRIPT_DIR/rocm_tag_filters.sh"),gpu,-multi_gpu,-multi_gpu_h100,requires-gpu-amd,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial"
+if [ ! -d /tf/pkg ]; then
+	mkdir -p /tf/pkg
+fi
+
+bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
+    --config=rocm_ci \
+    --config=xla_sgpu \
+    --build_tag_filters=$TAG_FILTERS \
+    --test_tag_filters=$TAG_FILTERS \
+    --profile=/tf/pkg/profile.json.gz \
    --test_timeout=920,2400,7200,9600 \
    --test_sharding_strategy=disabled \
    --test_output=errors \
@ -70,8 +63,6 @@ bazel \
    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
    --action_env=TF_ROCM_AMDGPU_TARGETS=${AMD_GPU_GFX_ID} \
-    --action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
-    --action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
+    --action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
    --repo_env="ROCM_PATH=$ROCM_PATH" \
-    --run_under=//build_tools/ci:parallel_gpu_execute \
-    -- //xla/...
+    --run_under=//build_tools/ci:parallel_gpu_execute
--- a/third_party/xla/build_tools/rocm/run_xla_ci_build.sh
+++ b/third_party/xla/build_tools/rocm/run_xla_ci_build.sh
@ -18,16 +18,22 @@
 set -e
 set -x

+# Tag filters: shared exclusions printed by rocm_tag_filters.sh, followed by this script's own selection.
+SCRIPT_DIR=$(realpath "$(dirname "$0")")
+TAG_FILTERS="$("$SCRIPT_DIR/rocm_tag_filters.sh"),gpu,-multi_gpu,-multi_gpu_h100,requires-gpu-amd,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial"
+if [ ! -d /tf/pkg ]; then
+	mkdir -p /tf/pkg
+fi
+
 SCRIPT_DIR=$(dirname $0)
 bazel --bazelrc="$SCRIPT_DIR/rocm_xla.bazelrc" test \
 	"$@" \
-	--test_tag_filters=gpu,requires-gpu-amd,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-requires-gpu-sm60,-requires-gpu-sm60-only,-requires-gpu-sm70,-requires-gpu-sm70-only,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm86,-requires-gpu-sm86-only,-requires-gpu-sm89,-requires-gpu-sm89-only,-requires-gpu-sm90,-requires-gpu-sm90-only \
-	--build_tag_filters=gpu,requires-gpu-amd,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-requires-gpu-sm60,-requires-gpu-sm60-only,-requires-gpu-sm70,-requires-gpu-sm70-only,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm86,-requires-gpu-sm86-only,-requires-gpu-sm89,-requires-gpu-sm89-only,-requires-gpu-sm90,-requires-gpu-sm90-only \
+	--build_tag_filters=$TAG_FILTERS \
+    --test_tag_filters=$TAG_FILTERS \
 	--profile=/tf/pkg/profile.json.gz \
 	--keep_going \
 	--test_env=TF_TESTS_PER_GPU=1 \
-	--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
-	--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
+	--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
 	--test_output=errors \
 	--local_test_jobs=2 \
 	--run_under=//build_tools/rocm:parallel_gpu_execute
--- a/third_party/xla/build_tools/rocm/run_xla_multi_gpu.sh
+++ b/third_party/xla/build_tools/rocm/run_xla_multi_gpu.sh
@ -53,31 +53,23 @@ echo ""
 echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
 echo ""

-# First positional argument (if any) specifies the ROCM_INSTALL_DIR
-if [[ -n $1 ]]; then
-    ROCM_INSTALL_DIR=$1
-else
-    if [[ -z "${ROCM_PATH}" ]]; then
-        ROCM_INSTALL_DIR=/opt/rocm/
-    else
-        ROCM_INSTALL_DIR=$ROCM_PATH
-    fi
-fi
-
 export PYTHON_BIN_PATH=`which python3`
 export TF_NEED_ROCM=1
-export ROCM_PATH=$ROCM_INSTALL_DIR
-TAGS_FILTER="-requires-gpu-nvidia,-oss_excluded,-oss_serial"
-UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
-TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
+export ROCM_PATH=/opt/rocm/

-bazel \
-    test \
-    --define xnn_enable_avxvnniint8=false \
-    --define xnn_enable_avx512fp16=false \
-    --config=rocm_gcc \
-    --build_tag_filters=${TAGS_FILTER} \
-    --test_tag_filters=${TAGS_FILTER} \
+# Tag filters: only the shared exclusions printed by rocm_tag_filters.sh.
+SCRIPT_DIR=$(realpath "$(dirname "$0")")
+TAG_FILTERS="$("$SCRIPT_DIR/rocm_tag_filters.sh")"
+if [ ! -d /tf/pkg ]; then
+	mkdir -p /tf/pkg
+fi
+
+bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
+    --config=rocm_ci \
+    --config=xla_mgpu \
+    --build_tag_filters=${TAG_FILTERS} \
+    --test_tag_filters=${TAG_FILTERS} \
+    --profile=/tf/pkg/profile.json.gz \
    --test_timeout=920,2400,7200,9600 \
    --test_sharding_strategy=disabled \
    --test_output=errors \
@ -90,12 +82,3 @@ bazel \
-    --action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
-    --action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
+    --action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
    --action_env=NCCL_MAX_NCHANNELS=1 \
-    --repo_env="ROCM_PATH=$ROCM_PATH" \
-    -- //xla/tests:collective_ops_e2e_test \
-       //xla/tests:collective_ops_test \
-       //xla/tests:collective_pipeline_parallelism_test \
-       //xla/tests:replicated_io_feed_test \
-       //xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
-       //xla/pjrt/distributed:topology_util_test \
-       //xla/pjrt/distributed:client_server_test \
-       //xla/backends/gpu/runtime:all_reduce_test
+    --repo_env="ROCM_PATH=$ROCM_PATH"
--- a/third_party/xla/xla/backends/profiler/gpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/gpu/BUILD
@ -166,6 +166,7 @@ cuda_library(
    testonly = 1,
    srcs = ["cuda_test.cu.cc"],
    hdrs = ["cuda_test.h"],
+    tags = ["cuda-only"],
    visibility = ["//visibility:public"],
    deps = [
        "@com_google_googletest//:gtest_for_library",