PR #32960: [ROCm] Refactor testing scripts

Imported from GitHub PR https://github.com/openxla/xla/pull/32960

📝 Summary of Changes
(Partially) upstreaming changes from: https://github.com/ROCm/xla/pull/323, 9d358b9b26, and https://github.com/ROCm/xla/pull/385. It skips some asan/tsan changes for now.

🎯 Justification
These changes are ROCm-specific and help with ROCm internal CI validation pipelines.

🚀 Kind of Contribution
🐛 Bug Fix, ♻️ Cleanup, 🧪 Tests

📊 Benchmark (for Performance Improvements)
/

🧪 Unit Tests:
/

🧪 Execution Tests:
/

Copybara import of the project:

--
804ff1b6a6fbba86a3e0a09d739179a4eb4f197d by Milica Makevic <Milica.Makevic@amd.com>:

Add missing cuda-only tag to cuda test

--
44ce7a2d56c9f0c80405447f431ae1e5a33f42e1 by Milica Makevic <Milica.Makevic@amd.com>:

Refactor test scripts

--
fb783c968e9d2ff5d92357908d99e4952235c2bc by Milica Makevic <Milica.Makevic@amd.com>:

Cover more mgpu tests

--
1f53712274f76202241bd3631dbf065826c0b960 by Milica Makevic <Milica.Makevic@amd.com>:

Switch from rocm_gcc to rocm_ci for sgpu tests

--
00e0c8ee2a763680f5a3665dab62202ab230731d by Milica Makevic <Milica.Makevic@amd.com>:

Changing file permissions

--
003c062a8900c12b73c0972e8d406f2661a27aba by Milica Makevic <Milica.Makevic@amd.com>:

Remove unnecessary import

--
214599355f40f1b65e0540daf0b9829d2c950115 by Harsha HS <Harsha.HavanurShamsundara@amd.com>:

Add license header

Merging this change closes #32960

PiperOrigin-RevId: 822245565
This commit is contained in:
mmakevic-amd 2025-10-21 13:06:57 -07:00 committed by TensorFlower Gardener
parent 7a107e3571
commit 47cd01d4a5
6 changed files with 126 additions and 61 deletions

View File

@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
# Bazel tag filters shared by the ROCm test scripts. Every entry starts with
# "-", i.e. it excludes targets carrying that tag.
TAG_FILTERS=(
  -no_gpu
  -requires-gpu-intel
  -requires-gpu-nvidia
  -cuda-only
  -oneapi-only
  -requires-gpu-sm60
  -requires-gpu-sm60-only
  -requires-gpu-sm70
  -requires-gpu-sm70-only
  -requires-gpu-sm80
  -requires-gpu-sm80-only
  -requires-gpu-sm86
  -requires-gpu-sm86-only
  -requires-gpu-sm89
  -requires-gpu-sm89-only
  -requires-gpu-sm90
  -requires-gpu-sm90-only
)
readonly TAG_FILTERS

# Print the list as one comma-separated line on stdout. IFS is changed only
# inside the subshell so "${TAG_FILTERS[*]}" joins on ","; printf is used
# instead of echo so a leading "-" can never be parsed as an option.
(
  IFS=,
  printf '%s\n' "${TAG_FILTERS[*]}"
)

View File

@ -1,5 +1,4 @@
# Test-related settings.
try-import /usertools/rocm.bazelrc
build:rocm_dev --remote_upload_local_results=false
build:rocm_dev --remote_cache="https://wardite.cluster.engflow.com"
@ -30,3 +29,48 @@ build:tsan --//build_tools/rocm:sanitizer=tsan
build:asan --test_env=ASAN_OPTIONS=suppressions=build_tools/rocm/asan_ignore_list.txt:use_sigaltstack=0
build:asan --test_env=LSAN_OPTIONS=suppressions=build_tools/rocm/lsan_ignore_list.txt:use_sigaltstack=0
build:asan --//build_tools/rocm:sanitizer=asan
# Target patterns for --config=xla_sgpu: everything under //xla/... minus the
# targets subtracted below. Every subtracted target is also listed under
# test:xla_mgpu in this file.
test:xla_sgpu -- \
//xla/... \
-//xla/backends/gpu/collectives:gpu_clique_key_test \
-//xla/backends/gpu/collectives:nccl_communicator_test \
-//xla/service:collective_ops_utils_test \
-//xla/service:collective_pipeliner_test \
-//xla/service:collective_permute_cycle_test \
-//xla/service:batched_gather_scatter_normalizer_test \
-//xla/service:all_reduce_simplifier_test \
-//xla/service:all_gather_simplifier_test \
-//xla/service:reduce_scatter_decomposer_test \
-//xla/service:reduce_scatter_reassociate_test \
-//xla/service:reduce_scatter_combiner_test \
-//xla/service:scatter_simplifier_test \
-//xla/service:sharding_propagation_test \
-//xla/service:sharding_remover_test \
-//xla/service:p2p_schedule_preparation_test \
-//xla/pjrt/distributed:topology_util_test \
-//xla/pjrt/distributed:client_server_test
# Target patterns for --config=xla_mgpu: an explicit list of test targets. It
# includes every target that test:xla_sgpu subtracts from //xla/... in this
# file, plus additional targets listed only here.
test:xla_mgpu -- \
//xla/tests:collective_ops_e2e_test \
//xla/tests:collective_ops_test \
//xla/tests:collective_pipeline_parallelism_test \
//xla/tests:replicated_io_feed_test \
//xla/backends/gpu/collectives:gpu_clique_key_test \
//xla/backends/gpu/collectives:nccl_communicator_test \
//xla/backends/gpu/runtime:all_reduce_test \
//xla/service:collective_ops_utils_test \
//xla/service:collective_pipeliner_test \
//xla/service:collective_permute_cycle_test \
//xla/service:batched_gather_scatter_normalizer_test \
//xla/service:all_reduce_simplifier_test \
//xla/service:all_gather_simplifier_test \
//xla/service:reduce_scatter_decomposer_test \
//xla/service:reduce_scatter_reassociate_test \
//xla/service:reduce_scatter_combiner_test \
//xla/service:scatter_simplifier_test \
//xla/service:sharding_propagation_test \
//xla/service:sharding_remover_test \
//xla/service:p2p_schedule_preparation_test \
//xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
//xla/pjrt/distributed:topology_util_test \
//xla/pjrt/distributed:client_server_test

View File

@ -37,30 +37,23 @@ echo ""
echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
echo ""
# First positional argument (if any) specifies the ROCM_INSTALL_DIR
if [[ -n $1 ]]; then
ROCM_INSTALL_DIR=$1
else
if [[ -z "${ROCM_PATH}" ]]; then
ROCM_INSTALL_DIR=/opt/rocm/
else
ROCM_INSTALL_DIR=$ROCM_PATH
fi
fi
export PYTHON_BIN_PATH=`which python3`
export TF_NEED_ROCM=1
export ROCM_PATH=$ROCM_INSTALL_DIR
TAGS_FILTER="gpu,requires-gpu-amd,-multi_gpu,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-cuda-only,-oneapi-only"
UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
export ROCM_PATH=/opt/rocm
bazel \
test \
--define xnn_enable_avxvnniint8=false --define xnn_enable_avx512fp16=false \
--config=rocm_gcc \
--build_tag_filters=${TAGS_FILTER} \
--test_tag_filters=${TAGS_FILTER} \
# Absolute directory containing this script; used to locate rocm_tag_filters.sh.
SCRIPT_DIR=$(realpath "$(dirname "$0")")
# Comma-separated list printed by rocm_tag_filters.sh, followed by the filters
# specific to this script. No empty entries (",,") may appear in the list.
TAG_FILTERS="$("$SCRIPT_DIR/rocm_tag_filters.sh"),gpu,-multi_gpu,-multi_gpu_h100,requires-gpu-amd,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial"
if [ ! -d /tf/pkg ]; then
mkdir -p /tf/pkg
fi
bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
--config=rocm_ci \
--config=xla_sgpu \
--build_tag_filters=$TAG_FILTERS \
--test_tag_filters=$TAG_FILTERS \
--profile=/tf/pkg/profile.json.gz \
--test_timeout=920,2400,7200,9600 \
--test_sharding_strategy=disabled \
--test_output=errors \
@ -70,8 +63,6 @@ bazel \
--test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
--test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
--action_env=TF_ROCM_AMDGPU_TARGETS=${AMD_GPU_GFX_ID} \
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
--repo_env="ROCM_PATH=$ROCM_PATH" \
--run_under=//build_tools/ci:parallel_gpu_execute \
-- //xla/...
--run_under=//build_tools/ci:parallel_gpu_execute

View File

@ -18,16 +18,22 @@
set -e
set -x
# Absolute directory containing this script; used to locate rocm_tag_filters.sh.
SCRIPT_DIR=$(realpath "$(dirname "$0")")
# Comma-separated list printed by rocm_tag_filters.sh, followed by the filters
# specific to this script. No empty entries (",,") may appear in the list.
TAG_FILTERS="$("$SCRIPT_DIR/rocm_tag_filters.sh"),gpu,-multi_gpu,-multi_gpu_h100,requires-gpu-amd,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial"
if [ ! -d /tf/pkg ]; then
mkdir -p /tf/pkg
fi
SCRIPT_DIR=$(dirname $0)
bazel --bazelrc="$SCRIPT_DIR/rocm_xla.bazelrc" test \
"$@" \
--test_tag_filters=gpu,requires-gpu-amd,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-requires-gpu-sm60,-requires-gpu-sm60-only,-requires-gpu-sm70,-requires-gpu-sm70-only,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm86,-requires-gpu-sm86-only,-requires-gpu-sm89,-requires-gpu-sm89-only,-requires-gpu-sm90,-requires-gpu-sm90-only \
--build_tag_filters=gpu,requires-gpu-amd,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-requires-gpu-sm60,-requires-gpu-sm60-only,-requires-gpu-sm70,-requires-gpu-sm70-only,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm86,-requires-gpu-sm86-only,-requires-gpu-sm89,-requires-gpu-sm89-only,-requires-gpu-sm90,-requires-gpu-sm90-only \
--build_tag_filters=$TAG_FILTERS \
--test_tag_filters=$TAG_FILTERS \
--profile=/tf/pkg/profile.json.gz \
--keep_going \
--test_env=TF_TESTS_PER_GPU=1 \
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
--test_output=errors \
--local_test_jobs=2 \
--run_under=//build_tools/rocm:parallel_gpu_execute

View File

@ -53,31 +53,23 @@ echo ""
echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
echo ""
# First positional argument (if any) specifies the ROCM_INSTALL_DIR
if [[ -n $1 ]]; then
ROCM_INSTALL_DIR=$1
else
if [[ -z "${ROCM_PATH}" ]]; then
ROCM_INSTALL_DIR=/opt/rocm/
else
ROCM_INSTALL_DIR=$ROCM_PATH
fi
fi
export PYTHON_BIN_PATH=`which python3`
export TF_NEED_ROCM=1
export ROCM_PATH=$ROCM_INSTALL_DIR
TAGS_FILTER="-requires-gpu-nvidia,-oss_excluded,-oss_serial"
UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
export ROCM_PATH=/opt/rocm/
bazel \
test \
--define xnn_enable_avxvnniint8=false \
--define xnn_enable_avx512fp16=false \
--config=rocm_gcc \
--build_tag_filters=${TAGS_FILTER} \
--test_tag_filters=${TAGS_FILTER} \
# Absolute directory containing this script; used to locate rocm_tag_filters.sh.
# All expansions are quoted so a checkout path containing whitespace still works.
SCRIPT_DIR=$(realpath "$(dirname "$0")")
# Comma-separated tag filters printed by rocm_tag_filters.sh.
TAG_FILTERS="$("$SCRIPT_DIR/rocm_tag_filters.sh")"
if [ ! -d /tf/pkg ]; then
mkdir -p /tf/pkg
fi
bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
--config=rocm_ci \
--config=xla_mgpu \
--build_tag_filters=${TAG_FILTERS} \
--test_tag_filters=${TAG_FILTERS} \
--profile=/tf/pkg/profile.json.gz \
--test_timeout=920,2400,7200,9600 \
--test_sharding_strategy=disabled \
--test_output=errors \
@ -90,12 +82,4 @@ bazel \
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
--action_env=NCCL_MAX_NCHANNELS=1 \
--repo_env="ROCM_PATH=$ROCM_PATH" \
-- //xla/tests:collective_ops_e2e_test \
//xla/tests:collective_ops_test \
//xla/tests:collective_pipeline_parallelism_test \
//xla/tests:replicated_io_feed_test \
//xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
//xla/pjrt/distributed:topology_util_test \
//xla/pjrt/distributed:client_server_test \
//xla/backends/gpu/runtime:all_reduce_test
--repo_env="ROCM_PATH=$ROCM_PATH"

View File

@ -166,6 +166,7 @@ cuda_library(
testonly = 1,
srcs = ["cuda_test.cu.cc"],
hdrs = ["cuda_test.h"],
tags = ["cuda-only"],
visibility = ["//visibility:public"],
deps = [
"@com_google_googletest//:gtest_for_library",