mirror of
https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00
PR #32960: [ROCm] Refactor testing scripts
Imported from GitHub PR https://github.com/openxla/xla/pull/32960
📝 Summary of Changes
(Partially) upstreaming changes from: https://github.com/ROCm/xla/pull/323, 9d358b9b26, and https://github.com/ROCm/xla/pull/385. It skips some asan/tsan changes for now.
🎯 Justification
These changes are ROCm-specific and help with ROCm internal CI validation pipelines.
🚀 Kind of Contribution
🐛 Bug Fix, ♻️ Cleanup, 🧪 Tests
📊 Benchmark (for Performance Improvements)
/
🧪 Unit Tests:
/
🧪 Execution Tests:
/
Copybara import of the project:
--
804ff1b6a6fbba86a3e0a09d739179a4eb4f197d by Milica Makevic <Milica.Makevic@amd.com>:
Add missing cuda-only tag to cuda test
--
44ce7a2d56c9f0c80405447f431ae1e5a33f42e1 by Milica Makevic <Milica.Makevic@amd.com>:
Refactor test scripts
--
fb783c968e9d2ff5d92357908d99e4952235c2bc by Milica Makevic <Milica.Makevic@amd.com>:
Cover more mgpu tests
--
1f53712274f76202241bd3631dbf065826c0b960 by Milica Makevic <Milica.Makevic@amd.com>:
Switch from rocm_gcc to rocm_ci for sgpu tests
--
00e0c8ee2a763680f5a3665dab62202ab230731d by Milica Makevic <Milica.Makevic@amd.com>:
Changing file permissions
--
003c062a8900c12b73c0972e8d406f2661a27aba by Milica Makevic <Milica.Makevic@amd.com>:
Remove unnecessary import
--
214599355f40f1b65e0540daf0b9829d2c950115 by Harsha HS <Harsha.HavanurShamsundara@amd.com>:
Add license header
Merging this change closes #32960
PiperOrigin-RevId: 822245565
This commit is contained in:
parent
7a107e3571
commit
47cd01d4a5
39
third_party/xla/build_tools/rocm/rocm_tag_filters.sh
vendored
Executable file
39
third_party/xla/build_tools/rocm/rocm_tag_filters.sh
vendored
Executable file
|
|
@ -0,0 +1,39 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
TAG_FILTERS=(
|
||||
-no_gpu
|
||||
-requires-gpu-intel
|
||||
-requires-gpu-nvidia
|
||||
-cuda-only
|
||||
-oneapi-only
|
||||
-requires-gpu-sm60
|
||||
-requires-gpu-sm60-only
|
||||
-requires-gpu-sm70
|
||||
-requires-gpu-sm70-only
|
||||
-requires-gpu-sm80
|
||||
-requires-gpu-sm80-only
|
||||
-requires-gpu-sm86
|
||||
-requires-gpu-sm86-only
|
||||
-requires-gpu-sm89
|
||||
-requires-gpu-sm89-only
|
||||
-requires-gpu-sm90
|
||||
-requires-gpu-sm90-only
|
||||
)
|
||||
|
||||
echo $(IFS=, ; echo "${TAG_FILTERS[*]}")
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
# Test-related settings.
|
||||
try-import /usertools/rocm.bazelrc
|
||||
|
||||
build:rocm_dev --remote_upload_local_results=false
|
||||
build:rocm_dev --remote_cache="https://wardite.cluster.engflow.com"
|
||||
|
|
@ -30,3 +29,48 @@ build:tsan --//build_tools/rocm:sanitizer=tsan
|
|||
build:asan --test_env=ASAN_OPTIONS=suppressions=build_tools/rocm/asan_ignore_list.txt:use_sigaltstack=0
|
||||
build:asan --test_env=LSAN_OPTIONS=suppressions=build_tools/rocm/lsan_ignore_list.txt:use_sigaltstack=0
|
||||
build:asan --//build_tools/rocm:sanitizer=asan
|
||||
|
||||
test:xla_sgpu -- \
|
||||
//xla/... \
|
||||
-//xla/backends/gpu/collectives:gpu_clique_key_test \
|
||||
-//xla/backends/gpu/collectives:nccl_communicator_test \
|
||||
-//xla/service:collective_ops_utils_test \
|
||||
-//xla/service:collective_pipeliner_test \
|
||||
-//xla/service:collective_permute_cycle_test \
|
||||
-//xla/service:batched_gather_scatter_normalizer_test \
|
||||
-//xla/service:all_reduce_simplifier_test \
|
||||
-//xla/service:all_gather_simplifier_test \
|
||||
-//xla/service:reduce_scatter_decomposer_test \
|
||||
-//xla/service:reduce_scatter_reassociate_test \
|
||||
-//xla/service:reduce_scatter_combiner_test \
|
||||
-//xla/service:scatter_simplifier_test \
|
||||
-//xla/service:sharding_propagation_test \
|
||||
-//xla/service:sharding_remover_test \
|
||||
-//xla/service:p2p_schedule_preparation_test \
|
||||
-//xla/pjrt/distributed:topology_util_test \
|
||||
-//xla/pjrt/distributed:client_server_test
|
||||
|
||||
test:xla_mgpu -- \
|
||||
//xla/tests:collective_ops_e2e_test \
|
||||
//xla/tests:collective_ops_test \
|
||||
//xla/tests:collective_pipeline_parallelism_test \
|
||||
//xla/tests:replicated_io_feed_test \
|
||||
//xla/backends/gpu/collectives:gpu_clique_key_test \
|
||||
//xla/backends/gpu/collectives:nccl_communicator_test \
|
||||
//xla/backends/gpu/runtime:all_reduce_test \
|
||||
//xla/service:collective_ops_utils_test \
|
||||
//xla/service:collective_pipeliner_test \
|
||||
//xla/service:collective_permute_cycle_test \
|
||||
//xla/service:batched_gather_scatter_normalizer_test \
|
||||
//xla/service:all_reduce_simplifier_test \
|
||||
//xla/service:all_gather_simplifier_test \
|
||||
//xla/service:reduce_scatter_decomposer_test \
|
||||
//xla/service:reduce_scatter_reassociate_test \
|
||||
//xla/service:reduce_scatter_combiner_test \
|
||||
//xla/service:scatter_simplifier_test \
|
||||
//xla/service:sharding_propagation_test \
|
||||
//xla/service:sharding_remover_test \
|
||||
//xla/service:p2p_schedule_preparation_test \
|
||||
//xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
|
||||
//xla/pjrt/distributed:topology_util_test \
|
||||
//xla/pjrt/distributed:client_server_test
|
||||
41
third_party/xla/build_tools/rocm/run_xla.sh
vendored
41
third_party/xla/build_tools/rocm/run_xla.sh
vendored
|
|
@ -37,30 +37,23 @@ echo ""
|
|||
echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
|
||||
echo ""
|
||||
|
||||
# First positional argument (if any) specifies the ROCM_INSTALL_DIR
|
||||
if [[ -n $1 ]]; then
|
||||
ROCM_INSTALL_DIR=$1
|
||||
else
|
||||
if [[ -z "${ROCM_PATH}" ]]; then
|
||||
ROCM_INSTALL_DIR=/opt/rocm/
|
||||
else
|
||||
ROCM_INSTALL_DIR=$ROCM_PATH
|
||||
fi
|
||||
fi
|
||||
|
||||
export PYTHON_BIN_PATH=`which python3`
|
||||
export TF_NEED_ROCM=1
|
||||
export ROCM_PATH=$ROCM_INSTALL_DIR
|
||||
TAGS_FILTER="gpu,requires-gpu-amd,-multi_gpu,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-cuda-only,-oneapi-only"
|
||||
UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
|
||||
TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
|
||||
export ROCM_PATH=/opt/rocm
|
||||
|
||||
bazel \
|
||||
test \
|
||||
--define xnn_enable_avxvnniint8=false --define xnn_enable_avx512fp16=false \
|
||||
--config=rocm_gcc \
|
||||
--build_tag_filters=${TAGS_FILTER} \
|
||||
--test_tag_filters=${TAGS_FILTER} \
|
||||
SCRIPT_DIR=$(realpath $(dirname $0))
|
||||
TAG_FILTERS=$($SCRIPT_DIR/rocm_tag_filters.sh),gpu,-multi_gpu,-multi_gpu_h100,requires-gpu-amd,,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial
|
||||
|
||||
if [ ! -d /tf/pkg ]; then
|
||||
mkdir -p /tf/pkg
|
||||
fi
|
||||
|
||||
bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
|
||||
--config=rocm_ci \
|
||||
--config=xla_sgpu \
|
||||
--build_tag_filters=$TAG_FILTERS \
|
||||
--test_tag_filters=$TAG_FILTERS \
|
||||
--profile=/tf/pkg/profile.json.gz \
|
||||
--test_timeout=920,2400,7200,9600 \
|
||||
--test_sharding_strategy=disabled \
|
||||
--test_output=errors \
|
||||
|
|
@ -70,8 +63,6 @@ bazel \
|
|||
--test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
|
||||
--test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
|
||||
--action_env=TF_ROCM_AMDGPU_TARGETS=${AMD_GPU_GFX_ID} \
|
||||
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
|
||||
--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
|
||||
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
|
||||
--repo_env="ROCM_PATH=$ROCM_PATH" \
|
||||
--run_under=//build_tools/ci:parallel_gpu_execute \
|
||||
-- //xla/...
|
||||
--run_under=//build_tools/ci:parallel_gpu_execute
|
||||
|
|
|
|||
|
|
@ -18,16 +18,22 @@
|
|||
set -e
|
||||
set -x
|
||||
|
||||
SCRIPT_DIR=$(realpath $(dirname $0))
|
||||
TAG_FILTERS=$($SCRIPT_DIR/rocm_tag_filters.sh),gpu,-multi_gpu,-multi_gpu_h100,requires-gpu-amd,,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial
|
||||
|
||||
if [ ! -d /tf/pkg ]; then
|
||||
mkdir -p /tf/pkg
|
||||
fi
|
||||
|
||||
SCRIPT_DIR=$(dirname $0)
|
||||
bazel --bazelrc="$SCRIPT_DIR/rocm_xla.bazelrc" test \
|
||||
"$@" \
|
||||
--test_tag_filters=gpu,requires-gpu-amd,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-requires-gpu-sm60,-requires-gpu-sm60-only,-requires-gpu-sm70,-requires-gpu-sm70-only,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm86,-requires-gpu-sm86-only,-requires-gpu-sm89,-requires-gpu-sm89-only,-requires-gpu-sm90,-requires-gpu-sm90-only \
|
||||
--build_tag_filters=gpu,requires-gpu-amd,-requires-gpu-nvidia,-requires-gpu-intel,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-requires-gpu-sm60,-requires-gpu-sm60-only,-requires-gpu-sm70,-requires-gpu-sm70-only,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm86,-requires-gpu-sm86-only,-requires-gpu-sm89,-requires-gpu-sm89-only,-requires-gpu-sm90,-requires-gpu-sm90-only \
|
||||
--build_tag_filters=$TAG_FILTERS \
|
||||
--test_tag_filters=$TAG_FILTERS \
|
||||
--profile=/tf/pkg/profile.json.gz \
|
||||
--keep_going \
|
||||
--test_env=TF_TESTS_PER_GPU=1 \
|
||||
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
|
||||
--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
|
||||
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
|
||||
--test_output=errors \
|
||||
--local_test_jobs=2 \
|
||||
--run_under=//build_tools/rocm:parallel_gpu_execute
|
||||
|
|
|
|||
|
|
@ -53,31 +53,23 @@ echo ""
|
|||
echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
|
||||
echo ""
|
||||
|
||||
# First positional argument (if any) specifies the ROCM_INSTALL_DIR
|
||||
if [[ -n $1 ]]; then
|
||||
ROCM_INSTALL_DIR=$1
|
||||
else
|
||||
if [[ -z "${ROCM_PATH}" ]]; then
|
||||
ROCM_INSTALL_DIR=/opt/rocm/
|
||||
else
|
||||
ROCM_INSTALL_DIR=$ROCM_PATH
|
||||
fi
|
||||
fi
|
||||
|
||||
export PYTHON_BIN_PATH=`which python3`
|
||||
export TF_NEED_ROCM=1
|
||||
export ROCM_PATH=$ROCM_INSTALL_DIR
|
||||
TAGS_FILTER="-requires-gpu-nvidia,-oss_excluded,-oss_serial"
|
||||
UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
|
||||
TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
|
||||
export ROCM_PATH=/opt/rocm/
|
||||
|
||||
bazel \
|
||||
test \
|
||||
--define xnn_enable_avxvnniint8=false \
|
||||
--define xnn_enable_avx512fp16=false \
|
||||
--config=rocm_gcc \
|
||||
--build_tag_filters=${TAGS_FILTER} \
|
||||
--test_tag_filters=${TAGS_FILTER} \
|
||||
SCRIPT_DIR=$(realpath $(dirname $0))
|
||||
TAG_FILTERS="$($SCRIPT_DIR/rocm_tag_filters.sh)"
|
||||
|
||||
if [ ! -d /tf/pkg ]; then
|
||||
mkdir -p /tf/pkg
|
||||
fi
|
||||
|
||||
bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
|
||||
--config=rocm_ci \
|
||||
--config=xla_mgpu \
|
||||
--build_tag_filters=${TAG_FILTERS} \
|
||||
--test_tag_filters=${TAG_FILTERS} \
|
||||
--profile=/tf/pkg/profile.json.gz \
|
||||
--test_timeout=920,2400,7200,9600 \
|
||||
--test_sharding_strategy=disabled \
|
||||
--test_output=errors \
|
||||
|
|
@ -90,12 +82,4 @@ bazel \
|
|||
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
|
||||
--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
|
||||
--action_env=NCCL_MAX_NCHANNELS=1 \
|
||||
--repo_env="ROCM_PATH=$ROCM_PATH" \
|
||||
-- //xla/tests:collective_ops_e2e_test \
|
||||
//xla/tests:collective_ops_test \
|
||||
//xla/tests:collective_pipeline_parallelism_test \
|
||||
//xla/tests:replicated_io_feed_test \
|
||||
//xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
|
||||
//xla/pjrt/distributed:topology_util_test \
|
||||
//xla/pjrt/distributed:client_server_test \
|
||||
//xla/backends/gpu/runtime:all_reduce_test
|
||||
--repo_env="ROCM_PATH=$ROCM_PATH"
|
||||
|
|
|
|||
|
|
@ -166,6 +166,7 @@ cuda_library(
|
|||
testonly = 1,
|
||||
srcs = ["cuda_test.cu.cc"],
|
||||
hdrs = ["cuda_test.h"],
|
||||
tags = ["cuda-only"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"@com_google_googletest//:gtest_for_library",
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user