Add Kokoro continuous job for testing XLA Linux GPU with NVCC.

PiperOrigin-RevId: 577849947
A. Unique TensorFlower authored on 2023-10-30 08:19:08 -07:00; committed by TensorFlower Gardener
parent 88c88b89b4
commit 88e5914db5
4 changed files with 102 additions and 1 deletion

View File

@@ -55,6 +55,7 @@
#
# rbe_linux_cpu: RBE options to build with only CPU support.
# rbe_linux_cuda: RBE options to build with GPU support using clang.
# rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc.
#
# rbe_win_py39: Windows Python 3.9 RBE config
#
@@ -525,6 +526,35 @@ build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_c
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:rbe_linux_cuda_nvcc --config=cuda
build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1
build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true
build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true
build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true
build:rbe_linux_cuda_nvcc --config=tensorrt
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80"
build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12"
build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8"
build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain"
build:rbe_linux_cuda_nvcc --config=rbe_linux
build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain"
build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9"
build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3"
# These you may need to change for your own GCP project.
common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance
build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda"
build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt"
build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl"
test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
build:rbe_win --config=rbe_base
build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_05022023:toolchain"
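Note: the block above defines rbe_linux_cuda_nvcc as a regular Bazel config group, so it can also be selected manually with --config. A minimal sketch, assuming access to the tensorflow-testing RBE instance; the target pattern //xla/... is illustrative only:

  # Hypothetical manual invocation of the new nvcc config group.
  bazel test --config=rbe_linux_cuda_nvcc //xla/...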

View File

@@ -55,6 +55,7 @@
#
# rbe_linux_cpu: RBE options to build with only CPU support.
# rbe_linux_cuda: RBE options to build with GPU support using clang.
# rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc.
#
# rbe_win_py39: Windows Python 3.9 RBE config
#
@@ -525,6 +526,35 @@ build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_c
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:rbe_linux_cuda_nvcc --config=cuda
build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1
build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true
build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true
build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true
build:rbe_linux_cuda_nvcc --config=tensorrt
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80"
build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12"
build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8"
build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain"
build:rbe_linux_cuda_nvcc --config=rbe_linux
build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain"
build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9"
build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3"
# These you may need to change for your own GCP project.
common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance
build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda"
build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt"
build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl"
test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
build:rbe_win --config=rbe_base
build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_05022023:toolchain"

View File

@@ -26,6 +26,10 @@ function is_linux_gpu_job() {
  [[ "$KOKORO_JOB_NAME" =~ tensorflow/xla/linux/.*gpu.* ]]
}
function is_use_nvcc() {
  [[ -z "${USE_NVCC:-}" ]] || [[ "$USE_NVCC" == "true" ]]
}
# Pull the container (in case it was updated since the instance started) and
# store its SHA in the Sponge log.
docker pull "$DOCKER_IMAGE"
@@ -44,16 +48,23 @@ RC_FILE="/usertools/cpu.bazelrc"
TARGET_FILTER=""
TAGS_FILTER="-no_oss,-oss_excluded,-oss_serial"
ADDITIONAL_FLAGS=""
RBE_CONFIG=""
if is_linux_gpu_job ; then
  TAGS_FILTER="$TAGS_FILTER,gpu,requires-gpu-nvidia,-no_gpu"
  ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute"
  RC_FILE="/usertools/gpu.bazelrc"
  if is_use_nvcc ; then
    RBE_CONFIG="rbe_linux_cuda_nvcc"
  else
    RBE_CONFIG="rbe_linux_cuda"
  fi
  echo "***NOTE: nvidia-smi lists the highest CUDA version the driver supports, which may be different than the version of CUDA actually used!!***"
  nvidia-smi
else
  TAGS_FILTER="$TAGS_FILTER,-gpu,-requires-gpu-nvidia"
  ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --config=nonccl"
  RBE_CONFIG="rbe_linux_cpu"
fi
# Build & test XLA
@@ -65,7 +76,7 @@ docker exec xla bazel --bazelrc=$RC_FILE \
--features=layering_check \
--profile=/tf/pkg/profile.json.gz \
--flaky_test_attempts=3 \
--config=rbe \
--config=$RBE_CONFIG \
--jobs=150 \
--nobuild_tests_only \
$ADDITIONAL_FLAGS \
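Note: the nvcc path is gated on the USE_NVCC environment variable: is_use_nvcc succeeds when USE_NVCC is unset, empty, or "true", so GPU jobs default to the nvcc config and fall back to the clang config only when USE_NVCC is set to any other value. A minimal sketch of exercising both paths, assuming the script above is invoked as build.sh (a placeholder name; the real entry point is defined by the Kokoro job):

  # Hypothetical invocations; the continuous job exports these variables itself.
  USE_NVCC=true  ./build.sh   # GPU job runs bazel with --config=rbe_linux_cuda_nvcc
  USE_NVCC=false ./build.sh   # GPU job runs bazel with --config=rbe_linux_cuda (clang)
  ./build.sh                  # USE_NVCC unset: the nvcc config is still chosen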

View File

@@ -55,6 +55,7 @@
#
# rbe_linux_cpu: RBE options to build with only CPU support.
# rbe_linux_cuda: RBE options to build with GPU support using clang.
# rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc.
#
# rbe_win_py39: Windows Python 3.9 RBE config
#
@@ -525,6 +526,35 @@ build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_c
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:rbe_linux_cuda_nvcc --config=cuda
build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1
build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true
build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true
build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true
build:rbe_linux_cuda_nvcc --config=tensorrt
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80"
build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12"
build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8"
build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain"
build:rbe_linux_cuda_nvcc --config=rbe_linux
build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain"
build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform"
build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9"
build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3"
# These you may need to change for your own GCP project.
common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance
build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda"
build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt"
build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl"
test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
build:rbe_win --config=rbe_base
build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_05022023:toolchain"