Integrate hermetic ML toolchains for TensorFlow.

Hermetic C++ and CUDA toolchains are now enabled by default on the Linux x86_64 platform. The list of covered operating systems will be extended over the coming months. Developers can still use non-hermetic toolchains via the --config=clang_local flag.
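
For example, a sketch of the two modes (the wheel target here is illustrative, not prescribed by this change):

# Default on Linux x86_64: hermetic Clang/CUDA toolchains.
bazel build //tensorflow/tools/pip_package:wheel
# Opt out and build with a locally installed, non-hermetic Clang.
bazel build --config=clang_local //tensorflow/tools/pip_package:wheel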

std::reduce is replaced with a traditional for loop. This is necessary because GCC 8 offers only partial support for C++17, and using std::reduce in that environment leads to an "undefined method" compilation error.
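
A minimal standalone sketch of the replacement (FlatSize is a hypothetical helper name for illustration; the actual edit is in the runtime_shape_test diffs below):

#include <cstdint>
#include <vector>

// Computes the same product as
//   std::reduce(src.begin(), src.end(), 1, std::multiplies<int>{})
// but without std::reduce, which GCC 8's <numeric> does not provide.
int32_t FlatSize(const std::vector<int32_t>& src) {
  int32_t flat_size = 1;
  for (int32_t dim : src) flat_size *= dim;
  return flat_size;
}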
PiperOrigin-RevId: 775771057
A. Unique TensorFlower 2025-06-25 11:30:11 -07:00 committed by TensorFlower Gardener
parent ce60a770af
commit ac56b0e840
8 changed files with 66 additions and 30 deletions


@@ -159,9 +159,13 @@ common --incompatible_enforce_config_setting_visibility
# TODO: Enable Bzlmod
common --noenable_bzlmod
build --incompatible_enable_cc_toolchain_resolution
build --repo_env USE_HERMETIC_CC_TOOLCHAIN=1
# TODO: Migrate for https://github.com/bazelbuild/bazel/issues/7260
common --noincompatible_enable_cc_toolchain_resolution
common --noincompatible_enable_android_toolchain_resolution
build:clang_local --noincompatible_enable_cc_toolchain_resolution
build:clang_local --noincompatible_enable_android_toolchain_resolution
build:clang_local --repo_env USE_HERMETIC_CC_TOOLCHAIN=0
# Print a stacktrace when a test is killed
test --test_env="GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1"
@@ -172,6 +176,7 @@ test --test_env="GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1"
# Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
# target CPU to build transient dependencies correctly. See
# https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
build:android --config=clang_local
build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:android_arm --config=android
@@ -197,6 +202,8 @@ build:android --dynamic_mode=off
# TODO(belitskiy): Remove once on Clang 20.
build:android --define=xnn_enable_avxvnniint8=false
build:macos --config=clang_local
# Sets the default Apple platform to macOS.
build:macos --apple_platform_type=macos
@@ -220,6 +227,7 @@ build:apple-toolchain --crosstool_top=@local_config_apple_cc//:toolchain
build:apple-toolchain --host_crosstool_top=@local_config_apple_cc//:toolchain
# Settings for MacOS on ARM CPUs.
build:macos_arm64 --config=clang_local
build:macos_arm64 --cpu=darwin_arm64
build:macos_arm64 --macos_minimum_os=11.0
build:macos_arm64 --platforms=@build_bazel_apple_support//configs/platforms:darwin_arm64
@@ -229,6 +237,7 @@ build:ios --apple_platform_type=ios
build:ios --copt=-fembed-bitcode
build:ios --copt=-Wno-c++11-narrowing
build:ios --config=apple-toolchain
build:ios --config=clang_local
build:ios_armv7 --config=ios
build:ios_armv7 --cpu=ios_armv7
build:ios_armv7 --platforms=@org_tensorflow//tensorflow/tools/toolchains/ios:ios_armv7
@@ -354,6 +363,7 @@ build:tpu --define=framework_shared_object=true
build:tpu --copt=-DLIBTPU_ON_GCE
build:tpu --define=enable_mlir_bridge=true
build:rocm --config=clang_local
build:rocm --copt=-Wno-gnu-offsetof-extensions
build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
build:rocm --define=using_rocm_hipcc=true
@@ -374,6 +384,7 @@ build:rocm_ci_hermetic --repo_env="OS=ubuntu_22.04"
build:rocm_ci_hermetic --repo_env="ROCM_VERSION=6.2.0"
build:rocm_ci_hermetic --@local_config_rocm//rocm:use_rocm_hermetic_rpath=True
build:sycl --config=clang_local
build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl --define=using_sycl=true
build:sycl --define=tensorflow_mkldnn_contraction_kernel=0
@@ -418,6 +429,7 @@ build:linux --copt="-Werror=switch"
# Linux ARM64 specific options
build:linux_arm64 --copt="-mtune=generic" --copt="-march=armv8-a" --copt="-O3"
build:windows --config=clang_local
# On Windows, `__cplusplus` is wrongly defined without this switch
# See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
@@ -536,6 +548,7 @@ test:win_clang_base --host_linkopt=/FORCE:MULTIPLE
test:win_clang_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true --test_summary=short
build:win_clang --config=win_clang_base
build:win_clang --config=clang_local
build:win_clang --extra_toolchains=@local_config_cc//:cc-toolchain-x64_windows-clang-cl
build:win_clang --extra_execution_platforms=//tensorflow/tools/toolchains/win:x64_windows-clang-cl
build:win_clang --host_platform=//tensorflow/tools/toolchains/win:x64_windows-clang-cl
@@ -599,11 +612,6 @@ build:rbe_linux --host_linkopt=-lm
build:rbe_linux_cpu --config=rbe_linux
# Linux cpu and cuda builds share the same toolchain now.
build:rbe_linux_cpu --host_crosstool_top="@local_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --crosstool_top="@local_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --extra_toolchains="@local_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cpu --repo_env=CC="/usr/lib/llvm-18/bin/clang"
build:rbe_linux_cpu --repo_env=TF_SYSROOT="/dt9"
build:rbe_linux_cpu --extra_execution_platforms="@ml_build_config_platform//:platform"
build:rbe_linux_cpu --host_platform="@ml_build_config_platform//:platform"
build:rbe_linux_cpu --platforms="@ml_build_config_platform//:platform"
@@ -625,6 +633,13 @@ common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instance
# build:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1
# build:rbe_linux_cpu --config=cuda_version
build:rbe_linux_cpu_clang_local --config=clang_local
build:rbe_linux_cpu_clang_local --host_crosstool_top="@local_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu_clang_local --crosstool_top="@local_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu_clang_local --extra_toolchains="@local_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cpu_clang_local --repo_env=CC="/usr/lib/llvm-18/bin/clang"
build:rbe_linux_cpu_clang_local --repo_env=TF_SYSROOT="/dt9"
# TODO(kanglan): Remove it after toolchain update is complete.
build:rbe_linux_cpu_old --config=rbe_linux
build:rbe_linux_cpu_old --host_crosstool_top="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain"
@@ -648,6 +663,7 @@ build:rbe_linux_cuda_nvcc --config=cuda_nvcc
build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1
build:rbe_win_base --config=rbe_base
build:rbe_win_base --config=clang_local
build:rbe_win_base --shell_executable=C:\\tools\\msys64\\usr\\bin\\bash.exe
build:rbe_win_base --remote_instance_name=projects/tensorflow-testing/instances/windows
# Don't build the python zip archive in the RBE build.
@@ -663,6 +679,7 @@ build:rbe_windows_x86_cpu_2022 --config=rbe_win_base --config=windows_x86_cpu_20
# END TF REMOTE BUILD EXECUTION OPTIONS
# TFLite build configs for generic embedded Linux
build:elinux --config=clang_local
build:elinux --crosstool_top=@local_config_embedded_arm//:toolchain
build:elinux --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:elinux_aarch64 --config=elinux
@@ -709,21 +726,20 @@ build:release_linux_base --linkopt=-Wl,--undefined-version
# Container environment settings below this point.
# Set Clang as compiler. Use the actual path to clang installed in container.
build:release_linux_base --repo_env=CC="/usr/lib/llvm-18/bin/clang"
build:release_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/clang"
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short
# Use the Clang toolchain to compile
build:release_cpu_linux --config=release_linux_base
build:release_cpu_linux --crosstool_top="@local_config_cuda//crosstool:toolchain"
build:release_cpu_linux --repo_env=TF_SYSROOT="/dt9"
# Target the AVX instruction set
build:release_cpu_linux --config=avx_linux
# Deprecated release_cpu_linux config with non-hermetic toolchains.
build:release_cpu_linux_clang_local --crosstool_top="@local_config_cuda//crosstool:toolchain"
build:release_cpu_linux_clang_local --repo_env=TF_SYSROOT="/dt9"
build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
@@ -733,6 +749,9 @@ test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --ru
build:release_arm64_linux --config=release_linux_base
build:release_arm64_linux --config=linux_arm64
build:release_arm64_linux --config=clang_local
build:release_arm64_linux --repo_env=CC="/usr/lib/llvm-18/bin/clang"
build:release_arm64_linux --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/clang"
build:release_arm64_linux --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain"
build:release_arm64_linux --config=mkl_aarch64_threadpool
build:release_arm64_linux --copt=-flax-vector-conversions
@@ -741,6 +760,7 @@ test:release_arm64_linux --flaky_test_attempts=3
build:release_cpu_macos --config=avx_linux
# Base build configs for macOS
build:release_macos_base --config=clang_local
build:release_macos_base --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer
build:release_macos_base --define=no_nccl_support=true --output_filter=^$
@@ -906,6 +926,7 @@ test:windows_x86_cpu_2022_pycpp_test --config=windows_x86_cpu_2022_pycpp_test_op
# flags seem to be actually used to specify the execution platform details. It
# seems it is this way because these flags are old and predate the distinction
# between host and execution platform.
build:cross_compile_base --config=clang_local
build:cross_compile_base --host_cpu=k8
build:cross_compile_base --host_crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
build:cross_compile_base --extra_execution_platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_x86_64


@@ -86,7 +86,7 @@ load(
python_wheel_version_suffix_repository(name = "tf_wheel_version_suffix")
load(
"@local_xla//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
"@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
"cuda_json_init_repository",
)
@@ -98,7 +98,7 @@ load(
"CUDNN_REDISTRIBUTIONS",
)
load(
"@local_xla//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
"@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
"cuda_redist_init_repositories",
"cudnn_redist_init_repository",
)
@@ -112,28 +112,28 @@ cudnn_redist_init_repository(
)
load(
"@local_xla//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
"@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
"cuda_configure",
)
cuda_configure(name = "local_config_cuda")
load(
"@local_xla//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
"@rules_ml_toolchain//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
"nccl_redist_init_repository",
)
nccl_redist_init_repository()
load(
"@local_xla//third_party/nccl/hermetic:nccl_configure.bzl",
"@rules_ml_toolchain//third_party/nccl/hermetic:nccl_configure.bzl",
"nccl_configure",
)
nccl_configure(name = "local_config_nccl")
load(
"@local_xla//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl",
"@rules_ml_toolchain//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl",
"nvshmem_json_init_repository",
)
@@ -144,7 +144,7 @@ load(
"NVSHMEM_REDISTRIBUTIONS",
)
load(
"@local_xla//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl",
"@rules_ml_toolchain//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl",
"nvshmem_redist_init_repository",
)
@@ -153,8 +153,19 @@ nvshmem_redist_init_repository(
)
load(
"@local_xla//third_party/nvshmem/hermetic:nvshmem_configure.bzl",
"@rules_ml_toolchain//third_party/nvshmem/hermetic:nvshmem_configure.bzl",
"nvshmem_configure",
)
nvshmem_configure(name = "local_config_nvshmem")
load(
"@rules_ml_toolchain//cc_toolchain/deps:cc_toolchain_deps.bzl",
"cc_toolchain_deps",
)
cc_toolchain_deps()
register_toolchains("@rules_ml_toolchain//cc_toolchain:lx64_lx64")
register_toolchains("@rules_ml_toolchain//cc_toolchain:lx64_lx64_cuda")


@@ -23,7 +23,7 @@ TFCI_INDEX_HTML_ENABLE=1
TFCI_LIB_SUFFIX="-cpu-linux-x86_64"
TFCI_OUTPUT_DIR=build_output
TFCI_WHL_AUDIT_ENABLE=1
TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64
TFCI_WHL_AUDIT_PLAT=manylinux_2_27_x86_64
TFCI_WHL_BAZEL_TEST_ENABLE=1
TFCI_WHL_SIZE_LIMIT=260M
TFCI_WHL_SIZE_LIMIT_ENABLE=1


@@ -239,8 +239,10 @@ TEST(RuntimeShapeTest, TestExtendedShapeSmallToBig) {
TEST_P(RuntimeShapeTest, TestFlatSize) {
const std::vector<int32_t> src = IotaVector(kSmallSize);
const RuntimeShape shape(src.size(), src.data());
EXPECT_EQ(shape.FlatSize(),
std::reduce(src.begin(), src.end(), 1, std::multiplies<int>{}));
int32_t flat_size = 1;
for (std::vector<int32_t>::const_iterator it = src.begin(); it != src.end(); ++it)
flat_size *= *it;
EXPECT_EQ(shape.FlatSize(), flat_size);
}
INSTANTIATE_TEST_SUITE_P(BigSmall, RuntimeShapeTest,


@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_testutil.h"
#include <cmath>
#include <iomanip>
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/types.h"


@@ -239,8 +239,10 @@ TEST(RuntimeShapeTest, TestExtendedShapeSmallToBig) {
TEST_P(RuntimeShapeTest, TestFlatSize) {
const std::vector<int32_t> src = IotaVector(kSmallSize);
const RuntimeShape shape(src.size(), src.data());
EXPECT_EQ(shape.FlatSize(),
std::reduce(src.begin(), src.end(), 1, std::multiplies<int>{}));
int32_t flat_size = 1;
for (std::vector<int32_t>::const_iterator it = src.begin(); it != src.end(); ++it)
flat_size *= *it;
EXPECT_EQ(shape.FlatSize(), flat_size);
}
INSTANTIATE_TEST_SUITE_P(BigSmall, RuntimeShapeTest,


@@ -426,7 +426,7 @@ verify_manylinux_compliance_test(
"manual",
],
wheel = ":wheel",
x86_64_compliance_tag = "manylinux_2_17_x86_64",
x86_64_compliance_tag = "manylinux_2_27_x86_64",
)
py_import(


@@ -140,10 +140,10 @@ def workspace():
# Details: https://github.com/google-ml-infra/rules_ml_toolchain
http_archive(
name = "rules_ml_toolchain",
sha256 = "c85a3ae3da6af08dcc5065387e8d9b033913407c8fa5b074881fce516b482f69",
strip_prefix = "rules_ml_toolchain-f1e2b169441df00c8b1e9b08371d9ec8e0517ce6",
sha256 = "2bb5d2f7a94ceffb2b7bac881e6c13b830871bf808c2ee1dba7ec9a0d60bf660",
strip_prefix = "rules_ml_toolchain-0586ff3ca7c60f7963e5aa46cd390cf052c4f8b1",
urls = [
"https://github.com/google-ml-infra/rules_ml_toolchain/archive/f1e2b169441df00c8b1e9b08371d9ec8e0517ce6.tar.gz",
"https://github.com/google-ml-infra/rules_ml_toolchain/archive/0586ff3ca7c60f7963e5aa46cd390cf052c4f8b1.tar.gz",
],
)