Internal change

PiperOrigin-RevId: 429500662
Change-Id: I0ebaa1c1321fa4a077a48443fbe6d7cdfa98533b
Christian Sigg 2022-02-18 00:43:05 -08:00 committed by TensorFlower Gardener
parent 5d5d822d0e
commit 4248acac6d
14 changed files with 184 additions and 174 deletions


@ -718,22 +718,21 @@ build:ubsan --linkopt -lubsan
build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/common,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
build:tfrt --deleted_packages=
# Experimental configuration for testing XLA GPU lowering to TFRT.
#
# To run using BEF thunks, combine with
# --//tensorflow/compiler/xla/service/gpu:enable_bef_thunk. For example,
# bazel test --config=experimental_tfrt_gpu \
# --//tensorflow/compiler/xla/service/gpu:enable_bef_thunk \
# //tensorflow/compiler/xla/service/gpu/tests:mlir_gemm_test
#
# To run using BEF executable, combine with
# --//tensorflow/compiler/xla/service/gpu:enable_bef_executable. For example,
# bazel test --config=experimental_tfrt_gpu \
# --//tensorflow/compiler/xla/service/gpu:enable_bef_executable \
# //tensorflow/compiler/xla/service/gpu/tests:mnist
build:experimental_tfrt_gpu --config=tfrt
build:experimental_tfrt_gpu --@tf_runtime//:enable_gpu
build:experimental_tfrt_gpu --@rules_cuda//cuda:cuda_runtime=//tensorflow/compiler/xla/service/gpu:cuda_runtime_for_xlir
build:experimental_tfrt_gpu --nocheck_visibility
build:experimental_tfrt_gpu --incompatible_strict_action_env
build:experimental_tfrt_gpu --config=monolithic
# Experimental configuration for building XLA GPU lowering to TFRT.
build:experimental_enable_xlir --config=tfrt
build:experimental_enable_xlir --@tf_runtime//:enable_gpu
build:experimental_enable_xlir --@rules_cuda//cuda:cuda_runtime=//tensorflow/compiler/xla/service/gpu:cuda_runtime_for_xlir
build:experimental_enable_xlir --nocheck_visibility
build:experimental_enable_xlir --incompatible_strict_action_env
build:experimental_enable_xlir --config=monolithic
# bazel test --config=experimental_enable_bef_thunk \
# //tensorflow/compiler/xla/service/gpu:bef_thunk_tests
build:experimental_enable_bef_thunk --config=experimental_enable_xlir
test:experimental_enable_bef_thunk --test_env=XLA_FLAGS=--xla_gpu_bef_thunk
# bazel test --config=experimental_enable_bef_executable \
# //tensorflow/compiler/xla/service/gpu:bef_executable_tests
build:experimental_enable_bef_executable --config=experimental_enable_xlir
test:experimental_enable_bef_executable --test_env=XLA_FLAGS=--xla_gpu_bef_executable


@ -79,6 +79,10 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
opts.set_xla_multiheap_size_constraint_per_heap(-1);
opts.set_xla_detailed_logging_and_dumping(true);
opts.set_xla_gpu_bef_executable(false);
opts.set_xla_gpu_bef_thunk(false);
return opts;
}
@ -683,6 +687,16 @@ static void AllocateFlags() {
flag_values->xla_dump_hlo_pipeline_re(),
"If specified, dumps HLO before and after optimization passes in the "
"pass pipelines that match this regular expression."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_bef_executable",
bool_setter_for(&DebugOptions::set_xla_gpu_bef_executable),
flag_values->xla_gpu_bef_executable(),
"Whether to enable XLIR to compile gpu programs to TFRT BEF."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_bef_thunk",
bool_setter_for(&DebugOptions::set_xla_gpu_bef_thunk),
flag_values->xla_gpu_bef_thunk(),
"Whether to enable XLIR to compile thunks to TFRT BEF."));
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
} // NOLINT(readability/fn_size)
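Both new options can be supplied through the XLA_FLAGS environment variable (which is how the .bazelrc test configs above drive them), or set programmatically on a module's config. A minimal sketch, assuming the usual DebugOptions/HloModuleConfig accessors; the helper name is illustrative only, and the flag only has an effect in builds with XLA_ENABLE_XLIR defined (e.g. --config=experimental_enable_xlir):

#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

// Illustrative helper: opt one module's compilation into BEF thunks.
void EnableBefThunks(xla::HloModuleConfig& config) {
  // Start from the flag/environment defaults, then flip the new option.
  xla::DebugOptions opts = xla::GetDebugOptionsFromFlags();
  opts.set_xla_gpu_bef_thunk(true);
  config.set_debug_options(opts);
}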


@ -60,8 +60,7 @@ package_group(
check_deps(
name = "tfrt_deps_check",
disallowed_deps = select({
":is_bef_thunk_enabled": [],
":is_bef_executable_enabled": [],
":is_xlir_enabled": [],
"//conditions:default": [
":xlir_kernels",
":xlir_opdefs",
@ -308,10 +307,6 @@ cc_library(
"kernel_mapping_scheme.h",
],
copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]),
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
"//conditions:default": [],
}),
deps = [
":backend_configs_cc",
":buffer_allocations",
@ -564,23 +559,13 @@ tf_cuda_library(
)
bool_flag(
name = "enable_bef_thunk",
build_setting_default = False,
name = "enable_xlir",
build_setting_default = if_google(True, False),
)
config_setting(
name = "is_bef_thunk_enabled",
flag_values = {":enable_bef_thunk": "True"},
)
bool_flag(
name = "enable_bef_executable",
build_setting_default = False,
)
config_setting(
name = "is_bef_executable_enabled",
flag_values = {":enable_bef_executable": "True"},
name = "is_xlir_enabled",
flag_values = {":enable_xlir": "True"},
)
gentbl_cc_library(
@ -629,10 +614,6 @@ cc_library(
cc_library(
name = "xlir_kernels",
srcs = ["xlir_kernels.cc"],
local_defines = select({
":is_bef_thunk_enabled": ["BEF_THUNKS=1"],
"//conditions:default": [],
}),
tags = ["manual"],
visibility = ["//visibility:private"],
deps = [
@ -697,8 +678,7 @@ cc_library(
"triangular_solve_thunk.h",
]),
local_defines = select({
":is_bef_thunk_enabled": ["BEF_THUNKS=1"],
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
":is_xlir_enabled": ["XLA_ENABLE_XLIR=1"],
"//conditions:default": [],
}),
deps = [
@ -784,47 +764,31 @@ cc_library(
"//tensorflow/core/platform/default/build_config:stream_executor_rocm",
"@local_config_rocm//rocm:rocm_headers",
]) + select({
":is_bef_thunk_enabled": [
":is_xlir_enabled": [
":xlir_kernels",
":xlir_opdefs",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:GPUTransforms",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:TransformUtils",
"//tensorflow/compiler/mlir:name_utils",
"//tensorflow/compiler/mlir/hlo:lhlo",
"//tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu:lmhlo_to_tfrt_gpu",
"//tensorflow/compiler/mlir/xla:attribute_exporter",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"@tf_runtime//:basic_kernels_alwayslink",
"@tf_runtime//:basic_kernels_opdefs",
"@tf_runtime//:befexecutor",
"@tf_runtime//:bef",
"@tf_runtime//:befexecutor",
"@tf_runtime//:beftomlir",
"@tf_runtime//:core_runtime",
"@tf_runtime//:hostcontext",
"@tf_runtime//:mlirtobef_translate",
"@tf_runtime//:support",
"@tf_runtime//backends/gpu:gpu_executor",
"@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
"@tf_runtime//backends/gpu:gpu_opdefs",
"@tf_runtime//backends/gpu:gpu_passes",
"@tf_runtime//backends/gpu:gpu_types",
],
":is_bef_executable_enabled": [
":xlir_kernels",
":xlir_opdefs",
"@llvm-project//llvm:Support",
"//tensorflow/compiler/mlir:name_utils",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"@tf_runtime//:basic_kernels_alwayslink",
"@tf_runtime//:befexecutor",
"@tf_runtime//:beftomlir",
"@tf_runtime//:bef",
"@tf_runtime//:core_runtime",
"@tf_runtime//:hostcontext",
"@tf_runtime//:support",
"@tf_runtime//:tensor_alwayslink",
"@tf_runtime//backends/gpu:gpu_executor",
"@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
"@tf_runtime//backends/gpu:gpu_opdefs",
"@tf_runtime//backends/gpu:gpu_passes",
"@tf_runtime//backends/gpu:gpu_types",
],
@ -1549,7 +1513,7 @@ cc_library(
"gpu_compiler.h",
],
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
":is_xlir_enabled": ["XLA_ENABLE_XLIR=1"],
"//conditions:default": [],
}),
deps = [
@ -1687,7 +1651,7 @@ cc_library(
"//tensorflow/core/profiler/lib:traceme",
"//tensorflow/stream_executor:stream_executor_headers",
] + select({
":is_bef_executable_enabled": [
":is_xlir_enabled": [
"//tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu:pass_utils",
"@tf_runtime//:mlirtobef_translate",
"@tf_runtime//:support",
@ -1729,10 +1693,6 @@ cc_library(
hdrs = if_cuda_is_configured([
"nvptx_compiler.h",
]),
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
"//conditions:default": [],
}),
deps = if_cuda_is_configured([
":cublas_pad_for_gemms",
":cudnn_fused_conv_rewriter",
@ -1741,6 +1701,7 @@ cc_library(
":cusolver_rewriter",
":gemm_algorithm_picker",
":gpu_asm_opts_util",
":gpu_executable",
":gpu_compiler",
":gpu_conv_padding_legalization",
":gpu_conv_rewriter",


@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"
#if BEF_THUNKS
#if XLA_ENABLE_XLIR
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/SourceMgr.h"
#include "mlir/Dialect/GPU/Passes.h" // from @llvm-project
@ -63,9 +63,6 @@ limitations under the License.
namespace xla {
namespace gpu {
bool IsBefThunkEnabled() { return true; }
namespace {
struct MlirAndTfrtHostCtx {
@ -504,28 +501,32 @@ Status BefThunk::ExecuteOnStream(const ExecuteParams& params) {
} // namespace gpu
} // namespace xla
#else // BEF_THUNKS
#else // XLA_ENABLE_XLIR
namespace xla {
bool gpu::IsBefThunkEnabled() { return false; }
static Status GetXlirDisabledError() {
return tensorflow::errors::FailedPrecondition(
"Built without XLA_ENABLE_XLIR");
}
StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefThunk(
Thunk::ThunkInfo, mlir::Operation*, std::vector<BufferAllocation::Slice>) {
return tensorflow::errors::FailedPrecondition("BefThunks are disabled.");
return GetXlirDisabledError();
}
StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefCollectiveThunk(
Thunk::ThunkInfo, mlir::Operation*, std::vector<BufferAllocation::Slice>,
int64_t, int64_t) {
return tensorflow::errors::FailedPrecondition("BefThunks are disabled.");
return GetXlirDisabledError();
}
StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefKernelThunk(
Thunk::ThunkInfo, absl::Span<const BufferAllocation* const>,
const std::string&, const LaunchDimensions&) {
return tensorflow::errors::FailedPrecondition(
"BefKernelThunks are disabled.");
return GetXlirDisabledError();
}
} // namespace xla
#endif // BEF_THUNKS
#endif // XLA_ENABLE_XLIR


@ -30,9 +30,6 @@ class ExecutionContext;
namespace xla {
namespace gpu {
// Return whether --//...:enable_bef_thunk was specified on the command line.
bool IsBefThunkEnabled();
// Creates a Thunk that uses TFRT BEF execution to perform the work of various
// Thunk types. A BefThunk is not restricted to a particular op function, unlike
// GemmThunk, ConvolutionThunk, etc. Rather, a BefThunk is to stand in place of


@ -19,6 +19,7 @@ limitations under the License.
#include <atomic>
#include <functional>
#include <iterator>
#include <string>
#include <utility>
@ -172,11 +173,11 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
#include "tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pass_utils.h"
#include "tfrt/bef/bef_buffer.h" // from @tf_runtime
#include "tfrt/bef_converter/mlir_to_bef_translate.h" // from @tf_runtime
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
namespace xla {
namespace gpu {
@ -802,7 +803,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> GpuCompiler::AssignBuffers(
return std::move(assignment);
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
static StatusOr<OwnedBefBuffer> LowerToBef(
mlir::ModuleOp mlir_module, absl::string_view entry_function_name,
llvm::ArrayRef<int64_t> buffer_sizes, HloModule* hlo_module) {
@ -842,7 +843,7 @@ static StatusOr<OwnedBefBuffer> LowerToBef(
std::copy(bef.begin(), bef.end(), ptr);
return OwnedBefBuffer(ptr, {bef.size()});
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
using OutputInfoMap =
absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
@ -968,10 +969,11 @@ static Status CompileModuleToLlvmIrImpl(
TF_RETURN_IF_ERROR(ir_emitter->EmitLmhloRegion(&entry_function.body()));
// TODO(b/218527186): Implement this feature for BEF as well.
// TODO(b/218907125): Implement this feature for ROCm as well.
bool supports_runtime_managed_constants =
!(IsBefThunkEnabled() || platform_id == se::rocm::kROCmPlatformId);
// TODO(b/218527186): Implement this feature for BEF as well.
!IsBefEnabled(hlo_module->config()) &&
// TODO(b/218907125): Implement this feature for ROCm as well.
platform_id != se::rocm::kROCmPlatformId;
if (supports_runtime_managed_constants) {
// Remove these globals from the generated code to indicate that XLA is
// responsible for allocating and initializing them.
@ -987,19 +989,21 @@ static Status CompileModuleToLlvmIrImpl(
RecordHloToLlvmDuration(end_usecs - start_usecs);
}
#if BEF_EXECUTABLE
std::vector<int64_t> buffer_sizes;
llvm::transform(
results->allocations, std::back_inserter(buffer_sizes),
[](const BufferAllocation& allocation) { return allocation.size(); });
TF_ASSIGN_OR_RETURN(results->thunks_or_bef,
LowerToBef(*mlir_module, entry_function.getName().str(),
buffer_sizes, hlo_module));
#else // BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (IsBefExecutableEnabled(hlo_module->config())) {
std::vector<int64_t> buffer_sizes;
llvm::transform(
results->allocations, std::back_inserter(buffer_sizes),
[](const BufferAllocation& allocation) { return allocation.size(); });
TF_ASSIGN_OR_RETURN(results->thunks_or_bef,
LowerToBef(*mlir_module, entry_function.getName().str(),
buffer_sizes, hlo_module));
return Status::OK();
}
#endif // XLA_ENABLE_XLIR
results->thunks_or_bef =
absl::make_unique<ThunkSchedule>(ir_emitter->ConsumeThunkSequence());
#endif // BEF_EXECUTABLE
return Status::OK();
}
@ -1354,7 +1358,7 @@ GpuDeviceInfo GetGpuDeviceInfo(se::StreamExecutor* stream_exec) {
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
const AotCompilationOptions& options) {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
CHECK(options.PlatformId() == se::cuda::kCudaPlatformId);
CHECK(options.executor() != nullptr);
auto stream_exec = options.executor();
@ -1429,10 +1433,9 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
}
return std::move(results);
#else // BEF_EXECUTABLE
return FailedPrecondition(
"GpuCompiler::CompileAheadOfTime only supported with BEF_EXECUTABLE");
#endif // BEF_EXECUTABLE
#else // XLA_ENABLE_XLIR
return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif // XLA_ENABLE_XLIR
}
HloCostAnalysis::ShapeSizeFunction GpuCompiler::ShapeSizeBytesFunction() const {
@ -1556,11 +1559,11 @@ StatusOr<std::unique_ptr<Executable>> CompileLmhloToExecutable(
module_config, ir_emitter_context));
TF_RETURN_IF_ERROR(ir_emitter->EmitLmhloRegion(&entry_function.body()));
// TODO(b/218527186): Implement this feature for BEF as well.
// TODO(b/218907125): Implement this feature for ROCm as well.
bool supports_runtime_managed_constants =
!(IsBefThunkEnabled() ||
compiler->PlatformId() == se::rocm::kROCmPlatformId);
// TODO(b/218527186): Implement this feature for BEF as well.
!IsBefEnabled(module_config) &&
// TODO(b/218907125): Implement this feature for ROCm as well.
compiler->PlatformId() != se::rocm::kROCmPlatformId;
if (supports_runtime_managed_constants) {
// Remove these globals from the generated code to indicate that XLA is
// responsible for allocating and initializing them.


@ -50,7 +50,7 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/platform.h"
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
#include "llvm/Support/SourceMgr.h"
#include "mlir/IR/Builders.h" // from @llvm-project
#include "mlir/IR/Diagnostics.h" // from @llvm-project
@ -71,10 +71,29 @@ limitations under the License.
#include "tfrt/host_context/function.h" // from @tf_runtime
#include "tfrt/host_context/host_allocator.h" // from @tf_runtime
#include "tfrt/host_context/host_context.h" // from @tf_runtime
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
namespace xla {
namespace gpu {
bool IsBefExecutableEnabled(const HloModuleConfig& config) {
#if XLA_ENABLE_XLIR
return config.debug_options().xla_gpu_bef_executable();
#else // XLA_ENABLE_XLIR
(void)config;
return false;
#endif // XLA_ENABLE_XLIR
}
bool IsBefThunkEnabled(const HloModuleConfig& config) {
#if XLA_ENABLE_XLIR
return config.debug_options().xla_gpu_bef_thunk();
#else // XLA_ENABLE_XLIR
(void)config;
return false;
#endif // XLA_ENABLE_XLIR
}
namespace {
using ::tensorflow::profiler::ScopedAnnotation;
@ -101,14 +120,14 @@ static std::string ModuleUniqueName(absl::string_view module_name,
} // namespace
void GpuExecutable::BefBufferDeleter::operator()(uint8_t* ptr) const {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
tfrt::AlignedFree(ptr);
#else
LOG(FATAL) << "OwnedBefBuffer only supported with BEF_EXECUTABLE";
LOG(FATAL) << "OwnedBefBuffer only supported with XLA_ENABLE_XLIR";
#endif
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
struct GpuExecutable::BefExecutable {
private:
explicit BefExecutable(OwnedBefBuffer buffer)
@ -168,7 +187,7 @@ struct GpuExecutable::BefExecutable {
absl::Mutex mutex;
tfrt::gpu::GpuContextCache gpu_ctx_cache TF_GUARDED_BY(mutex);
};
#endif
#endif // XLA_ENABLE_XLIR
StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
auto thunks_or_bef = std::move(params.thunks_or_bef);
@ -179,14 +198,14 @@ StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
return result;
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (absl::holds_alternative<OwnedBefBuffer>(thunks_or_bef)) {
auto& bef_buffer = absl::get<OwnedBefBuffer>(thunks_or_bef);
TF_ASSIGN_OR_RETURN(result->bef_executable_,
BefExecutable::Create(std::move(bef_buffer)));
return result;
}
#endif
#endif // XLA_ENABLE_XLIR
return InternalError("No thunk or bef provided");
}
@ -250,7 +269,7 @@ GpuExecutable::~GpuExecutable() {
}
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
delete bef_executable_;
#endif
}
@ -579,7 +598,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
return out.ConsumeResult();
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
// TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc.
static tfrt::RCReference<tfrt::AsyncValue> CreateGpuBuffer(
stream_executor::DeviceMemoryBase* data) {
@ -693,7 +712,7 @@ static Status ExecuteBef(const std::string& module_name,
run_options, start_micros,
block_host_until_done ? run_options->stream() : nullptr);
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
const ServiceExecutableRunOptions* run_options,
@ -871,13 +890,13 @@ Status GpuExecutable::ExecuteThunksOrBef(
buffer_allocations, block_host_until_done);
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (bef_executable_) {
return ExecuteBef(module_name_, bef_executable_, run_options,
buffer_allocations, allocations_.size(),
block_host_until_done);
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
return FailedPrecondition("Expected thunk or bef is not supplied.");
}
@ -977,7 +996,7 @@ Status GpuExecutable::SetUpMlirAllocation(
return Status::OK();
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
static void ApplyEntryFunctionAttributes(
mlir::MLIRContext& context, mlir::FuncOp& func,
xla::EntryFunctionAttributes entry_func_attrs, int buffer_param_offset) {
@ -996,10 +1015,11 @@ static void ApplyEntryFunctionAttributes(
builder.getIndexAttr(buffer.lmhlo_params()));
}
if (buffer.has_lmhlo_param_shape_index()) {
arg_attr_list.set("lmhlo.param_shape_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_param_shape_index().indices().begin(),
buffer.lmhlo_param_shape_index().indices().end())));
arg_attr_list.set(
"lmhlo.param_shape_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_param_shape_index().indices().data(),
buffer.lmhlo_param_shape_index().indices().size())));
}
if (!buffer.lmhlo_constant_name().empty()) {
arg_attr_list.set("lmhlo.constant_name",
@ -1011,8 +1031,8 @@ static void ApplyEntryFunctionAttributes(
if (buffer.has_lmhlo_output_index()) {
arg_attr_list.set("lmhlo.output_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_output_index().indices().begin(),
buffer.lmhlo_output_index().indices().end())));
buffer.lmhlo_output_index().indices().data(),
buffer.lmhlo_output_index().indices().size())));
}
args_attrs.push_back(arg_attr_list.getDictionary(&context));
}
@ -1020,12 +1040,12 @@ static void ApplyEntryFunctionAttributes(
func->setAttr("result_xla_shape",
builder.getStringAttr(entry_func_attrs.result_xla_shape()));
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
std::shared_ptr<HloModule> hlo_module, absl::string_view bef,
xla::EntryFunctionAttributes entry_func_attrs, GpuVersion gpu_version) {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
OwnedBefBuffer bef_buffer = [bef]() {
auto ptr = static_cast<uint8_t*>(
tfrt::AlignedAlloc(tfrt::GetRequiredBefAlignment(), bef.size()));
@ -1064,9 +1084,9 @@ StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
module_name, result_xla_shape, std::move(allocations),
std::move(output_info), bef_executable));
return executable;
#else // BEF_EXECUTABLE
return FailedPrecondition("LoadFromBef only supported with BEF_EXECUTABLE");
#endif // BEF_EXECUTABLE
#else // XLA_ENABLE_XLIR
return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif // XLA_ENABLE_XLIR
}
StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>


@ -44,6 +44,16 @@ limitations under the License.
namespace xla {
namespace gpu {
// Returns whether GpuExecutable runs on TFRT (instead of thunks).
bool IsBefExecutableEnabled(const HloModuleConfig& config);
// Returns whether to create BefThunks (if the specific thunk is supported).
bool IsBefThunkEnabled(const HloModuleConfig& config);
inline bool IsBefEnabled(const HloModuleConfig& config) {
return IsBefExecutableEnabled(config) || IsBefThunkEnabled(config);
}
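Callers are expected to branch on these helpers rather than on the XLA_ENABLE_XLIR macro directly. A minimal sketch of the intended pattern, following the CreateBefThunk signature shown in the bef_thunk.cc stubs above; MakeNativeThunk is a hypothetical stand-in for the existing per-op thunk constructors:

// Sketch only: pick a BEF thunk when the per-thunk flag is on.
StatusOr<std::unique_ptr<Thunk>> MakeThunkForOp(
    const HloModuleConfig& config, Thunk::ThunkInfo info, mlir::Operation* op,
    std::vector<BufferAllocation::Slice> slices) {
  if (IsBefThunkEnabled(config)) {
    // Lower the op to TFRT BEF and wrap it in a BefThunk.
    return CreateBefThunk(info, op, std::move(slices));
  }
  // Fall back to the hand-written native thunk.
  return MakeNativeThunk(info, op, std::move(slices));  // hypothetical
}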
// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
@ -276,7 +286,7 @@ class GpuExecutable : public Executable {
// potentially shared with other executables.
std::vector<std::shared_ptr<se::DeviceMemoryBase>> shared_constants_;
// Data for BEF_EXECUTABLE mode only, owned.
// Data for BEF executable mode only, owned.
BefExecutable* bef_executable_ = nullptr;
GpuExecutable(const GpuExecutable&) = delete;


@ -78,6 +78,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@ -996,7 +997,7 @@ Status IrEmitterUnnested::EmitConvolutionThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto conv_result_slice, GetAllocationSlice(conv_result));
TF_ASSIGN_OR_RETURN(auto scratch_slice, GetAllocationSlice(scratch_result));
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
operand_slices.push_back(conv_result_slice);
operand_slices.push_back(scratch_slice);
TF_ASSIGN_OR_RETURN(
@ -1178,7 +1179,7 @@ Status IrEmitterUnnested::EmitGemmThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto thunk, [&]() -> StatusOr<std::unique_ptr<Thunk>> {
if (auto gemm = mlir::dyn_cast<mlir::lmhlo_gpu::GEMMOp>(op)) {
if (IsBefThunkEnabled()) return make_bef_thunk(gemm);
if (IsBefThunkEnabled(hlo_module_config_)) return make_bef_thunk(gemm);
return make_gemm_thunk(gemm);
}
@ -1187,7 +1188,8 @@ Status IrEmitterUnnested::EmitGemmThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto bias, GetAllocationSlice(gemm.bias()));
TF_ASSIGN_OR_RETURN(auto output, GetAllocationSlice(gemm.output()));
if (IsBefThunkEnabled()) return make_bef_thunk(gemm, bias);
if (IsBefThunkEnabled(hlo_module_config_))
return make_bef_thunk(gemm, bias);
// The bias is passed inside the output buffer. If those buffers are
// shared we can just use it, otherwise copy the bias values into the
@ -1321,7 +1323,7 @@ Status IrEmitterUnnested::EmitCholeskyThunk(mlir::Operation* op) {
GetAllocationSlice(cholesky_op.scratch()));
TF_ASSIGN_OR_RETURN(auto info_buffer, GetAllocationSlice(cholesky_op.info()));
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
std::vector<BufferAllocation::Slice> buffers = {
operand_buffer, a_buffer, workspace_buffer, info_buffer};
TF_ASSIGN_OR_RETURN(
@ -1419,7 +1421,7 @@ Status IrEmitterUnnested::EmitCustomCallThunk(mlir::Operation* op) {
}
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
auto values_to_non_optional_slices = [&](mlir::ValueRange values)
-> StatusOr<std::vector<BufferAllocation::Slice>> {
std::vector<BufferAllocation::Slice> slices;
@ -2866,7 +2868,7 @@ Status IrEmitterUnnested::EmitReplicaOrPartitionId(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(BufferAllocation::Slice result_slice,
GetAllocationSlice(casted.getOperand()));
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
TF_ASSIGN_OR_RETURN(thunk,
CreateBefThunk(GetThunkInfo(op), op, {result_slice}));
} else {
@ -2898,7 +2900,7 @@ Status IrEmitterUnnested::EmitCollectivePermute(mlir::Operation* op) {
/*mem_size=*/ShapeUtil::ByteSizeOf(shape)));
} else {
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
std::vector<BufferAllocation::Slice> buffers = {source_slice,
result_slice};
TF_ASSIGN_OR_RETURN(thunk, CreateBefCollectiveThunk(
@ -2967,10 +2969,11 @@ Status IrEmitterUnnested::EmitNcclThunk(mlir::Operation* untyped_op) {
if (should_use_nccl_thunk) {
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled() && (mlir::isa<mlir::lmhlo::AllGatherOp>(op) ||
mlir::isa<mlir::lmhlo::AllReduceOp>(op) ||
mlir::isa<mlir::lmhlo::ReduceScatterOp>(op) ||
mlir::isa<mlir::lmhlo::AllToAllOp>(op))) {
if (IsBefThunkEnabled(hlo_module_config_) &&
(mlir::isa<mlir::lmhlo::AllGatherOp>(op) ||
mlir::isa<mlir::lmhlo::AllReduceOp>(op) ||
mlir::isa<mlir::lmhlo::ReduceScatterOp>(op) ||
mlir::isa<mlir::lmhlo::AllToAllOp>(op))) {
std::vector<BufferAllocation::Slice> arg_buffers;
arg_buffers.reserve(buffers.size() * 2);
for (const auto& buffer : buffers) {
@ -3201,7 +3204,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildKernelThunkImpl(
std::string(kernel->getName()),
ir_emitter_context_->llvm_module());
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
return CreateBefKernelThunk(thunk_info, non_constant_buffers,
std::string(kernel->getName()),
launch_dimensions);
@ -5511,15 +5514,14 @@ Status IrEmitterUnnested::EmitOp(mlir::Operation* op) {
}
if (mlir::isa<mlir::lmhlo::TriangularSolveOp>(op)) {
#if BEF_EXECUTABLE
// BEF-mode GpuExecutable allocates temp memory, and so the custom-call
// implementation for TriangularSolve is not needed.
return Status::OK();
#else
if (IsBefEnabled(hlo_module_config_)) {
// XLIR allocates temp memory, and so the custom-call implementation for
// TriangularSolve is not needed.
return Status::OK();
}
return InternalError(
"TriangularSolve is implemented as a custom-call; we do not expect to "
"lower a true HLO TriangularSolve op.");
#endif
}
if (mlir::isa<mlir::lmhlo::FusionOp>(op)) {


@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
@ -149,12 +150,12 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
// Find the fastest algorithm for GEMMs.
post_pipeline.AddPass<GemmAlgorithmPicker>(stream_exec, device_allocator);
// BEF-mode GpuExecutable allocates temp memory, and so the custom-call
// implementation for TriangularSolve is not needed.
#if !BEF_EXECUTABLE
// Transform TriangularSolve ops into custom-calls, so we can add temp memory.
post_pipeline.AddPass<TriangularSolveRewriter>();
#endif
if (!IsBefEnabled(hlo_module->config())) {
// Transform TriangularSolve ops into custom-calls, so we can add temp
// memory. XLIR allocates temp memory, and so the custom-call implementation
// for TriangularSolve is not needed.
post_pipeline.AddPass<TriangularSolveRewriter>();
}
TF_RETURN_IF_ERROR(post_pipeline.Run(hlo_module).status());


@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/tests/test_macros.h"
// This test suite should be run with
// --//third_party/tensorflow/compiler/xla/service/gpu:enable_bef_executable=true
// '--test_env=XLA_FLAGS=--xla_gpu_bef_executable'
// to test the BEF-mode GpuExecutable.
namespace xla {


@ -18,11 +18,11 @@ The source code here is from
https://www.tensorflow.org/xla/tutorials/jit_compile, where there is also a
walkthrough.
To execute in TFRT BEF, run with `--config=cuda
--//third_party/tensorflow/compiler/xla/service/gpu:enable_bef_executable=true`.
To execute in TFRT BEF, run with
`--config=cuda --test_env=XLA_FLAGS=--xla_gpu_bef_executable`
To dump debug output (e.g., LMHLO MLIR, TFRT MLIR, TFRT BEF), run with
`XLA_FLAGS="--xla_dump_to=/tmp/mnist"`.
`--test_env=XLA_FLAGS="--xla_dump_to=/tmp/mnist"`.
"""
from absl import app


@ -38,8 +38,6 @@
namespace xla {
namespace gpu {
#if BEF_THUNKS
static llvm::Expected<tfrt::gpu::GpuModule> ModuleLoad(
tfrt::Argument<tfrt::gpu::GpuContext> context,
const tfrt::ExecutionContext& exec_ctx) {
@ -83,8 +81,6 @@ static llvm::Expected<tfrt::gpu::GpuModule> ModuleLoad(
return tfrt::gpu::GpuModule(context.ValueRef(), std::move(*module));
}
#endif // BEF_THUNKS
static llvm::Expected<DeviceAssignment::LogicalID> GetLogicalId(
const tfrt::ExecutionContext& exec_ctx) {
auto* xla_gpu_params =
@ -367,9 +363,8 @@ static llvm::Error CustomCall(
static void RegisterXlirKernels(tfrt::KernelRegistry* kernel_reg) {
kernel_reg->AddKernel("xlir.custom_call",
TFRT_KERNEL_WITH_CHAIN_RESULT(CustomCall));
#if BEF_THUNKS
// This kernel is only used for bef thunks, not bef executables.
kernel_reg->AddKernel("xlir.module.load", TFRT_KERNEL(ModuleLoad));
#endif // BEF_THUNKS
kernel_reg->AddKernel("xlir.replica_id",
TFRT_KERNEL_WITH_CHAIN_RESULT(ReplicaId));
kernel_reg->AddKernel("xlir.partition_id",


@ -359,7 +359,14 @@ message DebugOptions {
// logging a warning and proceeding with fallback.
bool xla_gpu_strict_conv_algorithm_picker = 156;
// Next id: 161
// If true, enable XLIR to compile gpu programs to TFRT BEF.
bool xla_gpu_bef_executable = 161;
// If true, enable XLIR to compile thunks to TFRT BEF.
// This flag has no effect when xla_gpu_bef_executable is true.
bool xla_gpu_bef_thunk = 162;
// Next id: 163
// Extra options to pass to the compilation backend (e.g. LLVM); specific
// interpretation of these values is left to the backend.
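Combined, the two options select one of three execution modes; a rough sketch of the precedence, assuming the generated DebugOptions accessors (the enum and helper below are illustrative only, and both flags require a build with XLA_ENABLE_XLIR to have any effect):

// Illustrative only: how the compiler interprets the two options.
enum class GpuExecutionMode { kThunks, kBefThunks, kBefExecutable };

GpuExecutionMode SelectMode(const xla::DebugOptions& opts) {
  // Whole-program BEF wins; xla_gpu_bef_thunk is ignored in that case.
  if (opts.xla_gpu_bef_executable()) return GpuExecutionMode::kBefExecutable;
  // Otherwise, supported thunks are emitted as BefThunks.
  if (opts.xla_gpu_bef_thunk()) return GpuExecutionMode::kBefThunks;
  // Default: classic thunk-based GpuExecutable.
  return GpuExecutionMode::kThunks;
}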