Internal change

PiperOrigin-RevId: 429500662
Change-Id: I0ebaa1c1321fa4a077a48443fbe6d7cdfa98533b

parent: 5d5d822d0e
commit: 4248acac6d

.bazelrc | 37
@@ -718,22 +718,21 @@ build:ubsan --linkopt -lubsan
build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/common,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
build:tfrt --deleted_packages=

# Experimental configuration for testing XLA GPU lowering to TFRT.
#
# To run using BEF thunks, combine with
# --//tensorflow/compiler/xla/service/gpu:enable_bef_thunk. For example,
# bazel test --config=experimental_tfrt_gpu \
# --//tensorflow/compiler/xla/service/gpu:enable_bef_thunk \
# //tensorflow/compiler/xla/service/gpu/tests:mlir_gemm_test
#
# To run using BEF executable, combine with
# --//tensorflow/compiler/xla/service/gpu:enable_bef_executable. For example,
# bazel test --config=experimental_tfrt_gpu \
# --//tensorflow/compiler/xla/service/gpu:enable_bef_executable \
# //tensorflow/compiler/xla/service/gpu/tests:mnist
build:experimental_tfrt_gpu --config=tfrt
build:experimental_tfrt_gpu --@tf_runtime//:enable_gpu
build:experimental_tfrt_gpu --@rules_cuda//cuda:cuda_runtime=//tensorflow/compiler/xla/service/gpu:cuda_runtime_for_xlir
build:experimental_tfrt_gpu --nocheck_visibility
build:experimental_tfrt_gpu --incompatible_strict_action_env
build:experimental_tfrt_gpu --config=monolithic
# Experimental configuration for building XLA GPU lowering to TFRT.
build:experimental_enable_xlir --config=tfrt
build:experimental_enable_xlir --@tf_runtime//:enable_gpu
build:experimental_enable_xlir --@rules_cuda//cuda:cuda_runtime=//tensorflow/compiler/xla/service/gpu:cuda_runtime_for_xlir
build:experimental_enable_xlir --nocheck_visibility
build:experimental_enable_xlir --incompatible_strict_action_env
build:experimental_enable_xlir --config=monolithic

# bazel test --config=experimental_enable_bef_thunk \
# //tensorflow/compiler/xla/service/gpu:bef_thunk_tests
build:experimental_enable_bef_thunk --config=experimental_enable_xlir
test:experimental_enable_bef_thunk --test_env=XLA_FLAGS=--xla_gpu_bef_thunk

# bazel test --config=experimental_enable_bef_executable \
# //tensorflow/compiler/xla/service/gpu:bef_executable_tests
build:experimental_enable_bef_executable --config=experimental_enable_xlir
test:experimental_enable_bef_executable --test_env=XLA_FLAGS=--xla_gpu_bef_executable
@@ -79,6 +79,10 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
opts.set_xla_multiheap_size_constraint_per_heap(-1);
opts.set_xla_detailed_logging_and_dumping(true);

opts.set_xla_gpu_bef_executable(false);
opts.set_xla_gpu_bef_thunk(false);

return opts;
}

@@ -683,6 +687,16 @@ static void AllocateFlags() {
flag_values->xla_dump_hlo_pipeline_re(),
"If specified, dumps HLO before and after optimization passes in the "
"pass pipelines that match this regular expression."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_bef_executable",
bool_setter_for(&DebugOptions::set_xla_gpu_bef_executable),
flag_values->xla_gpu_bef_executable(),
"Whether to enable XLIR to compile gpu programs to TFRT BEF."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_bef_thunk",
bool_setter_for(&DebugOptions::set_xla_gpu_bef_thunk),
flag_values->xla_gpu_bef_thunk(),
"Whether to enable XLIR to compile thunks to TFRT BEF."));

ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
} // NOLINT(readability/fn_size)
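For orientation, here is a minimal sketch of how the two flags registered above could be set programmatically instead of through the XLA_FLAGS environment variable. The helper name and the surrounding setup are illustrative assumptions, not part of this change; only set_xla_gpu_bef_thunk / set_xla_gpu_bef_executable come from the diff.

#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

// Hypothetical helper: builds an HloModuleConfig with BEF thunks enabled,
// mirroring what --test_env=XLA_FLAGS=--xla_gpu_bef_thunk does for tests.
xla::HloModuleConfig ConfigWithBefThunks() {
  xla::DebugOptions opts = xla::GetDebugOptionsFromFlags();
  opts.set_xla_gpu_bef_thunk(true);
  // opts.set_xla_gpu_bef_executable(true);  // whole-program BEF instead.
  xla::HloModuleConfig config;
  config.set_debug_options(opts);
  return config;
}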
@@ -60,8 +60,7 @@ package_group(
check_deps(
name = "tfrt_deps_check",
disallowed_deps = select({
":is_bef_thunk_enabled": [],
":is_bef_executable_enabled": [],
":is_xlir_enabled": [],
"//conditions:default": [
":xlir_kernels",
":xlir_opdefs",

@@ -308,10 +307,6 @@ cc_library(
"kernel_mapping_scheme.h",
],
copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]),
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
"//conditions:default": [],
}),
deps = [
":backend_configs_cc",
":buffer_allocations",

@@ -564,23 +559,13 @@ tf_cuda_library(
)

bool_flag(
name = "enable_bef_thunk",
build_setting_default = False,
name = "enable_xlir",
build_setting_default = if_google(True, False),
)

config_setting(
name = "is_bef_thunk_enabled",
flag_values = {":enable_bef_thunk": "True"},
)

bool_flag(
name = "enable_bef_executable",
build_setting_default = False,
)

config_setting(
name = "is_bef_executable_enabled",
flag_values = {":enable_bef_executable": "True"},
name = "is_xlir_enabled",
flag_values = {":enable_xlir": "True"},
)

gentbl_cc_library(

@@ -629,10 +614,6 @@ cc_library(
cc_library(
name = "xlir_kernels",
srcs = ["xlir_kernels.cc"],
local_defines = select({
":is_bef_thunk_enabled": ["BEF_THUNKS=1"],
"//conditions:default": [],
}),
tags = ["manual"],
visibility = ["//visibility:private"],
deps = [

@@ -697,8 +678,7 @@ cc_library(
"triangular_solve_thunk.h",
]),
local_defines = select({
":is_bef_thunk_enabled": ["BEF_THUNKS=1"],
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
":is_xlir_enabled": ["XLA_ENABLE_XLIR=1"],
"//conditions:default": [],
}),
deps = [
@@ -784,47 +764,31 @@ cc_library(
"//tensorflow/core/platform/default/build_config:stream_executor_rocm",
"@local_config_rocm//rocm:rocm_headers",
]) + select({
":is_bef_thunk_enabled": [
":is_xlir_enabled": [
":xlir_kernels",
":xlir_opdefs",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:GPUTransforms",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:TransformUtils",
"//tensorflow/compiler/mlir:name_utils",
"//tensorflow/compiler/mlir/hlo:lhlo",
"//tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu:lmhlo_to_tfrt_gpu",
"//tensorflow/compiler/mlir/xla:attribute_exporter",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"@tf_runtime//:basic_kernels_alwayslink",
"@tf_runtime//:basic_kernels_opdefs",
"@tf_runtime//:befexecutor",
"@tf_runtime//:bef",
"@tf_runtime//:befexecutor",
"@tf_runtime//:beftomlir",
"@tf_runtime//:core_runtime",
"@tf_runtime//:hostcontext",
"@tf_runtime//:mlirtobef_translate",
"@tf_runtime//:support",
"@tf_runtime//backends/gpu:gpu_executor",
"@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
"@tf_runtime//backends/gpu:gpu_opdefs",
"@tf_runtime//backends/gpu:gpu_passes",
"@tf_runtime//backends/gpu:gpu_types",
],
":is_bef_executable_enabled": [
":xlir_kernels",
":xlir_opdefs",
"@llvm-project//llvm:Support",
"//tensorflow/compiler/mlir:name_utils",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"@tf_runtime//:basic_kernels_alwayslink",
"@tf_runtime//:befexecutor",
"@tf_runtime//:beftomlir",
"@tf_runtime//:bef",
"@tf_runtime//:core_runtime",
"@tf_runtime//:hostcontext",
"@tf_runtime//:support",
"@tf_runtime//:tensor_alwayslink",
"@tf_runtime//backends/gpu:gpu_executor",
"@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
"@tf_runtime//backends/gpu:gpu_opdefs",
"@tf_runtime//backends/gpu:gpu_passes",
"@tf_runtime//backends/gpu:gpu_types",
],

@@ -1549,7 +1513,7 @@ cc_library(
"gpu_compiler.h",
],
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
":is_xlir_enabled": ["XLA_ENABLE_XLIR=1"],
"//conditions:default": [],
}),
deps = [

@@ -1687,7 +1651,7 @@ cc_library(
"//tensorflow/core/profiler/lib:traceme",
"//tensorflow/stream_executor:stream_executor_headers",
] + select({
":is_bef_executable_enabled": [
":is_xlir_enabled": [
"//tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu:pass_utils",
"@tf_runtime//:mlirtobef_translate",
"@tf_runtime//:support",

@@ -1729,10 +1693,6 @@ cc_library(
hdrs = if_cuda_is_configured([
"nvptx_compiler.h",
]),
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
"//conditions:default": [],
}),
deps = if_cuda_is_configured([
":cublas_pad_for_gemms",
":cudnn_fused_conv_rewriter",

@@ -1741,6 +1701,7 @@ cc_library(
":cusolver_rewriter",
":gemm_algorithm_picker",
":gpu_asm_opts_util",
":gpu_executable",
":gpu_compiler",
":gpu_conv_padding_legalization",
":gpu_conv_rewriter",
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"

#if BEF_THUNKS
#if XLA_ENABLE_XLIR
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/SourceMgr.h"
#include "mlir/Dialect/GPU/Passes.h" // from @llvm-project

@@ -63,9 +63,6 @@ limitations under the License.
namespace xla {
namespace gpu {

bool IsBefThunkEnabled() { return true; }

namespace {

struct MlirAndTfrtHostCtx {

@@ -504,28 +501,32 @@ Status BefThunk::ExecuteOnStream(const ExecuteParams& params) {
} // namespace gpu
} // namespace xla
#else // BEF_THUNKS
#else // XLA_ENABLE_XLIR

namespace xla {

bool gpu::IsBefThunkEnabled() { return false; }
static Status GetXlirDisabledError() {
return tensorflow::errors::FailedPrecondition(
"Built without XLA_ENABLE_XLIR");
}

StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefThunk(
Thunk::ThunkInfo, mlir::Operation*, std::vector<BufferAllocation::Slice>) {
return tensorflow::errors::FailedPrecondition("BefThunks are disabled.");
return GetXlirDisabledError();
}

StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefCollectiveThunk(
Thunk::ThunkInfo, mlir::Operation*, std::vector<BufferAllocation::Slice>,
int64_t, int64_t) {
return tensorflow::errors::FailedPrecondition("BefThunks are disabled.");
return GetXlirDisabledError();
}

StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefKernelThunk(
Thunk::ThunkInfo, absl::Span<const BufferAllocation* const>,
const std::string&, const LaunchDimensions&) {
return tensorflow::errors::FailedPrecondition(
"BefKernelThunks are disabled.");
return GetXlirDisabledError();
}

} // namespace xla
#endif // BEF_THUNKS
#endif // XLA_ENABLE_XLIR
@@ -30,9 +30,6 @@ class ExecutionContext;
namespace xla {
namespace gpu {

// Return whether --//...:enable_bef_thunk was specified on the command line.
bool IsBefThunkEnabled();

// Creates a Thunk that uses TFRT BEF execution to perform the work of various
// Thunk types. A BefThunk is not restricted to a particular op function, unlike
// GemmThunk, ConvolutionThunk, etc. Rather, a BefThunk is to stand in place of
@@ -19,6 +19,7 @@ limitations under the License.
#include <atomic>
#include <functional>
#include <iterator>
#include <string>
#include <utility>

@@ -172,11 +173,11 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
#include "tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pass_utils.h"
#include "tfrt/bef/bef_buffer.h" // from @tf_runtime
#include "tfrt/bef_converter/mlir_to_bef_translate.h" // from @tf_runtime
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR

namespace xla {
namespace gpu {

@@ -802,7 +803,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> GpuCompiler::AssignBuffers(
return std::move(assignment);
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
static StatusOr<OwnedBefBuffer> LowerToBef(
mlir::ModuleOp mlir_module, absl::string_view entry_function_name,
llvm::ArrayRef<int64_t> buffer_sizes, HloModule* hlo_module) {

@@ -842,7 +843,7 @@ static StatusOr<OwnedBefBuffer> LowerToBef(
std::copy(bef.begin(), bef.end(), ptr);
return OwnedBefBuffer(ptr, {bef.size()});
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR

using OutputInfoMap =
absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;

@@ -968,10 +969,11 @@ static Status CompileModuleToLlvmIrImpl(
TF_RETURN_IF_ERROR(ir_emitter->EmitLmhloRegion(&entry_function.body()));

// TODO(b/218527186): Implement this feature for BEF as well.
// TODO(b/218907125): Implement this feature for ROCm as well.
bool supports_runtime_managed_constants =
!(IsBefThunkEnabled() || platform_id == se::rocm::kROCmPlatformId);
// TODO(b/218527186): Implement this feature for BEF as well.
!IsBefEnabled(hlo_module->config()) &&
// TODO(b/218907125): Implement this feature for ROCm as well.
platform_id != se::rocm::kROCmPlatformId;
if (supports_runtime_managed_constants) {
// Remove these globals from the generated code to indicate that XLA is
// responsible for allocating and initializing them.

@@ -987,19 +989,21 @@ static Status CompileModuleToLlvmIrImpl(
RecordHloToLlvmDuration(end_usecs - start_usecs);
}

#if BEF_EXECUTABLE
std::vector<int64_t> buffer_sizes;
llvm::transform(
results->allocations, std::back_inserter(buffer_sizes),
[](const BufferAllocation& allocation) { return allocation.size(); });
TF_ASSIGN_OR_RETURN(results->thunks_or_bef,
LowerToBef(*mlir_module, entry_function.getName().str(),
buffer_sizes, hlo_module));
#else // BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (IsBefExecutableEnabled(hlo_module->config())) {
std::vector<int64_t> buffer_sizes;
llvm::transform(
results->allocations, std::back_inserter(buffer_sizes),
[](const BufferAllocation& allocation) { return allocation.size(); });
TF_ASSIGN_OR_RETURN(results->thunks_or_bef,
LowerToBef(*mlir_module, entry_function.getName().str(),
buffer_sizes, hlo_module));
return Status::OK();
}
#endif // XLA_ENABLE_XLIR

results->thunks_or_bef =
absl::make_unique<ThunkSchedule>(ir_emitter->ConsumeThunkSequence());
#endif // BEF_EXECUTABLE

return Status::OK();
}
@@ -1354,7 +1358,7 @@ GpuDeviceInfo GetGpuDeviceInfo(se::StreamExecutor* stream_exec) {
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
const AotCompilationOptions& options) {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
CHECK(options.PlatformId() == se::cuda::kCudaPlatformId);
CHECK(options.executor() != nullptr);
auto stream_exec = options.executor();

@@ -1429,10 +1433,9 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
}

return std::move(results);
#else // BEF_EXECUTABLE
return FailedPrecondition(
"GpuCompiler::CompileAheadOfTime only supported with BEF_EXECUTABLE");
#endif // BEF_EXECUTABLE
#else // XLA_ENABLE_XLIR
return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif // XLA_ENABLE_XLIR
}

HloCostAnalysis::ShapeSizeFunction GpuCompiler::ShapeSizeBytesFunction() const {

@@ -1556,11 +1559,11 @@ StatusOr<std::unique_ptr<Executable>> CompileLmhloToExecutable(
module_config, ir_emitter_context));
TF_RETURN_IF_ERROR(ir_emitter->EmitLmhloRegion(&entry_function.body()));

// TODO(b/218527186): Implement this feature for BEF as well.
// TODO(b/218907125): Implement this feature for ROCm as well.
bool supports_runtime_managed_constants =
!(IsBefThunkEnabled() ||
compiler->PlatformId() == se::rocm::kROCmPlatformId);
// TODO(b/218527186): Implement this feature for BEF as well.
!IsBefEnabled(module_config) &&
// TODO(b/218907125): Implement this feature for ROCm as well.
compiler->PlatformId() != se::rocm::kROCmPlatformId;
if (supports_runtime_managed_constants) {
// Remove these globals from the generated code to indicate that XLA is
// responsible for allocating and initializing them.
@@ -50,7 +50,7 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/platform.h"

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
#include "llvm/Support/SourceMgr.h"
#include "mlir/IR/Builders.h" // from @llvm-project
#include "mlir/IR/Diagnostics.h" // from @llvm-project

@@ -71,10 +71,29 @@ limitations under the License.
#include "tfrt/host_context/function.h" // from @tf_runtime
#include "tfrt/host_context/host_allocator.h" // from @tf_runtime
#include "tfrt/host_context/host_context.h" // from @tf_runtime
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR

namespace xla {
namespace gpu {

bool IsBefExecutableEnabled(const HloModuleConfig& config) {
#if XLA_ENABLE_XLIR
return config.debug_options().xla_gpu_bef_executable();
#else // XLA_ENABLE_XLIR
(void)config;
return false;
#endif // XLA_ENABLE_XLIR
}

bool IsBefThunkEnabled(const HloModuleConfig& config) {
#if XLA_ENABLE_XLIR
return config.debug_options().xla_gpu_bef_thunk();
#else // XLA_ENABLE_XLIR
(void)config;
return false;
#endif // XLA_ENABLE_XLIR
}

namespace {

using ::tensorflow::profiler::ScopedAnnotation;
@@ -101,14 +120,14 @@ static std::string ModuleUniqueName(absl::string_view module_name,
} // namespace

void GpuExecutable::BefBufferDeleter::operator()(uint8_t* ptr) const {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
tfrt::AlignedFree(ptr);
#else
LOG(FATAL) << "OwnedBefBuffer only supported with BEF_EXECUTABLE";
LOG(FATAL) << "OwnedBefBuffer only supported with XLA_ENABLE_XLIR";
#endif
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
struct GpuExecutable::BefExecutable {
private:
explicit BefExecutable(OwnedBefBuffer buffer)

@@ -168,7 +187,7 @@ struct GpuExecutable::BefExecutable {
absl::Mutex mutex;
tfrt::gpu::GpuContextCache gpu_ctx_cache TF_GUARDED_BY(mutex);
};
#endif
#endif // XLA_ENABLE_XLIR

StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
auto thunks_or_bef = std::move(params.thunks_or_bef);

@@ -179,14 +198,14 @@ StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
return result;
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (absl::holds_alternative<OwnedBefBuffer>(thunks_or_bef)) {
auto& bef_buffer = absl::get<OwnedBefBuffer>(thunks_or_bef);
TF_ASSIGN_OR_RETURN(result->bef_executable_,
BefExecutable::Create(std::move(bef_buffer)));
return result;
}
#endif
#endif // XLA_ENABLE_XLIR

return InternalError("No thunk or bef provided");
}

@@ -250,7 +269,7 @@ GpuExecutable::~GpuExecutable() {
}
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
delete bef_executable_;
#endif
}

@@ -579,7 +598,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
return out.ConsumeResult();
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
// TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc.
static tfrt::RCReference<tfrt::AsyncValue> CreateGpuBuffer(
stream_executor::DeviceMemoryBase* data) {

@@ -693,7 +712,7 @@ static Status ExecuteBef(const std::string& module_name,
run_options, start_micros,
block_host_until_done ? run_options->stream() : nullptr);
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR

StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
const ServiceExecutableRunOptions* run_options,
@@ -871,13 +890,13 @@ Status GpuExecutable::ExecuteThunksOrBef(
buffer_allocations, block_host_until_done);
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (bef_executable_) {
return ExecuteBef(module_name_, bef_executable_, run_options,
buffer_allocations, allocations_.size(),
block_host_until_done);
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR

return FailedPrecondition("Expected thunk or bef is not supplied.");
}

@@ -977,7 +996,7 @@ Status GpuExecutable::SetUpMlirAllocation(
return Status::OK();
}

#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
static void ApplyEntryFunctionAttributes(
mlir::MLIRContext& context, mlir::FuncOp& func,
xla::EntryFunctionAttributes entry_func_attrs, int buffer_param_offset) {

@@ -996,10 +1015,11 @@ static void ApplyEntryFunctionAttributes(
builder.getIndexAttr(buffer.lmhlo_params()));
}
if (buffer.has_lmhlo_param_shape_index()) {
arg_attr_list.set("lmhlo.param_shape_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_param_shape_index().indices().begin(),
buffer.lmhlo_param_shape_index().indices().end())));
arg_attr_list.set(
"lmhlo.param_shape_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_param_shape_index().indices().data(),
buffer.lmhlo_param_shape_index().indices().size())));
}
if (!buffer.lmhlo_constant_name().empty()) {
arg_attr_list.set("lmhlo.constant_name",

@@ -1011,8 +1031,8 @@ static void ApplyEntryFunctionAttributes(
if (buffer.has_lmhlo_output_index()) {
arg_attr_list.set("lmhlo.output_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_output_index().indices().begin(),
buffer.lmhlo_output_index().indices().end())));
buffer.lmhlo_output_index().indices().data(),
buffer.lmhlo_output_index().indices().size())));
}
args_attrs.push_back(arg_attr_list.getDictionary(&context));
}

@@ -1020,12 +1040,12 @@ static void ApplyEntryFunctionAttributes(
func->setAttr("result_xla_shape",
builder.getStringAttr(entry_func_attrs.result_xla_shape()));
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR

StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
std::shared_ptr<HloModule> hlo_module, absl::string_view bef,
xla::EntryFunctionAttributes entry_func_attrs, GpuVersion gpu_version) {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
OwnedBefBuffer bef_buffer = [bef]() {
auto ptr = static_cast<uint8_t*>(
tfrt::AlignedAlloc(tfrt::GetRequiredBefAlignment(), bef.size()));

@@ -1064,9 +1084,9 @@ StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
module_name, result_xla_shape, std::move(allocations),
std::move(output_info), bef_executable));
return executable;
#else // BEF_EXECUTABLE
return FailedPrecondition("LoadFromBef only supported with BEF_EXECUTABLE");
#endif // BEF_EXECUTABLE
#else // XLA_ENABLE_XLIR
return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif // XLA_ENABLE_XLIR
}

StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
@@ -44,6 +44,16 @@ limitations under the License.
namespace xla {
namespace gpu {

// Returns whether GpuExecutable runs on TFRT (instead of thunks).
bool IsBefExecutableEnabled(const HloModuleConfig& config);

// Returns whether to create BefThunks (if the specific thunk is supported).
bool IsBefThunkEnabled(const HloModuleConfig& config);

inline bool IsBefEnabled(const HloModuleConfig& config) {
return IsBefExecutableEnabled(config) || IsBefThunkEnabled(config);
}

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
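To make the intent of the three predicates above concrete, a hypothetical call site might branch as in the sketch below. The function name and comments are illustrative assumptions; only the predicates themselves come from this hunk.

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

// Hypothetical dispatch sketch, not part of this change.
void DescribeExecutionPath(const xla::HloModuleConfig& config) {
  if (xla::gpu::IsBefExecutableEnabled(config)) {
    // The whole program is lowered to a TFRT BEF executable.
  } else if (xla::gpu::IsBefThunkEnabled(config)) {
    // Individual thunks are replaced by BefThunks where supported.
  } else {
    // Classic thunk-based GpuExecutable.
  }
  // IsBefEnabled(config) is true in either of the first two cases.
}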
@@ -276,7 +286,7 @@ class GpuExecutable : public Executable {
// potentially shared with other executables.
std::vector<std::shared_ptr<se::DeviceMemoryBase>> shared_constants_;

// Data for BEF_EXECUTABLE mode only, owned.
// Data for bef executable mode only, owned.
BefExecutable* bef_executable_ = nullptr;

GpuExecutable(const GpuExecutable&) = delete;
@@ -78,6 +78,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"

@@ -996,7 +997,7 @@ Status IrEmitterUnnested::EmitConvolutionThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto conv_result_slice, GetAllocationSlice(conv_result));
TF_ASSIGN_OR_RETURN(auto scratch_slice, GetAllocationSlice(scratch_result));

if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
operand_slices.push_back(conv_result_slice);
operand_slices.push_back(scratch_slice);
TF_ASSIGN_OR_RETURN(

@@ -1178,7 +1179,7 @@ Status IrEmitterUnnested::EmitGemmThunk(mlir::Operation* op) {

TF_ASSIGN_OR_RETURN(auto thunk, [&]() -> StatusOr<std::unique_ptr<Thunk>> {
if (auto gemm = mlir::dyn_cast<mlir::lmhlo_gpu::GEMMOp>(op)) {
if (IsBefThunkEnabled()) return make_bef_thunk(gemm);
if (IsBefThunkEnabled(hlo_module_config_)) return make_bef_thunk(gemm);
return make_gemm_thunk(gemm);
}

@@ -1187,7 +1188,8 @@ Status IrEmitterUnnested::EmitGemmThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto bias, GetAllocationSlice(gemm.bias()));
TF_ASSIGN_OR_RETURN(auto output, GetAllocationSlice(gemm.output()));

if (IsBefThunkEnabled()) return make_bef_thunk(gemm, bias);
if (IsBefThunkEnabled(hlo_module_config_))
return make_bef_thunk(gemm, bias);

// The bias is passed inside the output buffer. If those buffers are
// shared we can just use it, otherwise copy the bias values into the

@@ -1321,7 +1323,7 @@ Status IrEmitterUnnested::EmitCholeskyThunk(mlir::Operation* op) {
GetAllocationSlice(cholesky_op.scratch()));
TF_ASSIGN_OR_RETURN(auto info_buffer, GetAllocationSlice(cholesky_op.info()));

if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
std::vector<BufferAllocation::Slice> buffers = {
operand_buffer, a_buffer, workspace_buffer, info_buffer};
TF_ASSIGN_OR_RETURN(

@@ -1419,7 +1421,7 @@ Status IrEmitterUnnested::EmitCustomCallThunk(mlir::Operation* op) {
}

std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
auto values_to_non_optional_slices = [&](mlir::ValueRange values)
-> StatusOr<std::vector<BufferAllocation::Slice>> {
std::vector<BufferAllocation::Slice> slices;

@@ -2866,7 +2868,7 @@ Status IrEmitterUnnested::EmitReplicaOrPartitionId(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(BufferAllocation::Slice result_slice,
GetAllocationSlice(casted.getOperand()));
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
TF_ASSIGN_OR_RETURN(thunk,
CreateBefThunk(GetThunkInfo(op), op, {result_slice}));
} else {

@@ -2898,7 +2900,7 @@ Status IrEmitterUnnested::EmitCollectivePermute(mlir::Operation* op) {
/*mem_size=*/ShapeUtil::ByteSizeOf(shape)));
} else {
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
std::vector<BufferAllocation::Slice> buffers = {source_slice,
result_slice};
TF_ASSIGN_OR_RETURN(thunk, CreateBefCollectiveThunk(
@@ -2967,10 +2969,11 @@ Status IrEmitterUnnested::EmitNcclThunk(mlir::Operation* untyped_op) {

if (should_use_nccl_thunk) {
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled() && (mlir::isa<mlir::lmhlo::AllGatherOp>(op) ||
mlir::isa<mlir::lmhlo::AllReduceOp>(op) ||
mlir::isa<mlir::lmhlo::ReduceScatterOp>(op) ||
mlir::isa<mlir::lmhlo::AllToAllOp>(op))) {
if (IsBefThunkEnabled(hlo_module_config_) &&
(mlir::isa<mlir::lmhlo::AllGatherOp>(op) ||
mlir::isa<mlir::lmhlo::AllReduceOp>(op) ||
mlir::isa<mlir::lmhlo::ReduceScatterOp>(op) ||
mlir::isa<mlir::lmhlo::AllToAllOp>(op))) {
std::vector<BufferAllocation::Slice> arg_buffers;
arg_buffers.reserve(buffers.size() * 2);
for (const auto& buffer : buffers) {

@@ -3201,7 +3204,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildKernelThunkImpl(
std::string(kernel->getName()),
ir_emitter_context_->llvm_module());

if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
return CreateBefKernelThunk(thunk_info, non_constant_buffers,
std::string(kernel->getName()),
launch_dimensions);

@@ -5511,15 +5514,14 @@ Status IrEmitterUnnested::EmitOp(mlir::Operation* op) {
}

if (mlir::isa<mlir::lmhlo::TriangularSolveOp>(op)) {
#if BEF_EXECUTABLE
// BEF-mode GpuExecutable allocates temp memory, and so the custom-call
// implementation for TriangularSolve is not needed.
return Status::OK();
#else
if (IsBefEnabled(hlo_module_config_)) {
// XLIR allocates temp memory, and so the custom-call implementation for
// TriangularSolve is not needed.
return Status::OK();
}
return InternalError(
"TriangularSolve is implemented as a custom-call; we do not expect to "
"lower a true HLO TriangularSolve op.");
#endif
}

if (mlir::isa<mlir::lmhlo::FusionOp>(op)) {
@@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"

@@ -149,12 +150,12 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
// Find the fastest algorithm for GEMMs.
post_pipeline.AddPass<GemmAlgorithmPicker>(stream_exec, device_allocator);

// BEF-mode GpuExecutable allocates temp memory, and so the custom-call
// implementation for TriangularSolve is not needed.
#if !BEF_EXECUTABLE
// Transform TriangularSolve ops into custom-calls, so we can add temp memory.
post_pipeline.AddPass<TriangularSolveRewriter>();
#endif
if (!IsBefEnabled(hlo_module->config())) {
// Transform TriangularSolve ops into custom-calls, so we can add temp
// memory. XLIR allocates temp memory, and so the custom-call implementation
// for TriangularSolve is not needed.
post_pipeline.AddPass<TriangularSolveRewriter>();
}

TF_RETURN_IF_ERROR(post_pipeline.Run(hlo_module).status());
|||
|
|
@ -22,7 +22,7 @@ limitations under the License.
|
|||
#include "tensorflow/compiler/xla/tests/test_macros.h"
|
||||
|
||||
// This test suite should be run with
|
||||
// --//third_party/tensorflow/compiler/xla/service/gpu:enable_bef_executable=true
|
||||
// '--test_env=XLA_FLAGS=--xla_gpu_bef_executable'
|
||||
// to test the BEF-mode GpuExecutable.
|
||||
|
||||
namespace xla {
|
||||
|
|
|
|||
|
|
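As a side note, the same effect as the --test_env line above can be achieved by exporting XLA_FLAGS before XLA parses its flags; the standalone sketch below assumes a POSIX environment and is illustrative only, not part of this change.

#include <cstdlib>

int main() {
  // Equivalent to bazel's --test_env=XLA_FLAGS=--xla_gpu_bef_executable:
  // the flag parser reads XLA_FLAGS from the environment (see
  // ParseFlagsFromEnvAndDieIfUnknown in the debug_options_flags.cc hunk).
  setenv("XLA_FLAGS", "--xla_gpu_bef_executable", /*overwrite=*/1);
  // ... launch the test or XLA entry point from here ...
  return 0;
}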
@@ -18,11 +18,11 @@ The source code here is from
https://www.tensorflow.org/xla/tutorials/jit_compile, where there is also a
walkthrough.

To execute in TFRT BEF, run with `--config=cuda
--//third_party/tensorflow/compiler/xla/service/gpu:enable_bef_executable=true`.
To execute in TFRT BEF, run with
`--config=cuda --test_env=XLA_FLAGS=--xla_gpu_bef_executable`

To dump debug output (e.g., LMHLO MLIR, TFRT MLIR, TFRT BEF), run with
`XLA_FLAGS="--xla_dump_to=/tmp/mnist"`.
`--test_env=XLA_FLAGS="--xla_dump_to=/tmp/mnist"`.
"""

from absl import app
@@ -38,8 +38,6 @@
namespace xla {
namespace gpu {

#if BEF_THUNKS

static llvm::Expected<tfrt::gpu::GpuModule> ModuleLoad(
tfrt::Argument<tfrt::gpu::GpuContext> context,
const tfrt::ExecutionContext& exec_ctx) {

@@ -83,8 +81,6 @@ static llvm::Expected<tfrt::gpu::GpuModule> ModuleLoad(
return tfrt::gpu::GpuModule(context.ValueRef(), std::move(*module));
}

#endif // BEF_THUNKS

static llvm::Expected<DeviceAssignment::LogicalID> GetLogicalId(
const tfrt::ExecutionContext& exec_ctx) {
auto* xla_gpu_params =

@@ -367,9 +363,8 @@ static llvm::Error CustomCall(
static void RegisterXlirKernels(tfrt::KernelRegistry* kernel_reg) {
kernel_reg->AddKernel("xlir.custom_call",
TFRT_KERNEL_WITH_CHAIN_RESULT(CustomCall));
#if BEF_THUNKS
// This kernel is only used for bef thunks, not bef executables.
kernel_reg->AddKernel("xlir.module.load", TFRT_KERNEL(ModuleLoad));
#endif // BEF_THUNKS
kernel_reg->AddKernel("xlir.replica_id",
TFRT_KERNEL_WITH_CHAIN_RESULT(ReplicaId));
kernel_reg->AddKernel("xlir.partition_id",
@@ -359,7 +359,14 @@ message DebugOptions {
// logging a warning and proceeding with fallback.
bool xla_gpu_strict_conv_algorithm_picker = 156;

// Next id: 161
// If true, enable XLIR to compile gpu programs to TFRT BEF.
bool xla_gpu_bef_executable = 161;

// If true, enable XLIR to compile thunks to TFRT BEF.
// This flag has no effect when xla_gpu_bef_executable is true.
bool xla_gpu_bef_thunk = 162;

// Next id: 163

// Extra options to pass to the compilation backend (e.g. LLVM); specific
// interpretation of these values is left to the backend.