Internal change

PiperOrigin-RevId: 429500662
Change-Id: I0ebaa1c1321fa4a077a48443fbe6d7cdfa98533b
Christian Sigg 2022-02-18 00:43:05 -08:00 committed by TensorFlower Gardener
parent 5d5d822d0e
commit 4248acac6d
14 changed files with 184 additions and 174 deletions


@ -718,22 +718,21 @@ build:ubsan --linkopt -lubsan
build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/common,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
build:tfrt --deleted_packages=
# Experimental configuration for testing XLA GPU lowering to TFRT.
#
# To run using BEF thunks, combine with
# --//tensorflow/compiler/xla/service/gpu:enable_bef_thunk. For example,
# bazel test --config=experimental_tfrt_gpu \
# --//tensorflow/compiler/xla/service/gpu:enable_bef_thunk \
# //tensorflow/compiler/xla/service/gpu/tests:mlir_gemm_test
#
# To run using BEF executable, combine with
# --//tensorflow/compiler/xla/service/gpu:enable_bef_executable. For example,
# bazel test --config=experimental_tfrt_gpu \
# --//tensorflow/compiler/xla/service/gpu:enable_bef_executable \
# //tensorflow/compiler/xla/service/gpu/tests:mnist
build:experimental_tfrt_gpu --config=tfrt
build:experimental_tfrt_gpu --@tf_runtime//:enable_gpu
build:experimental_tfrt_gpu --@rules_cuda//cuda:cuda_runtime=//tensorflow/compiler/xla/service/gpu:cuda_runtime_for_xlir
build:experimental_tfrt_gpu --nocheck_visibility
build:experimental_tfrt_gpu --incompatible_strict_action_env
build:experimental_tfrt_gpu --config=monolithic
# Experimental configuration for building XLA GPU lowering to TFRT.
build:experimental_enable_xlir --config=tfrt
build:experimental_enable_xlir --@tf_runtime//:enable_gpu
build:experimental_enable_xlir --@rules_cuda//cuda:cuda_runtime=//tensorflow/compiler/xla/service/gpu:cuda_runtime_for_xlir
build:experimental_enable_xlir --nocheck_visibility
build:experimental_enable_xlir --incompatible_strict_action_env
build:experimental_enable_xlir --config=monolithic
# bazel test --config=experimental_enable_bef_thunk \
# //tensorflow/compiler/xla/service/gpu:bef_thunk_tests
build:experimental_enable_bef_thunk --config=experimental_enable_xlir
test:experimental_enable_bef_thunk --test_env=XLA_FLAGS=--xla_gpu_bef_thunk
# bazel test --config=experimental_enable_bef_executable \
# //tensorflow/compiler/xla/service/gpu:bef_executable_tests
build:experimental_enable_bef_executable --config=experimental_enable_xlir
test:experimental_enable_bef_executable --test_env=XLA_FLAGS=--xla_gpu_bef_executable


@ -79,6 +79,10 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
opts.set_xla_multiheap_size_constraint_per_heap(-1);
opts.set_xla_detailed_logging_and_dumping(true);
opts.set_xla_gpu_bef_executable(false);
opts.set_xla_gpu_bef_thunk(false);
return opts;
}
@ -683,6 +687,16 @@ static void AllocateFlags() {
flag_values->xla_dump_hlo_pipeline_re(),
"If specified, dumps HLO before and after optimization passes in the "
"pass pipelines that match this regular expression."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_bef_executable",
bool_setter_for(&DebugOptions::set_xla_gpu_bef_executable),
flag_values->xla_gpu_bef_executable(),
"Whether to enable XLIR to compile gpu programs to TFRT BEF."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_bef_thunk",
bool_setter_for(&DebugOptions::set_xla_gpu_bef_thunk),
flag_values->xla_gpu_bef_thunk(),
"Whether to enable XLIR to compile thunks to TFRT BEF."));
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
} // NOLINT(readability/fn_size)
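Both new options can be supplied through the XLA_FLAGS environment variable (which is how the .bazelrc test configs above drive them), or set programmatically on a module's config. A minimal sketch, assuming the usual DebugOptions/HloModuleConfig accessors; the helper name is illustrative only, and the flag only has an effect in builds with XLA_ENABLE_XLIR defined (e.g. --config=experimental_enable_xlir):

#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

// Illustrative helper: opt one module's compilation into BEF thunks.
void EnableBefThunks(xla::HloModuleConfig& config) {
  // Start from the flag/environment defaults, then flip the new option.
  xla::DebugOptions opts = xla::GetDebugOptionsFromFlags();
  opts.set_xla_gpu_bef_thunk(true);
  config.set_debug_options(opts);
}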


@ -60,8 +60,7 @@ package_group(
check_deps(
name = "tfrt_deps_check",
disallowed_deps = select({
":is_bef_thunk_enabled": [],
":is_bef_executable_enabled": [],
":is_xlir_enabled": [],
"//conditions:default": [
":xlir_kernels",
":xlir_opdefs",
@ -308,10 +307,6 @@ cc_library(
"kernel_mapping_scheme.h",
],
copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]),
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
"//conditions:default": [],
}),
deps = [
":backend_configs_cc",
":buffer_allocations",
@ -564,23 +559,13 @@ tf_cuda_library(
)
bool_flag(
name = "enable_bef_thunk",
build_setting_default = False,
name = "enable_xlir",
build_setting_default = if_google(True, False),
)
config_setting(
name = "is_bef_thunk_enabled",
flag_values = {":enable_bef_thunk": "True"},
)
bool_flag(
name = "enable_bef_executable",
build_setting_default = False,
)
config_setting(
name = "is_bef_executable_enabled",
flag_values = {":enable_bef_executable": "True"},
name = "is_xlir_enabled",
flag_values = {":enable_xlir": "True"},
)
gentbl_cc_library(
@ -629,10 +614,6 @@ cc_library(
cc_library(
name = "xlir_kernels",
srcs = ["xlir_kernels.cc"],
local_defines = select({
":is_bef_thunk_enabled": ["BEF_THUNKS=1"],
"//conditions:default": [],
}),
tags = ["manual"],
visibility = ["//visibility:private"],
deps = [
@ -697,8 +678,7 @@ cc_library(
"triangular_solve_thunk.h",
]),
local_defines = select({
":is_bef_thunk_enabled": ["BEF_THUNKS=1"],
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
":is_xlir_enabled": ["XLA_ENABLE_XLIR=1"],
"//conditions:default": [],
}),
deps = [
@ -784,47 +764,31 @@ cc_library(
"//tensorflow/core/platform/default/build_config:stream_executor_rocm",
"@local_config_rocm//rocm:rocm_headers",
]) + select({
":is_bef_thunk_enabled": [
":is_xlir_enabled": [
":xlir_kernels",
":xlir_opdefs",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:GPUTransforms",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:TransformUtils",
"//tensorflow/compiler/mlir:name_utils",
"//tensorflow/compiler/mlir/hlo:lhlo",
"//tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu:lmhlo_to_tfrt_gpu",
"//tensorflow/compiler/mlir/xla:attribute_exporter",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"@tf_runtime//:basic_kernels_alwayslink",
"@tf_runtime//:basic_kernels_opdefs",
"@tf_runtime//:befexecutor",
"@tf_runtime//:bef",
"@tf_runtime//:befexecutor",
"@tf_runtime//:beftomlir",
"@tf_runtime//:core_runtime",
"@tf_runtime//:hostcontext",
"@tf_runtime//:mlirtobef_translate",
"@tf_runtime//:support",
"@tf_runtime//backends/gpu:gpu_executor",
"@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
"@tf_runtime//backends/gpu:gpu_opdefs",
"@tf_runtime//backends/gpu:gpu_passes",
"@tf_runtime//backends/gpu:gpu_types",
],
":is_bef_executable_enabled": [
":xlir_kernels",
":xlir_opdefs",
"@llvm-project//llvm:Support",
"//tensorflow/compiler/mlir:name_utils",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"@tf_runtime//:basic_kernels_alwayslink",
"@tf_runtime//:befexecutor",
"@tf_runtime//:beftomlir",
"@tf_runtime//:bef",
"@tf_runtime//:core_runtime",
"@tf_runtime//:hostcontext",
"@tf_runtime//:support",
"@tf_runtime//:tensor_alwayslink",
"@tf_runtime//backends/gpu:gpu_executor",
"@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
"@tf_runtime//backends/gpu:gpu_opdefs",
"@tf_runtime//backends/gpu:gpu_passes",
"@tf_runtime//backends/gpu:gpu_types",
],
@ -1549,7 +1513,7 @@ cc_library(
"gpu_compiler.h",
],
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
":is_xlir_enabled": ["XLA_ENABLE_XLIR=1"],
"//conditions:default": [],
}),
deps = [
@ -1687,7 +1651,7 @@ cc_library(
"//tensorflow/core/profiler/lib:traceme",
"//tensorflow/stream_executor:stream_executor_headers",
] + select({
":is_bef_executable_enabled": [
":is_xlir_enabled": [
"//tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu:pass_utils",
"@tf_runtime//:mlirtobef_translate",
"@tf_runtime//:support",
@ -1729,10 +1693,6 @@ cc_library(
hdrs = if_cuda_is_configured([
"nvptx_compiler.h",
]),
local_defines = select({
":is_bef_executable_enabled": ["BEF_EXECUTABLE=1"],
"//conditions:default": [],
}),
deps = if_cuda_is_configured([
":cublas_pad_for_gemms",
":cudnn_fused_conv_rewriter",
@ -1741,6 +1701,7 @@ cc_library(
":cusolver_rewriter",
":gemm_algorithm_picker",
":gpu_asm_opts_util",
":gpu_executable",
":gpu_compiler",
":gpu_conv_padding_legalization",
":gpu_conv_rewriter",


@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"
#if BEF_THUNKS
#if XLA_ENABLE_XLIR
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/SourceMgr.h"
#include "mlir/Dialect/GPU/Passes.h" // from @llvm-project
@ -63,9 +63,6 @@ limitations under the License.
namespace xla {
namespace gpu {
bool IsBefThunkEnabled() { return true; }
namespace {
struct MlirAndTfrtHostCtx {
@ -504,28 +501,32 @@ Status BefThunk::ExecuteOnStream(const ExecuteParams& params) {
} // namespace gpu
} // namespace xla
#else // BEF_THUNKS
#else // XLA_ENABLE_XLIR
namespace xla {
bool gpu::IsBefThunkEnabled() { return false; }
static Status GetXlirDisabledError() {
return tensorflow::errors::FailedPrecondition(
"Built without XLA_ENABLE_XLIR");
}
StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefThunk(
Thunk::ThunkInfo, mlir::Operation*, std::vector<BufferAllocation::Slice>) {
return tensorflow::errors::FailedPrecondition("BefThunks are disabled.");
return GetXlirDisabledError();
}
StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefCollectiveThunk(
Thunk::ThunkInfo, mlir::Operation*, std::vector<BufferAllocation::Slice>,
int64_t, int64_t) {
return tensorflow::errors::FailedPrecondition("BefThunks are disabled.");
return GetXlirDisabledError();
}
StatusOr<std::unique_ptr<gpu::Thunk>> gpu::CreateBefKernelThunk(
Thunk::ThunkInfo, absl::Span<const BufferAllocation* const>,
const std::string&, const LaunchDimensions&) {
return tensorflow::errors::FailedPrecondition(
"BefKernelThunks are disabled.");
return GetXlirDisabledError();
}
} // namespace xla
#endif // BEF_THUNKS
#endif // XLA_ENABLE_XLIR


@ -30,9 +30,6 @@ class ExecutionContext;
namespace xla {
namespace gpu {
// Return whether --//...:enable_bef_thunk was specified on the command line.
bool IsBefThunkEnabled();
// Creates a Thunk that uses TFRT BEF execution to perform the work of various
// Thunk types. A BefThunk is not restricted to a particular op function, unlike
// GemmThunk, ConvolutionThunk, etc. Rather, a BefThunk is to stand in place of


@ -19,6 +19,7 @@ limitations under the License.
#include <atomic>
#include <functional>
#include <iterator>
#include <string>
#include <utility>
@ -172,11 +173,11 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
#include "tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pass_utils.h"
#include "tfrt/bef/bef_buffer.h" // from @tf_runtime
#include "tfrt/bef_converter/mlir_to_bef_translate.h" // from @tf_runtime
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
namespace xla {
namespace gpu {
@ -802,7 +803,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> GpuCompiler::AssignBuffers(
return std::move(assignment);
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
static StatusOr<OwnedBefBuffer> LowerToBef(
mlir::ModuleOp mlir_module, absl::string_view entry_function_name,
llvm::ArrayRef<int64_t> buffer_sizes, HloModule* hlo_module) {
@ -842,7 +843,7 @@ static StatusOr<OwnedBefBuffer> LowerToBef(
std::copy(bef.begin(), bef.end(), ptr);
return OwnedBefBuffer(ptr, {bef.size()});
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
using OutputInfoMap =
absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
@ -968,10 +969,11 @@ static Status CompileModuleToLlvmIrImpl(
TF_RETURN_IF_ERROR(ir_emitter->EmitLmhloRegion(&entry_function.body()));
// TODO(b/218527186): Implement this feature for BEF as well.
// TODO(b/218907125): Implement this feature for ROCm as well.
bool supports_runtime_managed_constants =
!(IsBefThunkEnabled() || platform_id == se::rocm::kROCmPlatformId);
// TODO(b/218527186): Implement this feature for BEF as well.
!IsBefEnabled(hlo_module->config()) &&
// TODO(b/218907125): Implement this feature for ROCm as well.
platform_id != se::rocm::kROCmPlatformId;
if (supports_runtime_managed_constants) {
// Remove these globals from the generated code to indicate that XLA is
// responsible for allocating and initializing them.
@ -987,19 +989,21 @@ static Status CompileModuleToLlvmIrImpl(
RecordHloToLlvmDuration(end_usecs - start_usecs);
}
#if BEF_EXECUTABLE
std::vector<int64_t> buffer_sizes;
llvm::transform(
results->allocations, std::back_inserter(buffer_sizes),
[](const BufferAllocation& allocation) { return allocation.size(); });
TF_ASSIGN_OR_RETURN(results->thunks_or_bef,
LowerToBef(*mlir_module, entry_function.getName().str(),
buffer_sizes, hlo_module));
#else // BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (IsBefExecutableEnabled(hlo_module->config())) {
std::vector<int64_t> buffer_sizes;
llvm::transform(
results->allocations, std::back_inserter(buffer_sizes),
[](const BufferAllocation& allocation) { return allocation.size(); });
TF_ASSIGN_OR_RETURN(results->thunks_or_bef,
LowerToBef(*mlir_module, entry_function.getName().str(),
buffer_sizes, hlo_module));
return Status::OK();
}
#endif // XLA_ENABLE_XLIR
results->thunks_or_bef =
absl::make_unique<ThunkSchedule>(ir_emitter->ConsumeThunkSequence());
#endif // BEF_EXECUTABLE
return Status::OK();
}
@ -1354,7 +1358,7 @@ GpuDeviceInfo GetGpuDeviceInfo(se::StreamExecutor* stream_exec) {
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
const AotCompilationOptions& options) {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
CHECK(options.PlatformId() == se::cuda::kCudaPlatformId);
CHECK(options.executor() != nullptr);
auto stream_exec = options.executor();
@ -1429,10 +1433,9 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
}
return std::move(results);
#else // BEF_EXECUTABLE
return FailedPrecondition(
"GpuCompiler::CompileAheadOfTime only supported with BEF_EXECUTABLE");
#endif // BEF_EXECUTABLE
#else // XLA_ENABLE_XLIR
return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif // XLA_ENABLE_XLIR
}
HloCostAnalysis::ShapeSizeFunction GpuCompiler::ShapeSizeBytesFunction() const {
@ -1556,11 +1559,11 @@ StatusOr<std::unique_ptr<Executable>> CompileLmhloToExecutable(
module_config, ir_emitter_context));
TF_RETURN_IF_ERROR(ir_emitter->EmitLmhloRegion(&entry_function.body()));
// TODO(b/218527186): Implement this feature for BEF as well.
// TODO(b/218907125): Implement this feature for ROCm as well.
bool supports_runtime_managed_constants =
!(IsBefThunkEnabled() ||
compiler->PlatformId() == se::rocm::kROCmPlatformId);
// TODO(b/218527186): Implement this feature for BEF as well.
!IsBefEnabled(module_config) &&
// TODO(b/218907125): Implement this feature for ROCm as well.
compiler->PlatformId() != se::rocm::kROCmPlatformId;
if (supports_runtime_managed_constants) {
// Remove these globals from the generated code to indicate that XLA is
// responsible for allocating and initializing them.


@ -50,7 +50,7 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/platform.h"
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
#include "llvm/Support/SourceMgr.h"
#include "mlir/IR/Builders.h" // from @llvm-project
#include "mlir/IR/Diagnostics.h" // from @llvm-project
@ -71,10 +71,29 @@ limitations under the License.
#include "tfrt/host_context/function.h" // from @tf_runtime
#include "tfrt/host_context/host_allocator.h" // from @tf_runtime
#include "tfrt/host_context/host_context.h" // from @tf_runtime
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
namespace xla {
namespace gpu {
bool IsBefExecutableEnabled(const HloModuleConfig& config) {
#if XLA_ENABLE_XLIR
return config.debug_options().xla_gpu_bef_executable();
#else // XLA_ENABLE_XLIR
(void)config;
return false;
#endif // XLA_ENABLE_XLIR
}
bool IsBefThunkEnabled(const HloModuleConfig& config) {
#if XLA_ENABLE_XLIR
return config.debug_options().xla_gpu_bef_thunk();
#else // XLA_ENABLE_XLIR
(void)config;
return false;
#endif // XLA_ENABLE_XLIR
}
namespace {
using ::tensorflow::profiler::ScopedAnnotation;
@ -101,14 +120,14 @@ static std::string ModuleUniqueName(absl::string_view module_name,
} // namespace
void GpuExecutable::BefBufferDeleter::operator()(uint8_t* ptr) const {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
tfrt::AlignedFree(ptr);
#else
LOG(FATAL) << "OwnedBefBuffer only supported with BEF_EXECUTABLE";
LOG(FATAL) << "OwnedBefBuffer only supported with XLA_ENABLE_XLIR";
#endif
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
struct GpuExecutable::BefExecutable {
private:
explicit BefExecutable(OwnedBefBuffer buffer)
@ -168,7 +187,7 @@ struct GpuExecutable::BefExecutable {
absl::Mutex mutex;
tfrt::gpu::GpuContextCache gpu_ctx_cache TF_GUARDED_BY(mutex);
};
#endif
#endif // XLA_ENABLE_XLIR
StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
auto thunks_or_bef = std::move(params.thunks_or_bef);
@ -179,14 +198,14 @@ StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
return result;
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (absl::holds_alternative<OwnedBefBuffer>(thunks_or_bef)) {
auto& bef_buffer = absl::get<OwnedBefBuffer>(thunks_or_bef);
TF_ASSIGN_OR_RETURN(result->bef_executable_,
BefExecutable::Create(std::move(bef_buffer)));
return result;
}
#endif
#endif // XLA_ENABLE_XLIR
return InternalError("No thunk or bef provided");
}
@ -250,7 +269,7 @@ GpuExecutable::~GpuExecutable() {
}
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
delete bef_executable_;
#endif
}
@ -579,7 +598,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
return out.ConsumeResult();
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
// TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc.
static tfrt::RCReference<tfrt::AsyncValue> CreateGpuBuffer(
stream_executor::DeviceMemoryBase* data) {
@ -693,7 +712,7 @@ static Status ExecuteBef(const std::string& module_name,
run_options, start_micros,
block_host_until_done ? run_options->stream() : nullptr);
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
const ServiceExecutableRunOptions* run_options,
@ -871,13 +890,13 @@ Status GpuExecutable::ExecuteThunksOrBef(
buffer_allocations, block_host_until_done);
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
if (bef_executable_) {
return ExecuteBef(module_name_, bef_executable_, run_options,
buffer_allocations, allocations_.size(),
block_host_until_done);
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
return FailedPrecondition("Expected thunk or bef is not supplied.");
}
@ -977,7 +996,7 @@ Status GpuExecutable::SetUpMlirAllocation(
return Status::OK();
}
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
static void ApplyEntryFunctionAttributes(
mlir::MLIRContext& context, mlir::FuncOp& func,
xla::EntryFunctionAttributes entry_func_attrs, int buffer_param_offset) {
@ -996,10 +1015,11 @@ static void ApplyEntryFunctionAttributes(
builder.getIndexAttr(buffer.lmhlo_params()));
}
if (buffer.has_lmhlo_param_shape_index()) {
arg_attr_list.set("lmhlo.param_shape_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_param_shape_index().indices().begin(),
buffer.lmhlo_param_shape_index().indices().end())));
arg_attr_list.set(
"lmhlo.param_shape_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_param_shape_index().indices().data(),
buffer.lmhlo_param_shape_index().indices().size())));
}
if (!buffer.lmhlo_constant_name().empty()) {
arg_attr_list.set("lmhlo.constant_name",
@ -1011,8 +1031,8 @@ static void ApplyEntryFunctionAttributes(
if (buffer.has_lmhlo_output_index()) {
arg_attr_list.set("lmhlo.output_index",
builder.getI64TensorAttr(llvm::makeArrayRef(
buffer.lmhlo_output_index().indices().begin(),
buffer.lmhlo_output_index().indices().end())));
buffer.lmhlo_output_index().indices().data(),
buffer.lmhlo_output_index().indices().size())));
}
args_attrs.push_back(arg_attr_list.getDictionary(&context));
}
@ -1020,12 +1040,12 @@ static void ApplyEntryFunctionAttributes(
func->setAttr("result_xla_shape",
builder.getStringAttr(entry_func_attrs.result_xla_shape()));
}
#endif // BEF_EXECUTABLE
#endif // XLA_ENABLE_XLIR
StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
std::shared_ptr<HloModule> hlo_module, absl::string_view bef,
xla::EntryFunctionAttributes entry_func_attrs, GpuVersion gpu_version) {
#if BEF_EXECUTABLE
#if XLA_ENABLE_XLIR
OwnedBefBuffer bef_buffer = [bef]() {
auto ptr = static_cast<uint8_t*>(
tfrt::AlignedAlloc(tfrt::GetRequiredBefAlignment(), bef.size()));
@ -1064,9 +1084,9 @@ StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
module_name, result_xla_shape, std::move(allocations),
std::move(output_info), bef_executable));
return executable;
#else // BEF_EXECUTABLE
return FailedPrecondition("LoadFromBef only supported with BEF_EXECUTABLE");
#endif // BEF_EXECUTABLE
#else // XLA_ENABLE_XLIR
return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif // XLA_ENABLE_XLIR
}
StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>


@ -44,6 +44,16 @@ limitations under the License.
namespace xla {
namespace gpu {
// Returns whether GpuExecutable runs on TFRT (instead of thunks).
bool IsBefExecutableEnabled(const HloModuleConfig& config);
// Returns whether to create BefThunks (if the specific thunk is supported).
bool IsBefThunkEnabled(const HloModuleConfig& config);
inline bool IsBefEnabled(const HloModuleConfig& config) {
return IsBefExecutableEnabled(config) || IsBefThunkEnabled(config);
}
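Callers are expected to branch on these helpers rather than on the XLA_ENABLE_XLIR macro directly. A minimal sketch of the intended pattern, following the CreateBefThunk signature shown in the bef_thunk.cc stubs above; MakeNativeThunk is a hypothetical stand-in for the existing per-op thunk constructors:

// Sketch only: pick a BEF thunk when the per-thunk flag is on.
StatusOr<std::unique_ptr<Thunk>> MakeThunkForOp(
    const HloModuleConfig& config, Thunk::ThunkInfo info, mlir::Operation* op,
    std::vector<BufferAllocation::Slice> slices) {
  if (IsBefThunkEnabled(config)) {
    // Lower the op to TFRT BEF and wrap it in a BefThunk.
    return CreateBefThunk(info, op, std::move(slices));
  }
  // Fall back to the hand-written native thunk.
  return MakeNativeThunk(info, op, std::move(slices));  // hypothetical
}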
// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
@ -276,7 +286,7 @@ class GpuExecutable : public Executable {
// potentially shared with other executables.
std::vector<std::shared_ptr<se::DeviceMemoryBase>> shared_constants_;
// Data for BEF_EXECUTABLE mode only, owned.
// Data for BEF executable mode only, owned.
BefExecutable* bef_executable_ = nullptr;
GpuExecutable(const GpuExecutable&) = delete;


@ -78,6 +78,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@ -996,7 +997,7 @@ Status IrEmitterUnnested::EmitConvolutionThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto conv_result_slice, GetAllocationSlice(conv_result));
TF_ASSIGN_OR_RETURN(auto scratch_slice, GetAllocationSlice(scratch_result));
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
operand_slices.push_back(conv_result_slice);
operand_slices.push_back(scratch_slice);
TF_ASSIGN_OR_RETURN(
@ -1178,7 +1179,7 @@ Status IrEmitterUnnested::EmitGemmThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto thunk, [&]() -> StatusOr<std::unique_ptr<Thunk>> {
if (auto gemm = mlir::dyn_cast<mlir::lmhlo_gpu::GEMMOp>(op)) {
if (IsBefThunkEnabled()) return make_bef_thunk(gemm);
if (IsBefThunkEnabled(hlo_module_config_)) return make_bef_thunk(gemm);
return make_gemm_thunk(gemm);
}
@ -1187,7 +1188,8 @@ Status IrEmitterUnnested::EmitGemmThunk(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(auto bias, GetAllocationSlice(gemm.bias()));
TF_ASSIGN_OR_RETURN(auto output, GetAllocationSlice(gemm.output()));
if (IsBefThunkEnabled()) return make_bef_thunk(gemm, bias);
if (IsBefThunkEnabled(hlo_module_config_))
return make_bef_thunk(gemm, bias);
// The bias is passed inside the output buffer. If those buffers are
// shared we can just use it, otherwise copy the bias values into the
@ -1321,7 +1323,7 @@ Status IrEmitterUnnested::EmitCholeskyThunk(mlir::Operation* op) {
GetAllocationSlice(cholesky_op.scratch()));
TF_ASSIGN_OR_RETURN(auto info_buffer, GetAllocationSlice(cholesky_op.info()));
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
std::vector<BufferAllocation::Slice> buffers = {
operand_buffer, a_buffer, workspace_buffer, info_buffer};
TF_ASSIGN_OR_RETURN(
@ -1419,7 +1421,7 @@ Status IrEmitterUnnested::EmitCustomCallThunk(mlir::Operation* op) {
}
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
auto values_to_non_optional_slices = [&](mlir::ValueRange values)
-> StatusOr<std::vector<BufferAllocation::Slice>> {
std::vector<BufferAllocation::Slice> slices;
@ -2866,7 +2868,7 @@ Status IrEmitterUnnested::EmitReplicaOrPartitionId(mlir::Operation* op) {
TF_ASSIGN_OR_RETURN(BufferAllocation::Slice result_slice,
GetAllocationSlice(casted.getOperand()));
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
TF_ASSIGN_OR_RETURN(thunk,
CreateBefThunk(GetThunkInfo(op), op, {result_slice}));
} else {
@ -2898,7 +2900,7 @@ Status IrEmitterUnnested::EmitCollectivePermute(mlir::Operation* op) {
/*mem_size=*/ShapeUtil::ByteSizeOf(shape)));
} else {
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
std::vector<BufferAllocation::Slice> buffers = {source_slice,
result_slice};
TF_ASSIGN_OR_RETURN(thunk, CreateBefCollectiveThunk(
@ -2967,10 +2969,11 @@ Status IrEmitterUnnested::EmitNcclThunk(mlir::Operation* untyped_op) {
if (should_use_nccl_thunk) {
std::unique_ptr<Thunk> thunk;
if (IsBefThunkEnabled() && (mlir::isa<mlir::lmhlo::AllGatherOp>(op) ||
mlir::isa<mlir::lmhlo::AllReduceOp>(op) ||
mlir::isa<mlir::lmhlo::ReduceScatterOp>(op) ||
mlir::isa<mlir::lmhlo::AllToAllOp>(op))) {
if (IsBefThunkEnabled(hlo_module_config_) &&
(mlir::isa<mlir::lmhlo::AllGatherOp>(op) ||
mlir::isa<mlir::lmhlo::AllReduceOp>(op) ||
mlir::isa<mlir::lmhlo::ReduceScatterOp>(op) ||
mlir::isa<mlir::lmhlo::AllToAllOp>(op))) {
std::vector<BufferAllocation::Slice> arg_buffers;
arg_buffers.reserve(buffers.size() * 2);
for (const auto& buffer : buffers) {
@ -3201,7 +3204,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildKernelThunkImpl(
std::string(kernel->getName()),
ir_emitter_context_->llvm_module());
if (IsBefThunkEnabled()) {
if (IsBefThunkEnabled(hlo_module_config_)) {
return CreateBefKernelThunk(thunk_info, non_constant_buffers,
std::string(kernel->getName()),
launch_dimensions);
@ -5511,15 +5514,14 @@ Status IrEmitterUnnested::EmitOp(mlir::Operation* op) {
}
if (mlir::isa<mlir::lmhlo::TriangularSolveOp>(op)) {
#if BEF_EXECUTABLE
// BEF-mode GpuExecutable allocates temp memory, and so the custom-call
// implementation for TriangularSolve is not needed.
return Status::OK();
#else
if (IsBefEnabled(hlo_module_config_)) {
// XLIR allocates temp memory, and so the custom-call implementation for
// TriangularSolve is not needed.
return Status::OK();
}
return InternalError(
"TriangularSolve is implemented as a custom-call; we do not expect to "
"lower a true HLO TriangularSolve op.");
#endif
}
if (mlir::isa<mlir::lmhlo::FusionOp>(op)) {


@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
@ -149,12 +150,12 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
// Find the fastest algorithm for GEMMs.
post_pipeline.AddPass<GemmAlgorithmPicker>(stream_exec, device_allocator);
// BEF-mode GpuExecutable allocates temp memory, and so the custom-call
// implementation for TriangularSolve is not needed.
#if !BEF_EXECUTABLE
// Transform TriangularSolve ops into custom-calls, so we can add temp memory.
post_pipeline.AddPass<TriangularSolveRewriter>();
#endif
if (!IsBefEnabled(hlo_module->config())) {
// Transform TriangularSolve ops into custom-calls, so we can add temp
// memory. XLIR allocates temp memory, and so the custom-call implementation
// for TriangularSolve is not needed.
post_pipeline.AddPass<TriangularSolveRewriter>();
}
TF_RETURN_IF_ERROR(post_pipeline.Run(hlo_module).status());


@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/tests/test_macros.h"
// This test suite should be run with
// --//third_party/tensorflow/compiler/xla/service/gpu:enable_bef_executable=true
// '--test_env=XLA_FLAGS=--xla_gpu_bef_executable'
// to test the BEF-mode GpuExecutable.
namespace xla {


@ -18,11 +18,11 @@ The source code here is from
https://www.tensorflow.org/xla/tutorials/jit_compile, where there is also a
walkthrough.
To execute in TFRT BEF, run with `--config=cuda
--//third_party/tensorflow/compiler/xla/service/gpu:enable_bef_executable=true`.
To execute in TFRT BEF, run with
`--config=cuda --test_env=XLA_FLAGS=--xla_gpu_bef_executable`
To dump debug output (e.g., LMHLO MLIR, TFRT MLIR, TFRT BEF), run with
`XLA_FLAGS="--xla_dump_to=/tmp/mnist"`.
`--test_env=XLA_FLAGS="--xla_dump_to=/tmp/mnist"`.
"""
from absl import app


@ -38,8 +38,6 @@
namespace xla {
namespace gpu {
#if BEF_THUNKS
static llvm::Expected<tfrt::gpu::GpuModule> ModuleLoad(
tfrt::Argument<tfrt::gpu::GpuContext> context,
const tfrt::ExecutionContext& exec_ctx) {
@ -83,8 +81,6 @@ static llvm::Expected<tfrt::gpu::GpuModule> ModuleLoad(
return tfrt::gpu::GpuModule(context.ValueRef(), std::move(*module));
}
#endif // BEF_THUNKS
static llvm::Expected<DeviceAssignment::LogicalID> GetLogicalId(
const tfrt::ExecutionContext& exec_ctx) {
auto* xla_gpu_params =
@ -367,9 +363,8 @@ static llvm::Error CustomCall(
static void RegisterXlirKernels(tfrt::KernelRegistry* kernel_reg) {
kernel_reg->AddKernel("xlir.custom_call",
TFRT_KERNEL_WITH_CHAIN_RESULT(CustomCall));
#if BEF_THUNKS
// This kernel is only used for bef thunks, not bef executables.
kernel_reg->AddKernel("xlir.module.load", TFRT_KERNEL(ModuleLoad));
#endif // BEF_THUNKS
kernel_reg->AddKernel("xlir.replica_id",
TFRT_KERNEL_WITH_CHAIN_RESULT(ReplicaId));
kernel_reg->AddKernel("xlir.partition_id",


@ -359,7 +359,14 @@ message DebugOptions {
// logging a warning and proceeding with fallback.
bool xla_gpu_strict_conv_algorithm_picker = 156;
// Next id: 161
// If true, enable XLIR to compile gpu programs to TFRT BEF.
bool xla_gpu_bef_executable = 161;
// If true, enable XLIR to compile thunks to TFRT BEF.
// This flag has no effect when xla_gpu_bef_executable is true.
bool xla_gpu_bef_thunk = 162;
// Next id: 163
// Extra options to pass to the compilation backend (e.g. LLVM); specific
// interpretation of these values is left to the backend.
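Combined, the two options select one of three execution modes; a rough sketch of the precedence, assuming the generated DebugOptions accessors (the enum and helper below are illustrative only, and both flags require a build with XLA_ENABLE_XLIR to have any effect):

// Illustrative only: how the compiler interprets the two options.
enum class GpuExecutionMode { kThunks, kBefThunks, kBefExecutable };

GpuExecutionMode SelectMode(const xla::DebugOptions& opts) {
  // Whole-program BEF wins; xla_gpu_bef_thunk is ignored in that case.
  if (opts.xla_gpu_bef_executable()) return GpuExecutionMode::kBefExecutable;
  // Otherwise, supported thunks are emitted as BefThunks.
  if (opts.xla_gpu_bef_thunk()) return GpuExecutionMode::kBefThunks;
  // Default: classic thunk-based GpuExecutable.
  return GpuExecutionMode::kThunks;
}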