mirror of
https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00
END_PUBLIC Note: this CL will break builds. cl/159887762 to follow to fix all the breakages. --- Commit2336cdf7fauthored by Maxwell Paul Brickner<mbrickn@users.noreply.github.com> Committed by gunan<gunan@google.com>: Updated link to use HTTPS (#10998) Howdy! I just updated a link to use https instead of http. Thanks! --- Commitad0892df1authored by Luke Iwanski<luke@codeplay.com> Committed by Luke Iwanski<luke@codeplay.com>: [OpenCL] Fixes run_metadata_test for SYCL This test is designed to test CUDA specific behavior --- Commit6b37a0725authored by Todd Wang<toddwang@gmail.com> Committed by GitHub<noreply@github.com>: Update comments --- Commit1699d904aauthored by John Lawson<john@codeplay.com> Committed by Luke Iwanski<luke@codeplay.com>: [OpenCL] Fixes CUDA specific test run on SYCL (#56) The testBadParentValuesOnGPU should only be run on CUDA devices, as the test checks for particular CUDA behaviour. We don't actually provide a SYCL kernel for GatherTree and so it's not a problem that the tests don't target SYCL. --- Commit3c1946230authored by myPrecious<Moriadry@users.noreply.github.com> Committed by Shanqing Cai<cais@google.com>: Java API to get the size of specified input list of operations. (#10865) * Java API to get the size of specified input list of operations * remove unnecessary explain to avoid bring a new term to users. 
--- Commite911c7480authored by Luke Iwanski<luke@codeplay.com> Committed by Luke Iwanski<luke@codeplay.com>: [OpenCL] REGISTER -> REGISTER6 --- Commitfbf6c4cecauthored by superryanguo<superryanguo@gmail.com> Committed by superryanguo<superryanguo@gmail.com>: Simplify the Quickstart section with the weblink is better --- Commit72e2918ccauthored by Taehoon Lee<taehoonlee@snu.ac.kr> Committed by Taehoon Lee<taehoonlee@snu.ac.kr>: Fix typos --- Commit90c4406b7authored by Rishabh Patel<patelrishabh@users.noreply.github.com> Committed by GitHub<noreply@github.com>: Correct the learning rate as per the code snippet --- Commit03da61134authored by Todd Wang<toddwang@gmail.com> Committed by GitHub<noreply@github.com>: Update ir_array.cc --- Commit2df6cd3acauthored by Todd Wang<toddwang@gmail.com> Committed by GitHub<noreply@github.com>: Another try --- Commitaf0cbace1authored by Luke Iwanski<luke@codeplay.com> Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>: [OpenCL] Transpose to go through Eigen (#10321) --- Commitfc7361081authored by Luke Iwanski<luke@codeplay.com> Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>: [OpenCL] Registers RGBToHSV and HSVToRGB (#91) (#10848) * [OpenCL] Added RGBToHSV and HSVToRGB * Aligning '\' --- Commit832894ef8authored by Luke Iwanski<luke@codeplay.com> Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>: [OpenCL] Registers AdjustContrastv2 (#10949) * [OpenCL] Registers AdjustContrastv2 (#93) * [OpenCL] Extended adjust_contrast_op_benchmark_test for OpenCL (#96) * [OpenCL] Extended adjust_contrast_op_benchmark_test for OpenCL * simplified to #ifndef * Changed to "#if GOOGLE_CUDA" * Update adjust_contrast_op_benchmark_test.cc * Added comments --- Commitcb4c2f8d1authored by Yifei Feng<yifeif@google.com> Committed by Yifei Feng<yifeif@google.com>: Make TransferBufferToInFeed not virtual so it compiles.
--- Commite89f04d80authored by Yifei Feng<yifeif@google.com> Committed by Yifei Feng<yifeif@google.com>: Fix calling Literal member functions. --- Commit15a8df724authored by Yifei Feng<yifeif@google.com> Committed by Yifei Feng<yifeif@google.com>: Fix mac build clone from meheff's change: [XLA] Change return type of DeviceAssignment::Deserialize to fix build breakage on mac. The mac build had the following error: error: incomplete type 'xla::DeviceAssignment' used in type trait expression This was due to a static method returning a StatusOr<DeviceAssignment> inside of the definition of DeviceAssignment. --- Commita54d43fa4authored by Yifei Feng<yifeif@google.com> Committed by Yifei Feng<yifeif@google.com>: Replace LiteralUtil to Literal in compiler/plugin/executor --- Commit88a6bb80cauthored by Guenther Schmuelling<guschmue@microsoft.com> Committed by Guenther Schmuelling<guschmue@microsoft.com>: expand inline for debug builds to limit number of symbols --- Commit62fb49d31authored by Yifei Feng<yifeif@google.com> Committed by Yifei Feng<yifeif@google.com>: Fix visibility error for contrib/remote_fused_graph/pylib/BUILD. --- Commit4c75252f2authored by Mark Neumann<markn@allenai.org> Committed by Mark Neumann<markn@allenai.org>: fix initial test values to avoid numerical instability --- Commitb58d98353authored by sj6077<epik03sj@gmail.com> Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>: Fixes of AutoParallel bug (#10368) * Fix the bug that auto_parallel could replicate variable snapshot name * Use NodeName in grappler:utils instead of substr, convert variables->variable_def of grappler item * remove variable_def from grappler item, exclude snapshot nodes from dont_replicate_nodes in auto_parallel --- Commita286b7db8authored by Yifei Feng<yifeif@google.com> Committed by Yifei Feng<yifeif@google.com>: Make debug_test slice integer. 
--- Commit97fcfdfa6authored by Toby Boyd<tobyboyd@google.com> Committed by GitHub<noreply@github.com>: Fixed path to seq2seq.py and minor formatting --- Commit63c1befb8authored by Anish Shah<shah.anish07@gmail.com> Committed by Anish Shah<shah.anish07@gmail.com>: Improve docs for tf.nn.depthwise_conv2d_native --- Commit8d42202b2authored by Yong Tang<yong.tang.github@outlook.com> Committed by Yong Tang<yong.tang.github@outlook.com>: Fix mismatched delete in mkl_tfconv_op.cc This fix fixes mismatched new[]-delete in mkl_tfconv_op.cc (the file went through clang-format so there are some additional changes) Signed-off-by: Yong Tang <yong.tang.github@outlook.com> --- Commit26301bd55authored by Danny Goodman<goodman.danny@gmail.com> Committed by Danny Goodman<goodman.danny@gmail.com>: fix error format --- Commitb3f33ad46authored by Yao Zhang<yaozhang@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Make changes to prepare for the fused option of batch norm to be set to None (None means using fused batch norm if possible). PiperOrigin-RevId: 159649743 --- Commita4a469832authored by A. Unique TensorFlower<gardener@tensorflow.org> Committed by TensorFlower Gardener<gardener@tensorflow.org>: [XLA] Add tests for select ops and while loops that produce tuples that contain predicates. PiperOrigin-RevId: 159645900 --- Commit980d3f2beauthored by A. Unique TensorFlower<gardener@tensorflow.org> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Use C API to implement Operation.name property This name property is used in many existing tests including those that already run with C API enabled (math_ops_test, framework_ops_test, session_test, session_partial_run_test, math_ops_test_gpu, etc). PiperOrigin-RevId: 159645767 --- Commit26239c706authored by A. 
Unique TensorFlower<gardener@tensorflow.org> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Previously we didn't have an implementation of BatchNormInference and BatchNormTraining, which gives a linker error if anyone ever tries to call that. A dummy implementation is friendlier than a linker error. PiperOrigin-RevId: 159645612 --- Commitf671c5caaauthored by A. Unique TensorFlower<gardener@tensorflow.org> Committed by TensorFlower Gardener<gardener@tensorflow.org>: BEGIN_PUBLIC Automated g4 rollback of changelist 159570549 PiperOrigin-RevId: 160182040
148 lines
5.4 KiB
C++
148 lines
5.4 KiB
C++
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
#include "tensorflow/compiler/plugin/executor/executable.h"
|
|
#include "tensorflow/compiler/plugin/executor/executor.h"
|
|
|
|
#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
|
|
|
|
#include "tensorflow/compiler/xla/literal_util.h"
|
|
#include "tensorflow/compiler/xla/shape_util.h"
|
|
|
|
namespace se = ::perftools::gputools;
|
|
namespace sep = ::perftools::gputools::executorplugin;
|
|
|
|
namespace xla {
|
|
namespace executorplugin {
|
|
|
|
// Takes ownership of the HLO module to execute; ShapeSizeBytes tells the base
// Executable how large each XLA shape is on this plugin device.
ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
    : Executable(std::move(hlo_module), ShapeSizeBytes) {}
|
// Nothing to release beyond what the base class and members clean up.
ExecutorExecutable::~ExecutorExecutable() {}
|
// Copies the backing bytes of a non-tuple `literal` into a freshly allocated
// buffer on `executor` and returns a DeviceMemoryBase describing it.
static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor,
                                                 const Literal& literal) {
  const int64 byte_count = xla::ShapeUtil::ByteSizeOf(literal.shape());
  void* device_mem = executor->Allocate(byte_count);
  memcpy(device_mem, literal.InternalData(), byte_count);
  return se::DeviceMemoryBase(device_mem, byte_count);
}
// Allocates a device buffer on `executor` holding the contents of `literal`.
// Non-tuple literals get a single flat buffer; tuple literals get an array of
// opaque pointers, one per element, each pointing at that element's buffer.
static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor,
                                                 const Literal& literal) {
  const Shape& shape = literal.shape();
  if (shape.element_type() != xla::TUPLE) {
    return AllocateSingleOutput(executor, literal);
  } else {
    int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
    void** buf = reinterpret_cast<void**>(executor->Allocate(size));
    for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
      se::DeviceMemoryBase out =
          AllocateSingleOutput(executor, literal.tuple_literals(n));
      // Index instead of advancing `buf` (the original did `*buf++ = ...`),
      // so the base pointer returned below still refers to the start of the
      // allocation rather than one-past-the-end of it.
      buf[n] = out.opaque();
    }

    return se::DeviceMemoryBase(buf, size);
  }
}
// Runs the wrapped HLO module on the host HloEvaluator, using `arguments` as
// the entry computation's parameter buffers, and returns device memory (from
// the stream's executor) holding the result. `hlo_execution_profile` is
// currently unused by this backend.
StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  se::Stream* stream = run_options->stream();

  VLOG(1) << "Execute " << module().name();
  if (VLOG_IS_ON(2)) {
    for (const auto& a : arguments) {
      VLOG(2) << "-- argument " << a.opaque();
    }
  }

  // Wall-clock start for the compute-time profile recorded at the end.
  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  HloComputation* computation = module().entry_computation();
  if (computation->num_parameters() != arguments.size()) {
    return tensorflow::errors::Internal(
        "Mismatch between argument count and graph parameter count.");
  }

  // Create the arguments as a vector of XLA literals.
  std::vector<std::unique_ptr<Literal>> arg_literals;
  std::vector<Literal*> arg_literals_ptrs;
  for (int64 p = 0; p < computation->num_parameters(); p++) {
    // Create the input literal for the parameter.
    HloInstruction* param = computation->parameter_instruction(p);
    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
    arg_literals_ptrs.push_back(arg_literals.back().get());

    // Copy in the data from the stream_executor buffers.
    // NOTE(review): assumes arguments[p] holds at least ByteSizeOf(shape)
    // bytes of host-accessible memory — the size is taken from the parameter
    // shape, not from the incoming buffer.
    void* buffer = arg_literals.back().get()->MutableInternalData();
    memcpy(buffer, arguments[p].opaque(),
           ShapeUtil::ByteSizeOf(param->shape()));
  }

  // Execute the graph using the evaluator.
  HloEvaluator evaluator;
  std::unique_ptr<Literal> output;
  TF_ASSIGN_OR_RETURN(output,
                      evaluator.Evaluate(computation, arg_literals_ptrs));

  // Copy the result into the return buffer, allocated on the plugin executor
  // that backs this stream.
  perftools::gputools::StreamExecutor* executor(stream->parent());
  sep::ExecutorExecutor* executorExecutor(
      static_cast<sep::ExecutorExecutor*>(executor->implementation()));

  se::DeviceMemoryBase ret =
      AllocateOutputBuffer(executorExecutor, *(output.get()));

  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  // Record elapsed compute time; clamp to >= 1ns so the profile never stores
  // a zero/negative duration.
  {
    tensorflow::mutex_lock lock(mutex_);
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
  }

  return ret;
}
// ShapedBuffer-based execution is not implemented for the Executor backend;
// callers must use the DeviceMemoryBase overload above.
StatusOr<std::unique_ptr<ShapedBuffer>> ExecutorExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  const auto status = tensorflow::errors::Unimplemented(
      "ExecuteOnStream is not yet supported on Executor.");
  return status;
}
// Asynchronous execution is not implemented for the Executor backend; the
// synchronous DeviceMemoryBase ExecuteOnStream is the only supported path.
StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
  const auto status = tensorflow::errors::Unimplemented(
      "ExecuteAsyncOnStream is not yet supported on Executor.");
  return status;
}
// Returns the number of bytes `shape` occupies on this device. Opaque shapes
// are represented by a single host pointer; everything else uses ByteSizeOf
// with pointer-sized tuple slots.
/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) {
  return ShapeUtil::IsOpaque(shape)
             ? sizeof(void*)
             : ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}
|
|
|
|
|
|
} // namespace executorplugin
|
|
} // namespace xla
|