mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Summary:
Originally reverted this diff D37116110 (c9aa74a37f) because
```
> /usr/local/bin/buck build //caffe2/test/cpp/lite_interpreter_runtime/...
BUILD FAILED
The rule //caffe2:backend_interface_libAndroid could not be found.
Please check the spelling and whether it is one of the 1866 targets in /data/users/batanasov/fbsource/fbcode/caffe2/TARGETS. (52107 bytes)
1 similar targets in /data/users/batanasov/fbsource/fbcode/caffe2/TARGETS are:
//caffe2:backend_interface_lib
This error happened while trying to get dependency '//caffe2:backend_interface_libAndroid' of target '//caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profilerAndroid'
At //caffe2:backend_interface_libAndroid (ovr_config//platform/linux:x86_64-fbcode)
At //caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profilerAndroid (ovr_config//platform/linux:x86_64-fbcode)
```
The added test_mobile_profiler was not meant to be built with Android or other mobile platforms, so we are changing the test to a cpp_unittest.
Test Plan:
```
buck test //caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profiler
Parsing buck files: finished in 0.9 sec
Creating action graph: finished in 26.5 sec
Downloaded 2/2 artifacts, 1.30 Mbytes, 0.0% cache miss (for updated rules)
Building: finished in 16.5 sec (100%) 18451/18451 jobs, 3/18451 updated
Total time: 44.0 sec
More details at https://www.internalfb.com/intern/buck/build/8bee82c1-66a9-4fae-805f-e4ef5505d25d
BUILD SUCCEEDED
Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details.
Running with tpx session id: 6904f989-5c17-4c5b-9a4f-ffb643dfcc43
Trace available for this run at /tmp/tpx-20220726-114727.001729-6904f989-5c17-4c5b-9a4f-ffb643dfcc43/trace.log
RemoteExecution session id: reSessionID-6904f989-5c17-4c5b-9a4f-ffb643dfcc43-tpx
Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/844425183404951
✓ ListingSuccess: caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profiler : 3 tests discovered (17.640)
✓ Pass: caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profiler - MobileProfiler.Backend (0.206)
✓ Pass: caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profiler - MobileProfiler.BackendMemoryEvents (0.271)
✓ Pass: caffe2/test/cpp/lite_interpreter_runtime:test_mobile_profiler - MobileProfiler.ModuleHierarchy (0.268)
Summary
Pass: 3
ListingSuccess: 1
Finished test run: https://www.internalfb.com/intern/testinfra/testrun/844425183404951
```
Differential Revision: D38166171
Pull Request resolved: https://github.com/pytorch/pytorch/pull/82243
Approved by: https://github.com/salilsdesai
202 lines
7.5 KiB
C++
#include <ATen/Utils.h>
|
|
#include <c10/core/TensorImpl.h>
|
|
#include <torch/csrc/jit/backends/backend.h>
|
|
#include <torch/csrc/jit/backends/backend_exception.h>
|
|
|
|
#ifndef NO_PROFILING
|
|
#include <torch/csrc/jit/mobile/profiler_edge.h>
|
|
#endif
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
|
|
// Implementation of a PyTorch Backend that can process, compile and execute
|
|
// TorchScript Modules composed of 'add' and 'sub' operators. It just supports
|
|
// for modules that implement a sum or subtraction of 2 inputs (i.e. in1 + in2
|
|
// or in1 - in2). Hence the methods of the models expect exactly 2 inputs of
|
|
// type Tensor. This backend is used to demonstrate the flow of compilation and
|
|
// execution with a minimum amount of work. It's not intended to be a practical
// backend that can be used for actual inference.
|
|
|
|
// Implementation details:
|
|
//
|
|
// Compilation
|
|
// 1. A backend with minimum compilation features, "backend_with_compiler_demo"
|
|
// is added.
|
|
// 2. The compilation happens AOT in the preprocess function registered to this
|
|
// backend.
|
|
// 3. Compiled results are stored in a string blob for each method. They are
|
|
// serialized to the lowered module with __getstate__ function.
|
|
// 4. Error message with model source code is thrown, for features not handled
|
|
// by the backend compiler.
|
|
//
|
|
// Runtime
|
|
// 1. The compiled blob is loaded in __setstate__ method.
|
|
// 2. The compile function of the backend: parse the preprocessed blob to the
|
|
// format (a list of tokens) that the backend can understand.
|
|
// 3. The execute function of the backend executes the specified method
|
|
// (handle).
|
|
|
|
namespace {
|
|
// Parses a preprocessed method blob into (instruction, debug_handle) pairs.
// The blob is a comma-separated token list; each token is an instruction
// optionally followed by the "<debug_handle>" marker and a numeric id, e.g.
// "aten::add<debug_handle>15". Tokens without a marker get debug handle -1.
std::vector<std::tuple<std::string, int64_t>> parseMethodHandle(
    const std::string& blob) {
  std::vector<std::tuple<std::string, int64_t>> result;
  std::stringstream s_stream(blob);
  constexpr char debug_handle_token[] = "<debug_handle>";
  // Marker length derived from the token itself instead of a magic 14
  // (sizeof includes the trailing '\0').
  constexpr auto debug_handle_token_len = sizeof(debug_handle_token) - 1;
  while (s_stream.good()) {
    std::string substr;
    getline(s_stream, substr, ',');
    auto debug_handle_pos = substr.find(debug_handle_token);
    int64_t debug_handle{-1};
    // Whole token is the instruction when no debug handle marker is present
    // (replaces the pointless substr.substr(0) copy).
    auto instruction = substr;
    if (debug_handle_pos != std::string::npos) {
      instruction = substr.substr(0, debug_handle_pos);
      // stoll, not stoi: the target is int64_t, so parse the full range.
      debug_handle =
          std::stoll(substr.substr(debug_handle_pos + debug_handle_token_len));
    }
    result.push_back(std::make_tuple(std::move(instruction), debug_handle));
  }
  return result;
}
|
|
|
|
// Returns a raw float pointer into the tensor's storage via its TensorImpl.
// Caller must ensure the tensor actually holds float data.
float* float_data_ptr(const at::Tensor& t) {
  auto* impl = t.unsafeGetTensorImpl();
  return impl->data_ptr_impl<float>();
}
|
|
} // namespace
|
|
|
|
class BackendWithCompiler : public PyTorchBackendInterface {
|
|
public:
|
|
// Constructor.
|
|
// NOLINTNEXTLINE(modernize-use-equals-default)
|
|
explicit BackendWithCompiler() {}
|
|
// NOLINTNEXTLINE(modernize-use-override)
|
|
virtual ~BackendWithCompiler() = default;
|
|
|
|
bool is_available() override {
|
|
return true;
|
|
}
|
|
|
|
// Since the actual compilation is done AOT for this backend, compile just
|
|
// forwards everything along. In a non toy setup this could grab information
|
|
// from that runtime that might be relevant to execute, such as build flags
|
|
// the resolution of the devices camera, or basically any runtime specific
|
|
// information that wouldnt be available server side where preprocess is
|
|
// called.
|
|
c10::impl::GenericDict compile(
|
|
c10::IValue processed,
|
|
c10::impl::GenericDict method_compile_spec) override {
|
|
auto dict = processed.toGenericDict();
|
|
auto handles =
|
|
c10::Dict<std::string, std::vector<std::tuple<std::string, int64_t>>>();
|
|
for (const auto& kv : dict) {
|
|
auto tokens = parseMethodHandle(kv.value().toStringRef());
|
|
handles.insert(kv.key().toStringRef(), tokens);
|
|
}
|
|
return c10::impl::toGenericDict(handles);
|
|
}
|
|
|
|
// Function that actually executes the model in the backend. Here there is
|
|
// nothing to dispatch to, so the backend is implemented locally within
|
|
// execute and it only supports add, subtract, and constant. In a non toy
|
|
// backend you can imagine how this function could be used to actually
|
|
// dispatch the inputs to the relevant backend/device.
|
|
c10::impl::GenericList execute(
|
|
c10::IValue
|
|
handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)]
|
|
c10::impl::GenericList inputs) override {
|
|
TORCH_INTERNAL_ASSERT(inputs.size() == 2);
|
|
c10::IValue val0 = inputs[0];
|
|
at::Tensor x = val0.toTensor();
|
|
c10::IValue val1 = inputs[1];
|
|
at::Tensor h = val1.toTensor();
|
|
std::vector<std::tuple<int64_t, int64_t, std::string>> op_runtimes_us;
|
|
op_runtimes_us.reserve(handle.toList().size());
|
|
|
|
c10::List<at::Tensor> output_list;
|
|
#ifndef NO_PROFILING
|
|
auto start_us = torch::profiler::impl::getTime() / 1000;
|
|
#endif
|
|
for (const auto& token : handle.toList()) {
|
|
IValue val = token;
|
|
auto instruction = val.toTupleRef().elements()[0].toStringRef();
|
|
auto debug_handle = val.toTupleRef().elements()[1].toInt();
|
|
#ifndef NO_PROFILING
|
|
auto start_time_us = torch::profiler::impl::getTime() / 1000;
|
|
#endif
|
|
try {
|
|
if (instruction.rfind("prim::Constant", 0) == 0) {
|
|
// 15 is the length of 'prim::Constant#' the constant val comes after
|
|
TORCH_CHECK(
|
|
instruction.size() > 15,
|
|
"Constant value is expected in ",
|
|
instruction);
|
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
|
|
auto sub = instruction.substr(15);
|
|
} else if (instruction == "aten::add" || instruction == "aten::sub") {
|
|
TORCH_CHECK(x.sizes() == h.sizes());
|
|
if (x.dim() > 1 || (x.dim() == 1 && x.size(0) > 1)) {
|
|
TORCH_WARN(
|
|
"Only the first elements of the tensors are added or subbed.");
|
|
}
|
|
TORCH_CHECK(
|
|
(x.scalar_type() == c10::ScalarType::Float &&
|
|
h.scalar_type() == c10::ScalarType::Float),
|
|
"Only float tensors are compatible for add and sub.");
|
|
at::Tensor y = at::detail::empty_cpu(x.sizes(), at::kFloat);
|
|
auto x_ptr = float_data_ptr(x);
|
|
auto h_ptr = float_data_ptr(h);
|
|
auto y_ptr = float_data_ptr(y);
|
|
#ifndef NO_PROFILING
|
|
RECORD_BACKEND_MEMORY_EVENT_TO_EDGE_PROFILER(
|
|
x_ptr,
|
|
x.numel() * sizeof(float),
|
|
x.numel() * sizeof(float),
|
|
x.numel() * sizeof(float) + y.numel() * sizeof(float) +
|
|
h.numel() * sizeof(float),
|
|
c10::Device(c10::kCPU));
|
|
#endif
|
|
if (instruction == "aten::add") {
|
|
y_ptr[0] = x_ptr[0] + h_ptr[0];
|
|
} else {
|
|
y_ptr[0] = x_ptr[0] - h_ptr[0];
|
|
}
|
|
output_list.emplace_back(y);
|
|
} else {
|
|
TORCH_CHECK(
|
|
false,
|
|
"Instruction, ",
|
|
instruction,
|
|
" is not supported. ",
|
|
"Contact the backend POC for details. ");
|
|
}
|
|
} catch (c10::Error& e) {
|
|
TORCH_DELEGATED_BACKEND_THROW(false, e.what(), debug_handle);
|
|
}
|
|
#ifndef NO_PROFILING
|
|
auto end_time_us = torch::profiler::impl::getTime() / 1000;
|
|
auto duration = end_time_us - start_time_us;
|
|
op_runtimes_us.emplace_back(duration, debug_handle, instruction);
|
|
#endif
|
|
}
|
|
#ifndef NO_PROFILING
|
|
for (const auto& tup : op_runtimes_us) {
|
|
RECORD_BACKEND_EVENT_TO_EDGE_PROFILER(
|
|
start_us,
|
|
start_us + std::get<0>(tup),
|
|
std::get<1>(tup),
|
|
std::get<2>(tup),
|
|
"test_backend");
|
|
start_us = start_us + std::get<0>(tup);
|
|
}
|
|
#endif
|
|
return c10::impl::toList(output_list);
|
|
}
|
|
};
|
|
|
|
namespace {
|
|
constexpr auto backend_name = "backend_with_compiler_demo";
|
|
static auto cls = torch::jit::backend<BackendWithCompiler>(backend_name);
|
|
} // namespace
|
|
|
|
} // namespace jit
|
|
} // namespace torch
|