Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54438

The August 1x model has DictConstruct nodes in its graph (P331168321). These could be removed with a JIT pass, but to measure the improvement and run the replayer with the model in the meantime, enable DictConstruct in Static Runtime.

Test Plan:
```
./sigrid/predictor/scripts/pytorch/pyper_inference_e2e_local_replayer_test.sh \
  cpu 218841466_0 7449 /data/users/ansha/tmp/adfinder/august_1x/ /data/users/ansha/tmp/adfinder/august_1x/filtered_requests_inline_cvr_100
```

```
TEST trace
Total num requests 100
Num exceptions 0
Latency us avg 180965
Latency us p25 89785
Latency us p50 131240
Latency us p75 146621
Latency us p90 158378
Latency us p95 166628
Latency us p99 1886680
Latency us p100 3803252
Server latency us avg 91554
Server latency us p25 51447
Server latency us p50 86371
Server latency us p75 95229
Server latency us p90 102706
Server latency us p95 116023
Server latency us p99 557017
Server latency us p100 716319
Num rankUnits avg 28
```

Reviewed By: hlu1

Differential Revision: D27236682

fbshipit-source-id: 1da49a836dd7533480e77797338baa9edcb65fb5
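A minimal, hypothetical sketch (not part of this diff) of driving such a model through Static Runtime from C++. The path `model.pt` and the input shape are made up; the model's graph is assumed to build a `Dict[str, Tensor]` internally (a `prim::DictConstruct` node) while still returning Tensor outputs, which is what this change allows Static Runtime to accept.

```cpp
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/script.h>

#include <ATen/ATen.h>

#include <unordered_map>
#include <vector>

int main() {
  // Hypothetical model path and input shape; the graph's outputs must still
  // be Tensor (or List/Tuple of Tensor), see CheckGraphEligibility below.
  torch::jit::Module module = torch::jit::load("model.pt");

  torch::jit::StaticModuleOptions opts; // default options from impl.h
  torch::jit::StaticModule smod(module, opts); // freezes and prepares the graph

  std::vector<c10::IValue> args{at::randn({1, 64})};
  std::unordered_map<std::string, c10::IValue> kwargs;

  // Forwards to StaticRuntime::operator()(args, kwargs) defined in this file.
  c10::IValue out = smod(args, kwargs);
  return out.isNone() ? 1 : 0;
}
```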
1178 lines · 37 KiB · C++
#include <torch/csrc/jit/runtime/static/impl.h>

#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/core/interned_strings.h>
#include <c10/core/CPUAllocator.h>
#include <caffe2/core/scope_guard.h>
#include <caffe2/core/timer.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/passes/canonicalize.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/freeze_module.h>
#include <torch/csrc/jit/passes/remove_mutation.h>
#include <torch/csrc/jit/passes/subgraph_rewrite.h>
#include <torch/csrc/jit/runtime/static/ops.h>
#include <torch/csrc/jit/runtime/static/passes.h>
#include <torch/csrc/jit/runtime/vararg_functions.h>

namespace torch {
namespace jit {

namespace {

void OptimizeGraph(
    std::shared_ptr<torch::jit::Graph>& graph,
    const StaticModuleOptions& opts) {
  Inline(*graph);
  ConstantPropagation(graph);
  Canonicalize(graph);
  ConstantPropagation(graph);
  RemoveTensorMutation(graph);
  ConstantPropagation(graph);
  EliminateDeadCode(graph);
  FuseInferenceOpsForSparseNN(graph);

  // TODO: we can avoid this guard by moving operations
  // to exposed folders.
#ifdef FBCODE_CAFFE2
  if (opts.enable_out_variant) {
    ReplaceWithCopy(graph);
    FuseSigridTransformsListUnpack(graph);
  }
#endif
  ConstantPropagation(graph);
}

void CheckGraphEligibility(const std::shared_ptr<torch::jit::Graph>& graph) {
  for (auto n : graph->nodes()) {
    if (n->kind() == c10::Symbol::fromQualString("prim::GetAttr")) {
      throw std::runtime_error("Cannot accelerate unfrozen graphs");
    }
  }
  // Check output types.
  // Static Runtime supports only None, Tensor, and List/Tuple of Tensor as
  // output types.
  for (Value* output : graph->outputs()) {
    VLOG(1) << "output: %" << output->debugName()
            << " has type: " << output->type()->repr_str();
    auto kind = output->node()->kind();
    if (kind == prim::TupleConstruct || kind == prim::ListConstruct) {
      for (Value* input : output->node()->inputs()) {
        const auto& type = input->type();
        TORCH_CHECK(
            type->cast<TensorType>() != nullptr,
            "Static Runtime expects output type as List or Tuple of Tensor, but got List or Tuple of ",
            type->repr_str());
      }
    } else {
      const auto& type = output->type();
      TORCH_CHECK(
          type->cast<TensorType>() != nullptr ||
              type->cast<NoneType>() != nullptr,
          "Static Runtime expects output type as None or Tensor, but got ",
          type->repr_str());
    }
  }
}

// remove the unused graph input 0 ("self")
void RemoveSelfFromGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
  if (graph->inputs().at(0)->type()->is_module()) {
    TORCH_CHECK(!graph->inputs().at(0)->hasUses());
    graph->eraseInput(0);
  }
}

// remove "self" from the function schema
c10::FunctionSchema RemoveSelfFromSchema(const c10::FunctionSchema& s) {
  TORCH_CHECK(s.arguments().size() >= 1 && s.arguments()[0].name() == "self");
  std::vector<Argument> args({s.arguments().begin() + 1, s.arguments().end()});
  return s.cloneWithArguments(args);
}
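// For example (illustrative only), RemoveSelfFromSchema turns a schema such as
//   forward(__torch__.MyModule self, Tensor x) -> Tensor
// into
//   forward(Tensor x) -> Tensor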
bool mayContainAlias(AliasDb& db, const Value* a, const Value* b) {
  return db.mayContainAlias(const_cast<Value*>(a), const_cast<Value*>(b));
}

bool mayContainAlias(
    AliasDb& db,
    const std::unordered_set<const Value*>& a,
    const std::unordered_set<const Value*>& b) {
  std::vector<Value*> as;
  std::vector<Value*> bs;
  as.reserve(a.size());
  for (auto* v : a) {
    as.emplace_back(const_cast<Value*>(v));
  }
  bs.reserve(b.size());
  for (auto* v : b) {
    bs.emplace_back(const_cast<Value*>(v));
  }
  return db.mayContainAlias(as, bs);
}

// Returns two useful constructs:
// first: map each value to all values that are alive
// at the same time.
// second: set of all inputs/outputs/constants (always alive)
// and their aliases
// The algorithm does a traversal of the execution graph
// while keeping track of the live values.
using LivenessInformation = std::pair<
    std::unordered_map<const Value*, std::set<const Value*>>,
    std::unordered_set<const Value*>>;
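// Illustrative example: for a graph
//   %y = aten::add(%x, %x, %one)
//   %z = aten::mul(%y, %y)
//   return (%z)
// the input %x, the output %z, and the constant %one all land in the
// always-alive set, while the liveness map only needs to track the
// intermediate %y, which dies after the mul.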
LivenessInformation GetLivenessInformation(
    const std::shared_ptr<torch::jit::Graph>& graph,
    AliasDb& db) {
  std::unordered_map<const Value*, std::set<const Value*>> liveness_map;
  std::unordered_set<const Value*> always_alive;

  std::vector<const Value*> values_in_creation_order;
  std::unordered_map<const Value*, size_t> values_in_creation_order_idx;
  for (const auto* node : graph->nodes()) {
    for (const auto* v : node->outputs()) {
      values_in_creation_order_idx[v] = values_in_creation_order.size();
      values_in_creation_order.emplace_back(v);
    }
  }

  // maps values to any nodes that consume or produce them
  //
  // updated as we traverse the graph. the presence of a key in `live_values`
  // means that the value is currently alive.
  //
  // invariant: set.size() > 0
  std::unordered_map<const Value*, std::set<const Node*>> live_values;
  std::unordered_map<const Node*, std::set<const Value*>> live_nodes;

  // inputs and outputs are marked permanently alive
  for (const auto* input : graph->inputs()) {
    always_alive.insert(input);
  }
  for (const auto* output : graph->outputs()) {
    always_alive.insert(output);
  }

  for (const auto* node : graph->nodes()) {
    if (node->kind() == prim::Constant) {
      for (const auto* output : node->outputs()) {
        always_alive.insert(output);
      }
    }
  }

  std::function<void(const Value* v)> add_live_value_fn = [&](const Value* v) {
    if (liveness_map.count(v)) {
      return;
    }
    liveness_map[v] = {};

    for (const auto& live_v : live_values) {
      liveness_map.at(v).insert(live_v.first);
      liveness_map.at(live_v.first).insert(v);
    }

    // only add values to the live set if they
    // have deps, otherwise they die immediately
    if (v->uses().size()) {
      live_values[v] = {};
    }

    for (const auto& u : v->uses()) {
      const auto* node = u.user;
      // track deps of this value
      live_values.at(v).insert(node);
      live_nodes[node].insert(v);
    }

    // values created after this one that alias it
    std::vector<const Value*> aliased_vs;
    auto idx = values_in_creation_order_idx[v];
    for (; idx < values_in_creation_order.size(); ++idx) {
      auto* alias_v = values_in_creation_order[idx];
      if (mayContainAlias(db, v, alias_v)) {
        aliased_vs.emplace_back(alias_v);
      }
    }
    // for all the values in the alias set,
    // we set them "alive"
    for (auto* aliased_v : aliased_vs) {
      add_live_value_fn(aliased_v);
      for (const auto& u : aliased_v->uses()) {
        const auto* node = u.user;
        // track deps of the aliased values as if they
        // were our own
        live_values.at(v).insert(node);
        live_nodes[node].insert(v);
      }
    }
  };

  auto traverse_node_fn = [&](const Node* node,
                              std::vector<const Value*>& dead) {
    if (live_nodes.count(node)) {
      for (const auto* v : live_nodes.at(node)) {
        live_values.at(v).erase(node);
        if (!live_values.at(v).size()) {
          dead.emplace_back(v);
        }
      }
    }
  };

  for (const auto* node : graph->nodes()) {
    for (const auto* v : node->outputs()) {
      if (mayContainAlias(db, ValueSet{v}, always_alive)) {
        always_alive.insert(v);
      } else {
        add_live_value_fn(v);
      }
    }

    std::vector<const Value*> dead;
    traverse_node_fn(node, dead);
    for (const auto* dead_value : dead) {
      live_values.erase(dead_value);
    }
  }

  for (const auto& v : live_values) {
    TORCH_CHECK(always_alive.count(v.first));
  }

  for (const auto* node : graph->nodes()) {
    for (const auto* input : node->inputs()) {
      for (const auto* output : node->outputs()) {
        if (liveness_map.count(input) && liveness_map.count(output)) {
          liveness_map.at(input).insert(output);
          liveness_map.at(output).insert(input);
        }
      }
    }
  }

  return std::make_pair(liveness_map, always_alive);
}
// Implementation-specific pruning of values from the "optimizable" set.
// GetLivenessInformation and FindSameStorageValues work with any graph, but
// here we prune out values that aren't produced by "_out" variants.
//
// Returns
//   first: Values that can be optimized
//   second: A deterministic order of all values
std::pair<std::vector<const Value*>, std::vector<const Value*>>
GetOptimizableValues(const std::shared_ptr<torch::jit::Graph>& graph) {
  // for determinism
  std::unordered_set<const Value*> seen_values;
  std::vector<const Value*> all_values;
  std::unordered_set<const Value*> can_reuse;
  // values used by unsupported ops (as either inputs or outputs)
  // these need to be removed from "can_reuse" after analyzing all nodes
  std::unordered_set<const Value*> cannot_reuse;
  for (auto* n : graph->nodes()) {
    for (const auto* v : n->inputs()) {
      if (!seen_values.count(v)) {
        all_values.emplace_back(v);
        seen_values.insert(v);
      }
      if (canReuseInputsOutputs(n)) {
        can_reuse.insert(v);
      } else {
        cannot_reuse.insert(v);
      }
    }
    for (const auto* v : n->outputs()) {
      all_values.emplace_back(v);
      seen_values.insert(v);
      if (canReuseInputsOutputs(n)) {
        can_reuse.insert(v);
      } else {
        cannot_reuse.insert(v);
      }
    }
  }
  for (const auto* v : cannot_reuse) {
    can_reuse.erase(v);
  }
  // find a deterministic order
  std::vector<const Value*> optimizable;
  for (const auto* v : all_values) {
    if (can_reuse.count(v)) {
      optimizable.emplace_back(v);
      can_reuse.erase(v);
    }
  }
  return std::make_pair(optimizable, all_values);
}

// Equipped with a liveness map we can allocate memory to
// ivalues, reusing memory along the way. However, we are
// constrained by the set of optimizable_values
// (inputs/outputs of out variants). Inputs/outputs of view ops
// can't be reused.
//
// Algorithm:
// # clusters of values sharing the same memory
// # are called "value_to_same_storage_values" in the implementation
// # inserting into a cluster denotes sharing memory.
//
// clusters = {}
// for all v in optimizable_values:
//   for all cluster in clusters: # can we insert into cluster?
//     for all live_v in live_during(v):
//       if cluster.contains(live_v):
//         skip to next cluster
//     cluster.add(v)
//     skip to next v
//   if no cluster found:
//     clusters.add(cluster{v})
//
// NB: This is a deterministic implementation, which makes it easier to tune
// and debug.
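// Illustrative example: given optimizable values v1, v2, v3 where v1 and v2
// are never alive at the same time but v3 overlaps with v1, the loop below
// first forms the cluster {v1, v2} (one shared buffer) and then has to start
// a new cluster {v3}.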
std::unordered_map<const Value*, std::vector<const Value*>>
FindSameStorageValues(
    const LivenessInformation& lm,
    const std::pair<std::vector<const Value*>, std::vector<const Value*>>&
        optimizable,
    AliasDb& db) {
  const auto& alive_during = lm.first;
  const auto& always_alive = lm.second;
  const auto& optimizable_values = optimizable.first;
  const auto& all_values = optimizable.second;

  // map a Value* to the set of Value*s that can share the same storage with it
  std::unordered_map<const Value*, std::vector<const Value*>>
      same_storage_values;

  // make new_v and old_v map to the same storage (i.e., add to each other's
  // same_storage_values set)
  auto share_storage_fn = [&](const Value* new_v, const Value* old_v) {
    if (new_v == old_v) {
      return;
    }
    DCHECK(same_storage_values.count(old_v));
    std::set<const Value*> seen;
    std::vector<const Value*> values;
    for (auto* v : same_storage_values.at(old_v)) {
      if (seen.count(v)) {
        continue;
      }
      seen.insert(v);
      values.emplace_back(v);
    }
    for (auto* v : same_storage_values.at(new_v)) {
      if (seen.count(v)) {
        continue;
      }
      seen.insert(v);
      values.emplace_back(v);
    }
    for (const auto* v : values) {
      same_storage_values[v] = values;
    }
  };

  // initialize with known same_storage_values (aliasing values)
  for (const auto* v : all_values) {
    if (!same_storage_values.count(v)) {
      same_storage_values[v] = {v};
    }
    // skip always alive values (alias inputs/outputs/weights)
    if (always_alive.count(v)) {
      continue;
    }
    for (const auto& p : same_storage_values) {
      // NB: this means we cannot optimize operations that "sometimes alias"
      // TODO: add a more robust check of this behavior at runtime
      // FIXME (penguin): this handling makes v and MayAlias(v) share the
      // same storage, which is not correct.
      if (db.mayAlias(p.first, v)) {
        share_storage_fn(v, p.first);
      }
    }
  }

  // to preserve determinism
  std::vector<const Value*> seen;

  for (const auto* v : optimizable_values) {
    if (always_alive.count(v)) {
      continue;
    }
    // get values that are live during the lifetime of v
    std::set<const Value*> live;
    for (const auto* sv : same_storage_values.at(v)) {
      const auto& l = alive_during.count(sv) ? alive_during.at(sv)
                                             : std::set<const Value*>{};
      live.insert(l.begin(), l.end());
    }
    live.insert(always_alive.begin(), always_alive.end());

    for (const auto* s : seen) {
      // check if any values in this set of same_storage_values
      // are alive at the time of v
      // effectively finding | set_intersection(live, set_of_shared(s)) | > 0
      bool intersects = false;
      for (const auto* candidate_v : same_storage_values.at(s)) {
        if (live.count(candidate_v)) {
          intersects = true;
          break;
        }
      }
      // we can share memory if there's no overlap
      if (!intersects) {
        share_storage_fn(v, s);
        break;
      }
    }
    seen.emplace_back(v);
  }

  return same_storage_values;
}
} // namespace

void PrepareGraphForStaticModule(
    std::shared_ptr<torch::jit::Graph> graph,
    const StaticModuleOptions& opts) {
  OptimizeGraph(graph, opts);
  CheckGraphEligibility(graph);
  RemoveSelfFromGraphInput(graph);
}

std::pair<std::shared_ptr<Graph>, c10::optional<c10::FunctionSchema>>
PrepareForStaticModule(
    const torch::jit::Module& m,
    const StaticModuleOptions& opts) {
  auto module = m.copy();
  module.eval();

  module = freeze_module(module);

  Method method = module.get_method("forward");
  auto graph = method.graph();
  PrepareGraphForStaticModule(graph, opts);

  c10::FunctionSchema s = RemoveSelfFromSchema(method.function().getSchema());
  return std::make_pair(graph, s);
}

std::pair<std::shared_ptr<Graph>, c10::optional<c10::FunctionSchema>>
PrepareForStaticModule(
    std::shared_ptr<torch::jit::Graph> graph,
    const StaticModuleOptions& opts) {
  PrepareGraphForStaticModule(graph, opts);
  return std::make_pair(graph, c10::nullopt);
}

StaticModule::StaticModule(
    std::shared_ptr<torch::jit::Graph> g,
    const StaticModuleOptions& opts)
    : StaticModule(PrepareForStaticModule(g, opts), opts) {}

StaticModule::StaticModule(
    const torch::jit::Module& m,
    const StaticModuleOptions& opts)
    : StaticModule(PrepareForStaticModule(m, opts), opts) {}

StaticModule::StaticModule(
    std::pair<
        std::shared_ptr<torch::jit::Graph>,
        c10::optional<c10::FunctionSchema>> graph_and_schema,
    const StaticModuleOptions& opts)
    : opts_(opts),
      graph_(std::move(graph_and_schema.first)),
      schema_(std::move(graph_and_schema.second)) {
  // map Value* to IValue (from inputs or prim::Constant) or null
  std::unordered_map<Value*, IValue*> value_to_ivalue;
  // map Value* to its SSA definition: (INPUT_VALUE, i) for the i-th graph
  // input, (CONSTANT_VALUE, i) for the i-th constant, and (node_idx, i) for
  // the i-th output of a non-constant node
  std::unordered_map<Value*, DefInfo> value_to_ssa_def;

  // N inputs map to the first N entries in storage
  for (auto i = 0; i < graph_->inputs().size(); ++i) {
    Value* input = graph_->inputs()[i];
    value_to_ivalue[input] = nullptr;
    value_to_ssa_def[input] = std::make_pair(INPUT_VALUE, i);
  }

  // NB: before optimizing the order of execution, ensure that the
  // memory optimization pass (LivenessMap) is
  // aware of the new order!

  // Fill constants first, so we have a std::vector<IValue> we can reference
  // later
  for (Node* node : graph_->nodes()) {
    if (node->kind() != prim::Constant) {
      continue;
    }
    auto* v = node->output();
    TORCH_CHECK(v->type()->kind() != FunctionType::Kind);
    constants_.emplace_back(toIValue(v).value());
  }
  {
    // construct SSA definition for constant nodes
    int i = 0;
    for (Node* node : graph_->nodes()) {
      if (node->kind() != prim::Constant) {
        continue;
      }
      auto* v = node->output();
      value_to_ssa_def[v] = std::make_pair(CONSTANT_VALUE, i);
      value_to_ivalue[v] = &(constants_[i++]);
    }
  }

  // construct SSA definition for non-constant nodes
  int node_idx = 0;
  for (Node* node : graph_->nodes()) {
    if (node->kind() == prim::Constant) {
      continue;
    }
    std::vector<const IValue*> ivalue_inputs;
    std::vector<DefInfo> input_ssa_defs;
    for (Value* input : node->inputs()) {
      ivalue_inputs.emplace_back(value_to_ivalue.at(input));
      input_ssa_defs.emplace_back(value_to_ssa_def.at(input));
    }
    node_inputs_ssa_def_map_[node_idx] = input_ssa_defs;
    nodes_.emplace_back(
        ProcessedNode(node, std::move(ivalue_inputs), opts.enable_out_variant));
    for (size_t i = 0; i < node->outputs().size(); ++i) {
      value_to_ivalue[node->outputs()[i]] = nullptr;
      value_to_ssa_def[node->outputs()[i]] = std::make_pair(node_idx, i);
    }
    node_idx++;
  }
  for (auto output : graph_->outputs()) {
    output_ssa_defs_.emplace_back(value_to_ssa_def[output]);
  }

  AliasDb alias_db(graph_);
  auto lm = GetLivenessInformation(graph_, alias_db);
  external_values_ = lm.second;
  if (opts_.optimize_memory) {
    auto values = GetOptimizableValues(graph_);
    if (!opts_.enable_out_variant) {
      values.first = {};
    }
    value_to_same_storage_values_ = FindSameStorageValues(lm, values, alias_db);
  }
}
const StaticModuleOptions& StaticModule::opts() const {
  return opts_;
}

size_t StaticModule::num_outputs() const {
  return graph_->outputs().size();
}

size_t StaticModule::num_inputs() const {
  return graph_->inputs().size();
}

StaticRuntime& StaticModule::runtime() {
  if (!cached_runtime_) {
    cached_runtime_ = std::make_unique<StaticRuntime>(*this);
  }
  return *cached_runtime_;
}

std::vector<at::Tensor> StaticModule::operator()(
    const std::vector<at::Tensor>& inps) {
  return runtime()(inps);
}

c10::IValue StaticModule::operator()(
    const std::vector<c10::IValue>& args,
    const std::unordered_map<std::string, c10::IValue>& kwargs) {
  return runtime()(args, kwargs);
}
StaticRuntime::StaticRuntime(const StaticModule& sm) : static_module_(sm) {
  // NB: create unchanging std::vector<IValue>s we can reference
  inputs_.resize(sm.num_inputs());
  nodes_.resize(sm.nodes().size());
  for (auto idx = 0; idx < sm.nodes().size(); ++idx) {
    const auto& n_ref = sm.nodes()[idx];
    nodes_[idx] = n_ref; // copy the node
    auto& n = nodes_[idx];
    // hook up the inputs
    for (auto i = 0; i < n.inputs().size(); ++i) {
      if (n.inputs()[i] == nullptr) {
        int node_idx;
        int out_idx;
        std::tie(node_idx, out_idx) = sm.index_map().at(idx)[i];
        DCHECK(out_idx >= 0);
        // input
        if (node_idx == StaticModule::INPUT_VALUE) {
          n.set_input(i, &inputs_[out_idx]);
        } else if (node_idx == StaticModule::CONSTANT_VALUE) {
          n.set_input(i, &sm.constants()[out_idx]);
        } else {
          n.set_input(i, &(nodes_[node_idx].Output(out_idx)));
        }
      }
    }
  }

  for (const auto& index_pair : sm.output_indices()) {
    int node_idx;
    int out_idx;
    std::tie(node_idx, out_idx) = index_pair;
    if (node_idx == StaticModule::INPUT_VALUE) {
      outputs_.emplace_back(&inputs_[out_idx]);
    } else if (node_idx == StaticModule::CONSTANT_VALUE) {
      // This is a very rare case where const correctness
      // breaks -- the user is returning a constant from
      // the graph.
      outputs_.emplace_back(const_cast<IValue*>(&sm.constants()[out_idx]));
    } else {
      auto& n = nodes_.at(node_idx);
      auto* out = &n.Output(out_idx);
      outputs_.emplace_back(out);
    }
  }
}

std::vector<at::Tensor> StaticRuntime::operator()(
    const std::vector<at::Tensor>& inps) {
  std::vector<c10::IValue> stack;
  stack.resize(inps.size());
  for (size_t i = 0; i < inps.size(); i++) {
    stack[i] = inps[i];
  }

  c10::IValue v =
      (*this)(stack, std::unordered_map<std::string, c10::IValue>());

  std::vector<at::Tensor> out;

  if (v.isTuple()) {
    auto t = v.toTuple();
    for (const auto& el : t->elements()) {
      out.emplace_back(el.toTensor());
    }
  } else {
    out.emplace_back(v.toTensor());
  }
  return out;
}

c10::IValue StaticRuntime::operator()(
    const std::vector<c10::IValue>& args,
    const std::unordered_map<std::string, c10::IValue>& kwargs) {
  // We assume inference workloads, so we do not need
  // autograd. Enabling this is a significant win on dispatcher
  // overhead because it saves a round of dispatch for at least some
  // functions, such as resize_ and resize_as_.
  at::AutoNonVariableTypeMode non_var_type_mode(true);

  if (planner_) {
    planner_->allocate();
  }

  if (!kwargs.empty()) {
    // This is not ideal
    TORCH_CHECK(
        static_module_.schema(),
        "Schema is not available. Consider creating the Static Runtime "
        "with StaticModule(const torch::jit::Module& m) instead.");
    std::vector<c10::IValue> s = args;
    static_module_.schema()->checkAndNormalizeInputs(s, kwargs);
    for (size_t i = 0; i < s.size(); i++) {
      Input(i) = std::move(s[i]);
    }
  } else {
    for (size_t i = 0; i < args.size(); i++) {
      Input(i) = args[i];
    }
  }

  // NB: before optimizing the order of execution, ensure that the
  // memory optimization pass (LivenessMap) is
  // aware of the new order!
  for (auto& n : nodes_) {
    n.run();
  }

  if (static_module_.opts().cleanup_activations) {
    if (!planner_) {
      planner_ = std::make_unique<MemoryPlanner>(
          this,
          static_module_.values_share_same_storage(),
          static_module_.external_values(),
          static_module_.opts().enable_out_variant);
    }
    planner_->deallocate();
    // clean up owning refs of input tensors
    for (IValue& ival : inputs_) {
      ival = IValue();
    }
  }

  // no need to keep references of outputs in static runtime anymore
  if (static_module_.num_outputs() > 1) {
    std::vector<c10::IValue> outputs;
    outputs.reserve(static_module_.num_outputs());
    for (auto i = 0; i < static_module_.num_outputs(); ++i) {
      // use move here. Otherwise, clean up outputs_[i] explicitly
      outputs.emplace_back(std::move(*outputs_[i]));
    }
    return c10::ivalue::Tuple::create(std::move(outputs));
  }

#ifndef NDEBUG
  check_for_memory_leak(false);
#endif

  // use move here. Otherwise, clean up outputs_[0] explicitly
  return std::move(*outputs_[0]);
}
void StaticRuntime::benchmark(
    const std::vector<c10::IValue>& args,
    const std::unordered_map<std::string, c10::IValue>& kwargs,
    const int warmup_runs,
    const int main_runs) {
  float time_per_iter = benchmark_model(args, kwargs, warmup_runs, main_runs);
  std::cout << "Static runtime ms per iter: " << time_per_iter
            << ". Iters per second: " << 1000.0 / time_per_iter << std::endl;

  IndividualMetrics results =
      benchmark_individual_ops(args, kwargs, warmup_runs, main_runs);

  for (size_t i = 0; i < nodes_.size(); i++) {
    const Node* node = nodes_[i].node();
    std::cout << "Node #" << i << ": " << results.time_per_node[i]
              << " ms/iter, ";
    node->print(std::cout, 0, nullptr, false);
  }

  std::vector<std::pair<std::string, double>> time_per_node_type_vec{
      results.time_per_node_type.begin(), results.time_per_node_type.end()};
  std::sort(
      time_per_node_type_vec.begin(),
      time_per_node_type_vec.end(),
      [](auto& left, auto& right) { return left.second > right.second; });

  std::cout << "Time per node type:" << std::endl;
  for (const auto& p : time_per_node_type_vec) {
    const std::string& kind = p.first;
    const double ms = p.second;
    std::cout << std::setw(15) << ms << " ms. " << std::setw(10)
              << results.percent_per_node_type[kind] << "%. " << kind << " ("
              << results.instances_per_node_type[kind] << " nodes)"
              << std::endl;
  }
  std::cout << std::setw(15) << results.total_time << " ms. in Total"
            << std::endl;
  std::cout << "StaticRuntime setup time: " << results.setup_time << " ms"
            << std::endl;
  std::cout << "Memory allocation time: " << results.memory_alloc_time
            << " ms\n";
  std::cout << "Memory deallocation time: " << results.memory_dealloc_time
            << " ms" << std::endl;
  std::cout << "Outputs deallocation time: " << results.output_dealloc_time
            << " ms" << std::endl;

  if (planner_) {
    std::cout << "Total memory managed: " << planner_->total_managed()
              << " bytes" << std::endl;
    if (static_module_.opts().optimize_memory) {
      std::cout << "Total number of reused tensors: "
                << planner_->total_reused_tensors() << std::endl;
    }
  }
}

float StaticRuntime::benchmark_model(
    const std::vector<c10::IValue>& args,
    const std::unordered_map<std::string, c10::IValue>& kwargs,
    const int warmup_runs,
    const int main_runs) {
  TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1);

  for (int i = 0; i < warmup_runs; i++) {
    operator()(args, kwargs);
  }
  caffe2::Timer timer;
  for (int i = 0; i < main_runs; i++) {
    operator()(args, kwargs);
  }
  float millis = timer.MilliSeconds();
  return millis / static_cast<float>(main_runs);
}

StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
    const std::vector<c10::IValue>& args,
    const std::unordered_map<std::string, c10::IValue>& kwargs,
    const int warmup_runs,
    const int main_runs) {
  TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1);

  // See comment on above use of AutoNonVariableTypeMode for
  // explanation.
  at::AutoNonVariableTypeMode non_var_type_mode(true);

  IndividualMetrics results;
  results.time_per_node.resize(nodes_.size(), 0);

  // setup time
  caffe2::Timer timer;
  std::vector<IValue> stack(args);
  if (!kwargs.empty()) {
    // This is not ideal
    TORCH_CHECK(
        static_module_.schema(),
        "Schema is not available. Consider creating the Static Runtime "
        "with StaticModule(const torch::jit::Module& m) instead.");
    static_module_.schema()->checkAndNormalizeInputs(stack, kwargs);
  }
  for (size_t i = 0; i < stack.size(); i++) {
    Input(i) = stack[i];
  }
  results.setup_time = timer.MilliSeconds();

  // warmup runs
  for (int i = 0; i < warmup_runs; i++) {
    operator()(args, kwargs);
  }

  // main runs
  for (int k = 0; k < main_runs; k++) {
    for (size_t i = 0; i < stack.size(); i++) {
      Input(i) = stack[i];
    }
    timer.Start();
    if (planner_) {
      planner_->allocate();
    }
    float millis = timer.MilliSeconds();
    results.memory_alloc_time += millis;

    for (size_t i = 0; i < nodes_.size(); i++) {
      timer.Start();
      nodes_[i].run();
      millis = timer.MilliSeconds();
      results.time_per_node[i] += millis;
    }
    timer.Start();
    if (static_module_.opts().cleanup_activations) {
      if (!planner_) {
        planner_ = std::make_unique<MemoryPlanner>(
            this,
            static_module_.values_share_same_storage(),
            static_module_.external_values(),
            static_module_.opts().enable_out_variant);
      }
      planner_->deallocate();
      // clean up owning refs of input tensors
      for (IValue& ival : inputs_) {
        ival = IValue();
      }
    }
    millis = timer.MilliSeconds();
    results.memory_dealloc_time += millis;

    timer.Start();
    // no need to keep references of outputs in static runtime anymore
    c10::IValue output;
    if (static_module_.num_outputs() > 1) {
      std::vector<c10::IValue> outputs;
      outputs.reserve(static_module_.num_outputs());
      for (auto i = 0; i < static_module_.num_outputs(); ++i) {
        // use move here. Otherwise, clean up outputs_[i] explicitly
        outputs.emplace_back(std::move(*outputs_[i]));
      }
      output = c10::ivalue::Tuple::create(std::move(outputs));
    }

#ifndef NDEBUG
    check_for_memory_leak(false);
#endif

    // use move here. Otherwise, clean up outputs_[0] explicitly
    output = std::move(*outputs_[0]);
    // release outputs explicitly to measure the time it takes
    output = IValue();
    millis = timer.MilliSeconds();
    results.output_dealloc_time += millis;
  }

  // post processing
  for (size_t i = 0; i < nodes_.size(); i++) {
    const Node* node = nodes_[i].node();
    std::string kind = std::string(node->kind().toQualString());
    results.time_per_node[i] /= static_cast<float>(main_runs);
    results.time_per_node_type[kind] += results.time_per_node[i];
    results.instances_per_node_type[kind]++;
    results.total_time += results.time_per_node[i];
  }
  results.memory_alloc_time /= static_cast<float>(main_runs);
  results.memory_dealloc_time /= static_cast<float>(main_runs);
  results.output_dealloc_time /= static_cast<float>(main_runs);
  for (const auto& p : results.time_per_node_type) {
    const std::string& kind = p.first;
    results.percent_per_node_type[kind] = p.second / results.total_time * 100;
  }
  return results;
}

void StaticRuntime::check_for_memory_leak(bool output_returned) {
  if (!static_module_.opts().cleanup_activations) {
    return;
  }

  // check for inputs
  for (size_t i = 0; i < inputs_.size(); i++) {
    TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up");
  }

  std::unordered_set<const IValue*> output_ivalues(
      outputs_.begin(), outputs_.end());
  for (size_t n = 0; n < nodes_.size(); n++) {
    auto& pnode = nodes_[n];
    for (size_t i = 0; i < pnode.outputs().size(); i++) {
      const IValue* ival = &pnode.Output(i);
      const std::string error_msg = "Output " + c10::to_string(i) +
          " of node " + c10::to_string(n) + " was not cleaned up";
      if (output_ivalues.count(ival) == 0) {
        // check for intermediates
        if (!ival->isNone()) {
          TORCH_CHECK(
              ival->isTensor() || canOptimizeConstruct(pnode.node()),
              error_msg);
          if (ival->isTensor()) {
            const auto& t = ival->toTensor();
            if (t.defined()) {
              const auto* storage_impl = t.storage().unsafeGetStorageImpl();
              TORCH_CHECK(storage_impl->data() == nullptr, error_msg);
            }
          }
        }
      } else {
        // check for outputs
        if (output_returned) {
          TORCH_CHECK(ival->isNone(), error_msg);
        }
      }
    }
  }
}
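// Note on how the planner is used by StaticRuntime::operator() above: it is
// constructed after the first run; deallocate() records, per storage group,
// the largest aligned size observed while releasing the backing memory, and
// the next allocate() call then carves all managed tensors out of a single
// buffer of managed_bytes_ bytes.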
MemoryPlanner::MemoryPlanner(
    StaticRuntime* runtime,
    const std::unordered_map<const Value*, std::vector<const Value*>>&
        value_to_same_storage_values,
    const std::unordered_set<const Value*>& external_values,
    bool out_variants) {
  // collect register indices of outputs of ops with out variant
  std::unordered_set<const Value*> managed_values;
  std::unordered_set<IValue*> unmanaged_ivalues;
  for (ProcessedNode& pnode : runtime->nodes()) {
    if (canReuseInputsOutputs(pnode.node())) {
      for (auto i = 0; i < pnode.outputs().size(); ++i) {
        // Types are stored in the underlying TorchScript IR
        const Value* out_v = pnode.node()->outputs()[i];
        IValue& out = pnode.Output(i);
        const auto& type = out_v->type();
        if (out_variants && !external_values.count(out_v)) {
          if (type->cast<TensorType>()) {
            managed_values.insert(out_v);
          } else if (canOptimizeConstruct(pnode.node())) {
            // We "leak" containers of this type
          } else {
            unmanaged_ivalues.insert(&out);
          }
        } else {
          unmanaged_ivalues.insert(&out);
        }
      }
    } else {
      for (auto i = 0; i < pnode.outputs().size(); ++i) {
        unmanaged_ivalues.insert(&pnode.Output(i));
      }
    }
  }

  // remove model outputs from managed_values and unmanaged_ivalues
  for (const Value* output : runtime->graph().outputs()) {
    managed_values.erase(output);
  }
  for (IValue* output : runtime->outputs()) {
    unmanaged_ivalues.erase(output);
  }

  // unmanaged_ivalues => unmanaged_ivalues_
  for (IValue* out : unmanaged_ivalues) {
    unmanaged_ivalues_.emplace_back(out);
  }

  // map Value to index to managed_storage_, where multiple values can
  // map to the same index (i.e., sharing the same storage)
  std::unordered_map<const Value*, size_t> value_to_storage_idx;
  // the StorageImpls of Tensor views should not be managed
  std::unordered_set<c10::StorageImpl*> managed_storage_impls;

  // Snapshot of the current memory state
  for (const auto& pnode : runtime->nodes()) {
    for (auto i = 0; i < pnode.outputs().size(); ++i) {
      const auto& ival = pnode.outputs()[i];
      const auto* val = pnode.node()->outputs()[i];
      if (managed_values.count(val)) {
        TORCH_CHECK(ival.isTensor());
        auto* impl = ival.toTensor().storage().unsafeGetStorageImpl();

        auto didInsert = managed_storage_impls.insert(impl).second;
        if (!didInsert) {
          continue;
        }

        if (value_to_storage_idx.count(val)) {
          managed_storage_[value_to_storage_idx.at(val)].second.emplace_back(
              impl);
        } else {
          auto p =
              std::make_pair<size_t, std::vector<c10::StorageImpl*>>(0, {impl});
          managed_storage_.emplace_back(std::move(p));
          // first of a group, update the value_to_storage_idx map with the
          // index
          if (value_to_same_storage_values.count(val)) {
            for (const auto* v : value_to_same_storage_values.at(val)) {
              value_to_storage_idx[v] = managed_storage_.size() - 1;
            }
          }
        }
      }
    }
  }
}
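// Illustrative example of the alignment rounding done by
// compute_aligned_tensor_size below (assuming the CPU allocator's 64-byte
// c10::gAlignment): 1 -> 64, 64 -> 64, 100 -> 128.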
// Don't change the size if it is already aligned, otherwise increase the size
// to make it aligned.
size_t MemoryPlanner::compute_aligned_tensor_size(size_t nbytes) {
  // Note: everything below is size_t
  return (nbytes + c10::gAlignment - 1) & (~(c10::gAlignment - 1));
}

at::DataPtr MemoryPlanner::allocate_buffer(size_t size) {
  at::Allocator* allocator = c10::GetCPUCachingAllocator();
  return allocator->allocate(size);
}

void MemoryPlanner::allocate() {
  if (managed_bytes_ == 0) {
    return;
  }
  buffer_ = allocate_buffer(managed_bytes_);

  size_t offset = 0;
  uint8_t* start = static_cast<uint8_t*>(buffer_.get());

  reused_tensors_ = 0;
  for (const auto& ms : managed_storage_) {
    auto tensor_size = ms.first;
    if (tensor_size == 0) {
      continue;
    }
    const auto& impls = ms.second;
    DCHECK_LE(offset + tensor_size, managed_bytes_);
    void* src = static_cast<void*>(start + offset);

    for (auto& impl : impls) {
      impl->set_data_ptr_noswap(at::DataPtr(src, src, nullptr, impl->device()));
      impl->set_nbytes(tensor_size);
      reused_tensors_++;
    }
    // the first storage in each group owns the slot; don't count it as reused
    reused_tensors_--;

    offset += tensor_size;
  }
  DCHECK_EQ(offset, managed_bytes_);
}

void MemoryPlanner::deallocate() {
  managed_bytes_ = 0;

  // free memory used by outputs of ops in out variants
  // but keep the TensorImpl and StorageImpl around
  for (auto& ms : managed_storage_) {
    const auto& impls = ms.second;
    size_t max = 0;
    for (auto& impl : impls) {
      size_t current_size = compute_aligned_tensor_size(impl->nbytes());
      impl->reset();
      max = std::max(max, current_size);
    }
    ms.first = max;
    managed_bytes_ += max;
  }
  for (auto& iv : unmanaged_ivalues_) {
    *iv = IValue();
  }
  buffer_ = {};
}

ProcessedNode::ProcessedNode(
    Node* node,
    std::vector<const IValue*>&& inputs,
    bool enable_out_variants)
    : node_(node), inputs_(std::move(inputs)) {
  // TODO leverage type information
  outputs_.resize(node->outputs().size());

  if (enable_out_variants && canRunOutOfPlace(node)) {
    fn_ = getOutOfPlaceOperation(node);
    std::ostringstream ss;
    node->print(ss, 0, nullptr, false);
    VLOG(1) << "Switch to out variant for node: " << ss.str();
  } else if (canRunNatively(node)) {
    native_fn_ = getNativeOperation(node);
    std::ostringstream ss;
    node->print(ss, 0, nullptr, false);
    VLOG(1) << "Switch to native impl for node: " << ss.str();
  } else if (
      node->kind() != prim::ListConstruct &&
      node->kind() != prim::TupleConstruct &&
      node->kind() != prim::DictConstruct && node->kind() != prim::ListUnpack) {
    const Operator& op = node->getOperator();
    TORCH_CHECK(op.hasOperation());
    op_ = op.getOperation(node);

    std::ostringstream ss;
    node->print(ss, 0, nullptr, false);
    VLOG(1) << "Fallback interpreter for node: " << ss.str();
  }
}

void ProcessedNode::run() {
  if (fn_) {
    fn_(this);
  } else if (native_fn_) {
    native_fn_(this);
  } else {
    std::vector<IValue> stack;
    const size_t size = node_->inputs().size();
    stack.reserve(size);
    for (size_t i = 0; i < size; i++) {
      stack.emplace_back(Input(i));
    }

    DCHECK(op_);
    op_->operator()(&stack);

    DCHECK_EQ(stack.size(), node_->outputs().size());
    for (auto i = 0; i < node_->outputs().size(); i++) {
      Output(i) = std::move(stack[i]);
    }
  }
}

} // namespace jit
} // namespace torch