#include "Python.h" #include "interpreter.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/functions/special.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/python_engine.h" #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/jit/fusion_compiler.h" #include "torch/csrc/jit/generated/aten_dispatch.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/pybind.h" #include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/auto_gil.h" #include namespace py = pybind11; namespace torch { namespace jit { // Before we translate to intepreter instructions, we do // some preprocessing of the graph to turn it into a form that is closer // to what the instructions will look like. // In particular we: // * (TODO) desugar Loop trip counts into c = 0, c += 1 instructions in the loop // * flatten stages so that each stage starts with a load from the stack // and ends with a store to the stack // *. computes move_flags (see Outputs), and inserts // * Drop nodes are inserted for any node that is unused to create a dummy use // that will cause the interpreter to free the node. // A drop node is just a node with no outputs that just pops its inputs off the stack, // to ensure the interpreter release references to nodes that are never used. // Drop nodes are also inserted when the last use of a node is in some conditionally // run control flow (e.g. one side of an If) and the interpreter must free // the node only after the control flow has reconverged // Outputs are: // * graph - the post processed copy of g // * move_flags[n] - a list of booleans, one for each input, // indicating whether this is the last use of the value. The interpreter // should generate a move rather than a copy in this case. // * stage_input_types: the type annotations on the inputs to each stage // these can be removed once the the backward tracer is no longer used // this currently just _removes_ the trip count inputs and checks they are // unused. 
// This currently just _removes_ the trip count inputs and checks they are
// unused. In the future they will be desugared into normal arithmetic to
// provide a loop counter.
void desugarTripCounts(Block * b) {
  for(auto n : b->nodes()) {
    if(n->kind() == kLoop) {
      // remove the trip count from Loop inputs, we don't support it yet
      n->removeInput(0);
      JIT_ASSERT(n->blocks()[0]->inputs()[0]->uses().size() == 0 &&
        "NYI - use of trip count variable");
      n->blocks()[0]->eraseInput(0);
    }
    for(auto sb : n->blocks()) {
      desugarTripCounts(sb);
    }
  }
}

// removes all inputs and outputs of a graph, replacing them with Store nodes
// before, and Load nodes after, each stage
static std::vector<std::vector<TypePtr>> flattenStages(Graph & graph) {
  // because JIT classic needs this to fix up gradients, remove when possible
  std::vector<std::vector<TypePtr>> stage_input_types;
  WithInsertPoint guard(graph, *graph.nodes().begin());
  size_t input_pos = 0;
  size_t output_pos = 0;
  auto it = graph.nodes().begin();
  for(size_t i = 0; i <= graph.stage(); i++) {
    stage_input_types.emplace_back();
    auto store = graph.create(kStore, 0)->insertBefore(*it);
    while(input_pos < graph.inputs().size() && graph.inputs()[input_pos]->stage() == i) {
      auto nv = store->addOutput();
      auto old_node = graph.inputs()[input_pos];
      stage_input_types[i].push_back(old_node->type());
      old_node->replaceAllUsesWith(nv);
      input_pos++;
    }
    while(it != graph.nodes().end() && it->stage() == i)
      ++it;
    auto load = graph.create(kLoad, 0)->insertBefore(*it);
    while(output_pos < graph.outputs().size() && graph.outputs()[output_pos]->stage() == i) {
      load->addInput(graph.outputs()[output_pos]);
      output_pos++;
    }
  }
  while (graph.inputs().size() > 0)
    graph.eraseInput(graph.inputs().size() - 1);
  while (graph.outputs().size() > 0)
    graph.eraseOutput(graph.outputs().size() - 1);
  return stage_input_types;
}

// insert Drop nodes to kill references for anything unused:
// this can happen in a few places, e.g. when a node returns
// many values but only one is used
// a, b = foo()
// return a
void dropUnused(Block *b) {
  auto createDropIfUnused = [&](ArrayRef<Value*> values) -> Node* {
    std::vector<Value*> to_drop;
    for(auto v : values) {
      if(v->uses().size() == 0)
        to_drop.push_back(v);
    }
    if(to_drop.size() == 0)
      return nullptr;
    return b->owningGraph()->create(kDrop, to_drop, 0);
  };

  if(auto d = createDropIfUnused(b->inputs())) {
    b->prependNode(d);
  }
  for(auto n : b->nodes()) {
    if(auto d = createDropIfUnused(n->outputs())) {
      d->insertAfter(n);
    }
    for(auto b : n->blocks())
      dropUnused(b);
  }
}
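// Illustrative example of move_flags (added for clarity; not part of the
// original file): given the single-block program
//   %y = foo(%a, %a)
//   %z = bar(%a)
// nodes are scanned in reverse order, so the use of %a in bar() is seen
// first and is therefore its last use: move_flags[bar] = {true}. Both uses
// in foo() then find %a already in 'seen', so move_flags[foo] = {false, false}
// and the interpreter copies rather than moves those inputs.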
// for each input, should we move rather than copy the inputs
std::unordered_map<Node*, std::vector<uint8_t>> findLastUses(Graph & g) {
  // struct to share common data structures
  struct FindLastUses {
    Graph & graph;
    // have we seen this value yet? if not, it is the last use of the value
    std::unordered_set<Value*> seen;

    std::unordered_map<Node*, std::vector<uint8_t>> move_flags;
    // A map from an If or Loop node to the optional Drop block that
    // occurs directly after it to release any tensors that go out of scope
    // when the If/Loop exits. These are created and inserted on demand.
    std::unordered_map<Node*, Node*> drop_for_node;

    FindLastUses(Graph & g)
    : graph(g) {
      scanBlock(graph.block());
    }
    void scanBlock(Block * b) {
      scanNode(b->return_node());
      for(auto n : b->nodes().reverse()) {
        scanNode(n);
      }
    }
    void scanNode(Node * n) {
      for(auto b : n->blocks()) {
        scanBlock(b);
      }
      move_flags[n].resize(n->inputs().size());
      // scan backwards so if a value is used twice in the list then it is a move
      for(size_t i = n->inputs().size(); i > 0; --i) {
        scanUse(n, i-1);
      }
    }
    void scanUse(Node * n, size_t i) {
      auto & move_flags_n = move_flags[n];
      auto v = n->inputs()[i];
      auto inserted = seen.insert(v).second;
      if(!inserted) {
        move_flags_n[i] = false;
        return;
      }

      // the last use of v may be in a nested block of an If or Loop statement
      // find the node 'same_depth_node' at the same depth as the definition of v,
      // and consider that node to be the last use of v.
      // This ensures we do not delete nodes in nested scopes
      // that may be executed multiple times
      // and that nodes used on one side of an if
      // but not the other get deleted regardless of the branch
      // e.g.
      // a = 4
      // while <...>:
      //   y = a + a
      // drop(a)
      // In other words, we find the first program point for v that
      // _reverse_ dominates the definition of v, and add a drop point there.
      Node * same_depth_node = findOwnerInBlock(n, v->node()->owningBlock());
      JIT_ASSERT(same_depth_node); // failure means v is not in scope for n, use lint!

      // In the case where v and n are in the same block, just mark
      // its move_flags to be true
      if(same_depth_node == n) {
        move_flags_n[i] = true;
        return;
      }

      // in the case where the use is nested in a block
      // add a Drop node after that block which will drop 'v'.
      move_flags_n[i] = false;
      addToDropIfNotExists(findOrCreateDropInstructionForNode(same_depth_node), v);
    }

    // finds the node in block 'block' that contains 'n',
    // or nullptr if no such node exists, e.g.:
    // n0: a = 4
    // n1: if <cond>:
    // n2:   b = a + a
    // findOwnerInBlock(n2, n0.block()) == n1
    Node * findOwnerInBlock(Node * n, Block * block) {
      while(n != nullptr && block != n->owningBlock()) {
        n = n->owningBlock()->owningNode();
      }
      return n;
    }

    Node * findOrCreateDropInstructionForNode(Node * n) {
      auto it = drop_for_node.find(n);
      if(it == drop_for_node.end()) {
        auto drop_node = graph.create(kDrop, 0);
        drop_node->insertAfter(n);
        it = drop_for_node.emplace(n, drop_node).first;
      }
      return it->second;
    }

    void addToDropIfNotExists(Node * drop, Value * v) {
      for(auto i : drop->inputs()) {
        // we already accounted for this use
        if(i == v)
          return;
      }
      drop->addInput(v);
      move_flags[drop].push_back(true);
    }
  };

  return FindLastUses(g).move_flags;
}

// pre-processing that happens once per graph
struct PreprocessGraph {
  PreprocessGraph(Graph & g)
  : graph(g.copy()) {
    desugarTripCounts(graph->block());
    stage_input_types = flattenStages(*graph);
    dropUnused(graph->block());
    // fill in move_flags by scanning blocks
    move_flags = findLastUses(*graph);
    // TODO: desugar Loop trip counts, for now we drop trip counts
  }
  // Outputs of the preprocessing:
  std::shared_ptr<Graph> graph;
  // for each input, should we move rather than copy the inputs
  std::unordered_map<Node*, std::vector<uint8_t>> move_flags;
  std::vector<std::vector<TypePtr>> stage_input_types;
};

// previously the interpreter worked with at::Retainable values,
// which are annoying to handle since 99% of values are at::Tensor anyway
// instead we create a fake subclass of TensorImpl that can be subclassed
// to hold arbitrary things
struct ContainerTensor : public at::TensorImpl {
public:
  ContainerTensor()
  : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined, at::ScalarType::Undefined))) {}
  virtual ~ContainerTensor() {}
  virtual const char * toString() const override {
    throw std::runtime_error("toString() on ContainerTensor");
  }
  virtual at::IntList sizes() const override {
    throw std::runtime_error("sizes() on ContainerTensor");
  }
  virtual at::IntList strides() const override {
    throw std::runtime_error("strides() on ContainerTensor");
  }
  virtual int64_t dim() const override {
    throw std::runtime_error("dim() on ContainerTensor");
  }
  virtual at::Scalar localScalar() override {
    throw std::runtime_error("localScalar() on ContainerTensor");
  }
  virtual void * unsafeGetTH(bool retain) override {
    throw std::runtime_error("unsafeGetTH() on ContainerTensor");
  }
  virtual std::unique_ptr<at::Storage> storage() override {
    throw std::runtime_error("storage() on ContainerTensor");
  }
};

// DummyFunction is the last function that the autograd engine calls
// when evaluating Eval nodes. Its input tensors are the outputs that the
// Eval node needs to produce.
// We intercept these values using an autograd callback, so the function itself
// never runs.
struct DummyFunction : autograd::Function {
  virtual autograd::variable_list apply(const autograd::variable_list& inputs) override {
    throw std::logic_error("DummyFunction::apply() called, but it should be blocked by a callback returning false");
  }
};

// An AutogradHandle holds the information needed to run an autograd backward pass
// after running a forward operator (such as PythonOp, CppOp, or, for double-backwards, another Eval op).
// The Eval operation uses AutogradHandle to perform this operation.
struct AutogradHandle : public ContainerTensor {
  // The inputs of DummyFunction are the gradients of the forward pass's
  // inputs, and they are the _outputs_ of the run of the autograd engine
  // computing backward.
  // there is one entry in this list for each forward input that requires
  // gradients
  std::shared_ptr<DummyFunction> forward_inputs;

  // there is one entry in this list for each output of the forward pass
  // that represents the location in the backward pass where the gradient
  // of this output should be inserted at the beginning of the backward pass
  autograd::edge_list forward_outputs;
};
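// Illustrative sketch of the handle flow (added for clarity; not part of the
// original file): in the forward stage, a PythonOp/CppOp that has a handle
// output runs through HandleBuilder (below), which records gradient edges and,
// in writeTo(), pushes the AutogradHandle onto the stack disguised as the op's
// extra tensor output. In a later stage, the matching Eval op takes that
// handle tensor back off the stack (see createEvalOperation) and hands
// forward_outputs/forward_inputs to the autograd engine to produce gradients.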
// HandleBuilder is used to construct the correct autograd Handle objects
// for use in a future stage.
// It is used even when the future stage does not require a handle since
// it also performs the conversions between Tensor and Variable, which
// behave differently depending on whether a future handle needs to be
// created.
struct HandleBuilder {
  HandleBuilder(bool requires_handle) {
    if(requires_handle) {
      handle = new AutogradHandle();
      handle->forward_inputs = std::make_shared<DummyFunction>();
    }
  }
  autograd::Variable addInput(at::Tensor && input, const VariableFlags & flags_) {
    if(handle && flags_.requires_grad) {
      auto variable = autograd::make_variable(std::move(input), /*requires_grad=*/false);
      autograd::create_gradient_edge(variable, handle->forward_inputs);
      return variable;
    } else {
      return autograd::make_variable(std::move(input), /*requires_grad=*/false);
    }
  }
  at::Tensor addOutput(const autograd::Variable & output) {
    if(handle) {
      handle->forward_outputs.push_back(output.gradient_edge());
    }
    return output.data();
  }
  void writeTo(Stack & outputs) {
    // outputs takes ownership of handle
    if(handle) {
      outputs.push_back(at::Tensor(handle, /*retain=*/false));
      handle = nullptr;
    }
  }
private:
  AutogradHandle* handle = nullptr;
};

bool hasHandleOutput(Node * n) {
  if(n->outputs().size() == 0)
    return false;
  auto & last = n->outputs().back();
  return last->isHandle() && last->uses().size() > 0; // don't bother creating a handle if it is never used
}

Operation createPythonOperation(PythonOp* op) {
  py::object func = py::handle(op->pyobj.get()).attr("apply");
  bool has_handle = hasHandleOutput(op);
  size_t num_inputs = 0;
  for(auto arg_type : op->cconv) {
    if(arg_type == 't')
      num_inputs++;
  }
  return [=](Stack & stack) {
    AutoGIL gil;
    py::tuple py_inputs(op->cconv.size());
    size_t i = 0;
    size_t next_scalar = 0;
    size_t next_tensor = 0;
    HandleBuilder builder(has_handle);
    for(auto arg_type : op->cconv) {
      if(arg_type == 's') {
        py_inputs[i] = py::reinterpret_borrow<py::object>(op->scalar_args[next_scalar++].get());
      } else if(arg_type == 't') {
        py_inputs[i] = py::reinterpret_steal<py::object>(THPVariable_Wrap(
          builder.addInput(std::move(fromLast(stack, num_inputs - next_tensor)), op->var_flags.at(next_tensor))));
        next_tensor++;
      }
      i++;
    }
    drop(stack, num_inputs);
    py::object py_outputs(func(*py_inputs));

    auto addOutput = [&](py::handle entry) {
      if(!THPVariable_Check(entry.ptr())) {
        throw std::runtime_error("Function.apply returned a non-Variable output");
      }
      THPVariable *var = (THPVariable*) entry.ptr();
      stack.push_back(builder.addOutput(var->cdata));
    };
    if(!PyTuple_Check(py_outputs.ptr())) {
      addOutput(py_outputs);
    } else {
      for(py::handle entry : py::tuple(py_outputs)) {
        addOutput(entry);
      }
    }
    builder.writeTo(stack);
    return 0;
  };
}

Operation createCppOperation(CppOp* op) {
  std::shared_ptr<autograd::Function> func = op->fn;
  bool has_handle = hasHandleOutput(op);
  auto num_inputs = op->inputs().size();
  return [=](Stack & stack) {
    HandleBuilder builder(has_handle);
    autograd::variable_list v_inputs;
    for(size_t i = 0; i < num_inputs; i++) {
      v_inputs.push_back(builder.addInput(std::move(fromLast(stack, num_inputs - i)), op->var_flags[i]));
    }
    drop(stack, num_inputs);
    autograd::variable_list v_outputs = (*func)(v_inputs);
    for(auto & output : v_outputs) {
      stack.push_back(builder.addOutput(output));
    }
    builder.writeTo(stack);
    return 0;
  };
}

Operation createEvalOperation(CppOp * op) {
  bool has_handle_output = hasHandleOutput(op);
  auto num_inputs = op->inputs().size();
  return [=](Stack & stack) {
    at::Tensor handle_t = std::move(stack.back());
    AutogradHandle * handle_in = dynamic_cast<AutogradHandle*>(handle_t.get());
    JIT_ASSERT(handle_in);
    HandleBuilder builder(has_handle_output);
    auto& engine = autograd::python::PythonEngine::getDefaultEngine();
    autograd::variable_list v_inputs;
    for(size_t i = 0; i < num_inputs - 1; i++) {
      v_inputs.push_back(builder.addInput(std::move(fromLast(stack, num_inputs - i)), op->var_flags[i]));
    }
    drop(stack, num_inputs);

    // TODO: handle create_graph appropriately
    bool create_graph = true;
    // note: handle_in->use_count() == 1 means that we are guaranteed that we have
    // the only copy of the handle. This might make it seem it is ok to pass keep_graph=False.
    // However, it is possible for 'copied_next_fns' to grab functions used by _other_ handles,
    // and these functions will be executed in this run. Since these other handles
    // may still be alive, it is not safe to release the graph.
    // TODO: we could cache this list in AutogradHandle (it's read only)
    autograd::edge_list output_edges;
    const auto num_inputs = handle_in->forward_inputs->num_inputs();
    output_edges.reserve(num_inputs);
    for (uint32_t i = 0; i < num_inputs; ++i)
      output_edges.emplace_back(handle_in->forward_inputs, i);
    auto values = engine.execute(handle_in->forward_outputs, v_inputs, true, create_graph, output_edges);
    for(auto & v : values)
      stack.push_back(builder.addOutput(v));
    builder.writeTo(stack);
    return 0;
  };
}

// Returns a function implementing the functionality of a given node,
// or nullptr if it's a no-op for autograd.
Operation getOperation(jit::Node *node, bool constants_are_variables) {
  IR_IFM(node, PythonOp)
    return createPythonOperation(value);
  IR_ELSEIFM(CppOp)
    if(dynamic_cast<autograd::Eval*>(value->fn.get())) {
      return createEvalOperation(value);
    } else {
      return createCppOperation(value);
    }
  IR_ELSEIF(FusionGroup)
    auto fusion_fn = sharedFusionCompiler().getOrCompile(value);
    auto num_inputs = value->inputs().size();
    return [fusion_fn, num_inputs](Stack & stack) {
      autograd::profiler::RecordFunction record("FusionGroup");
      std::vector<at::Tensor> toutputs;
      // TODO: have fusion_fn work off of a stack as well
      fusion_fn->launch(last(stack, num_inputs), toutputs);
      drop(stack, num_inputs);
      stack.insert(stack.end(), toutputs.begin(), toutputs.end());
      return 0;
    };
  IR_ELSEIF(Constant)
    if(constants_are_variables) {
      auto t = torch::autograd::make_variable(value->t(kvalue), false);
      return [t](Stack & stack) {
        stack.push_back(t);
        return 0;
      };
    } else {
      auto t = value->t(kvalue);
      return [t](Stack & stack) {
        stack.push_back(t);
        return 0;
      };
    }
  IR_ELSEIF(Undefined)
    return [](Stack & stack) {
      stack.push_back(at::Tensor());
      return 0;
    };
  IR_ELSEIF(ReplaceIfUndef)
    return [](Stack & stack) {
      auto alternate = pop(stack);
      auto result = pop(stack);
      if(result.defined()) {
        stack.push_back(std::move(result));
      } else {
        stack.push_back(std::move(alternate));
      }
      return 0;
    };
  IR_ELSEIF(Print)
    size_t num_inputs = value->inputs().size();
    return [num_inputs](Stack & stack) {
      bool first = true;
      for (at::Tensor i : last(stack, num_inputs)) {
        if (!first) std::cout << " ";
        first = false;
        if (auto tensor_impl = dynamic_cast<torch::autograd::VariableImpl*>(i.get())) {
          std::cout << at::Tensor(tensor_impl, true);
        } else if (!i.defined()) {
          std::cout << "<undefined tensor>";
        } else {
          std::cout << "<" << typeid(*i.get()).name() << " at " << i << ">";
        }
      }
      drop(stack, num_inputs);
      std::cout << std::endl;
      return 0;
    };
  IR_ELSEIF(GraphExecutor)
    GraphExecutor executor(value->g(kSubgraph));
    auto num_inputs = value->inputs().size();
    return [=](Stack& stack) mutable {
      autograd::profiler::RecordFunction record("GraphExecutor");
      auto inputs = last(stack, num_inputs);
      variable_tensor_list tinputs(inputs.begin(), inputs.end());
      drop(stack, num_inputs);
      // TODO: have graph executor work from a stack as well
      variable_tensor_list toutputs = executor.run(variable_tensor_list(std::move(tinputs)));
      stack.insert(stack.end(), toutputs.begin(), toutputs.end());
      return 0;
    };
  // Load x, y
  // loads values from registers onto the stack; the actual callback
  // does nothing since the stack manipulation is already encoded in inst.inputs
  // and inst.outputs
  IR_ELSEIF(Load)
    return [=](Stack& stack) {
      return 0;
    };
  // x, y = Store
  // stores values from the stack into registers; the actual callback does
  // nothing since the stack manipulation is already encoded in inst.inputs
  // and inst.outputs
  IR_ELSEIF(Store)
    return [=](Stack& stack) {
      return 0;
    };
  IR_ELSEIF(Drop)
    auto N = value->inputs().size();
    return [=](Stack& stack) {
      drop(stack, N);
      return 0;
    };
  IR_ELSE()
    return getTensorOp(node).op;
  IR_END()
}

// We need some lists for inputs and outputs. To keep all the memory
// contiguous we allocate a single vector and use offsets into the vector,
// which are stored in the ListHandle struct.
// start is an offset into int_data of Code for ListHandle<int>
// and into bool_data of Code for ListHandle<bool>
template<typename T>
struct ListHandle {
  int start;
  int size;
};

struct UseList {
  // values to be used
  ListHandle<int> values;
  // boolean flags indicating whether to free the Tensor after this use
  ListHandle<bool> free_flags;
};

// one instruction plus meta-data
struct Instruction {
  Operation callback;
  UseList inputs;
  ListHandle<int> outputs;
  Symbol debug_name; // used in dump to understand the generated code
};

int relativeJump(int from_inst, int to_inst) {
  return to_inst - (from_inst + 1);
}

struct CodeImpl {
  CodeImpl(std::shared_ptr<Graph> & graph_, bool constants_are_variables)
  : constants_are_variables(constants_are_variables)
  , preprocess(*graph_) {
    graph = preprocess.graph;
    // std::cout << "into code graph:\n" << *graph << "\n";
    insertNodesFromBlock(graph->block());
  }

  // jump when input is 0
  void createJumpZ(int from_inst, int to_inst) {
    auto & inst = instructions[from_inst];
    JIT_ASSERT(inst.debug_name == kPlaceholder);
    auto offset = relativeJump(from_inst, to_inst);
    inst.callback = [offset](Stack & stack) {
      auto t = tensor_as<int64_t>(pop(stack));
      return (t == 0) ? offset : 0;
    };
    inst.debug_name = kJumpZ;
  }

  // jump when input is not 0
  void createJumpNZ(int from_inst, int to_inst) {
    auto & inst = instructions[from_inst];
    JIT_ASSERT(inst.debug_name == kPlaceholder);
    auto offset = relativeJump(from_inst, to_inst);
    inst.callback = [offset](Stack & stack) {
      auto t = tensor_as<int64_t>(pop(stack));
      return (t != 0) ? offset : 0;
    };
    inst.debug_name = kJumpNZ;
  }
  void createJump(int from_inst, int to_inst) {
    auto & inst = instructions[from_inst];
    JIT_ASSERT(inst.debug_name == kPlaceholder);
    auto offset = relativeJump(from_inst, to_inst);
    inst.callback = [=](Stack & stack) {
      return offset;
    };
    inst.debug_name = kJump;
  }

  void insertNodesFromBlock(Block* block) {
    for(auto node : block->nodes()) {
      switch(node->kind()) {
        case kIf: {
          // x = if c:
          //   <then_block>
          //   -> (vt)
          // else:
          //   <else_block>
          //   -> (vf)
          // turns into:
          //   JumpNZ c, then
          //   <else_block>
          //   x = vf
          //   Jump end
          // then:
          //   <then_block>
          //   x = vt
          // end:
          // kPlaceholder instructions are replaced with branch instructions
          // when the branch target locations are known
          auto cond_branch = insertInstruction(kPlaceholder, node->inputs(), moveFlags(node), {});
          auto then_block = node->blocks()[0];
          auto else_block = node->blocks()[1];
          insertNodesFromBlock(else_block);
          insertAssign(else_block->outputs(), moveFlags(else_block), node->outputs());
          auto jump = insertInstruction(kPlaceholder, {}, {}, {});
          auto then_block_start = instructions.size();
          insertNodesFromBlock(then_block);
          insertAssign(then_block->outputs(), moveFlags(then_block), node->outputs());
          createJump(jump, instructions.size());
          createJumpNZ(cond_branch, then_block_start);
        } break;
        case kLoop: {
          // o0 = while c i0
          //   block 0: l0
          //     <body>
          //     -> (v0, v1)
          // turns into:
          //   l0 = i0
          //   JumpZ c, end
          // begin:
          //   <body>
          //   c, l0 = v0, v1
          //   JumpNZ c, begin
          // end:
          auto body_block = node->blocks()[0];

          // before assign op: stack: ... <cond> <loop-carried-deps>
          insertAssign(node->inputs(), moveFlags(node), body_block->inputs());
          // after assign op: stack: ... <cond>
          // cond_branch consumes <cond> from top of the stack
          auto cond_branch = insertInstruction(kPlaceholder, {}, {}, {});
          // after branch: stack: ...

          auto entry = instructions.size();
          insertNodesFromBlock(body_block);
          // before assign op: stack: ... <cond> <loop-carried-deps>
          insertAssign(body_block->outputs(), moveFlags(body_block), body_block->inputs());
          // after assign op: stack: ... <cond>
          auto cond_branch_end = insertInstruction(kPlaceholder, {}, {}, {});
          // after branch: stack: ...
          aliasRegistersTo(node->outputs(), body_block->inputs());
          createJumpZ(cond_branch, instructions.size());
          createJumpNZ(cond_branch_end, entry);
        } break;
        default: {
          insertInstruction(node);
        } break;
      }
      // each stage ends with a Load instruction
      // we record where these instructions occur, and use them to
      // exit the interpreter
      if(node->kind() == kLoad) {
        stage_end.push_back(instructions.size());
      }
    }
  }

  size_t insertInstruction(Node * n) {
    auto inst = insertInstruction(n->kind(), n->inputs(), moveFlags(n), n->outputs());
    instructions[inst].callback = getOperation(n, constants_are_variables);
    return inst;
  }
  size_t insertInstruction(Symbol sym, ArrayRef<Value*> inputs, ArrayRef<uint8_t> move_flags, ArrayRef<Value*> outputs) {
    instructions.emplace_back();
    auto & inst = instructions.back();
    inst.debug_name = sym;
    listBegin(inst.inputs.values);
    for(auto input : inputs) {
      listInsert(inst.inputs.values, getOrAllocateRegister(input, true));
    }
    listBegin(inst.inputs.free_flags);
    for(auto flag : move_flags) {
      listInsert(inst.inputs.free_flags, flag);
    }
    listBegin(inst.outputs);
    for(auto output : outputs) {
      listInsert(inst.outputs, getOrAllocateRegister(output));
    }
    return instructions.size() - 1;
  }
  ArrayRef<uint8_t> moveFlags(Node * n) {
    return preprocess.move_flags.at(n);
  }
  ArrayRef<uint8_t> moveFlags(Block *b) {
    return moveFlags(b->return_node());
  }

  size_t insertAssign(ArrayRef<Value*> inputs, ArrayRef<uint8_t> move_flags, ArrayRef<Value*> outputs) {
    auto inst = insertInstruction(kAssign, inputs, move_flags, outputs);
    // This node effectively forwards its inputs into different places in a register list.
    // We don't need to manipulate the stack in any way, because all inputs are also outputs,
    // and the interpreter will take care of putting them in correct places.
    instructions[inst].callback = [](Stack& stack) { return 0; };
    return inst;
  }

  // helpers to build/access RegList objects
  int get(const ListHandle<int> & list, int i) const {
    return int_data[list.start + i];
  }
  bool get(const ListHandle<bool> & list, int i) const {
    return bool_data[list.start + i];
  }
  void listBegin(ListHandle<int> & list) {
    list.start = int_data.size();
    list.size = 0;
  }
  void listInsert(ListHandle<int> & list, int value) {
    JIT_ASSERTM(list.start + list.size == (int)int_data.size(), "another list already started");
    int_data.push_back(value);
    list.size++;
  }
  void listBegin(ListHandle<bool> & list) {
    list.start = bool_data.size();
    list.size = 0;
  }
  void listInsert(ListHandle<bool> & list, int value) {
    JIT_ASSERTM(list.start + list.size == (int)bool_data.size(), "another list already started");
    bool_data.push_back(value);
    list.size++;
  }

  // must be called before any new_allocations are used, otherwise they will
  // already have registers assigned
  void aliasRegistersTo(ArrayRef<Value*> new_allocations, ArrayRef<Value*> existing_allocations) {
    JIT_ASSERT(new_allocations.size() == existing_allocations.size());
    for(size_t i = 0; i < new_allocations.size(); ++i) {
      auto n = new_allocations[i]->unique();
      auto e = existing_allocations[i]->unique();
      JIT_ASSERT(unique_to_reg.count(e) > 0 && unique_to_reg.count(n) == 0);
      unique_to_reg[n] = unique_to_reg[e];
    }
  }
  int getOrAllocateRegister(Value * n, bool required = false) {
    size_t u = n->unique();
    if(unique_to_reg.count(u) > 0)
      return unique_to_reg[u];
    JIT_ASSERT(!required);
    int r = register_size++;
    unique_to_reg[u] = r;
    return r;
  }

  void dumpInstruction(std::ostream & out, size_t pc) const {
    auto writeList = [&](const ListHandle<int> & list) {
      for(int i = 0; i < list.size; i++) {
        if(i > 0)
          out << ", ";
        out << get(list, i);
      }
    };
    auto writeUseList = [&](const UseList & list) {
      for(int i = 0; i < list.values.size; i++) {
        if(i > 0)
          out << ", ";
> 0) out << ", "; if(get(list.free_flags, i)) out << "move(" << get(list.values, i) << ")"; else out << get(list.values, i); } }; auto & inst = instructions.at(pc); writeList(inst.outputs); out << " = " << inst.debug_name.toString() << " "; writeUseList(inst.inputs); } void dump(std::ostream & out) const { for(size_t i = 0; i < instructions.size(); ++i) { dumpInstruction(out, i); out << "\n"; } } // We MUST hold onto graph here because some Operators stored in the // instruction lists have dependencies on meta-data stored in the graph // that would be dead otherwise. // It is also very useful for debugging interpreter problems to // keep this around. std::shared_ptr graph; bool constants_are_variables; PreprocessGraph preprocess; std::unordered_map unique_to_reg; // map from unique of nodes to register in register table friend struct InterpreterState; std::vector instructions; std::vector stage_end; // each stage runs while(pc < stage_end[stage]) int register_size = 0; // all memory ArrayRef are slices of this, to make sure // the interpreter is mostly linearly scanning through memory std::vector int_data; std::vector bool_data; }; // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { InterpreterStateImpl(const Code & function_) : function(function_.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { } void runOneStage(Stack & stack) { // std::cout << "running stage: " << current_stage << " of " << function->stage_end.size() << "\n"; // std::cout << *function->graph << "\n"; // function->dump(std::cout); size_t pc = current_pc; size_t last = function->stage_end[current_stage]; auto & instructions = function->instructions; while(pc < last) { // std::cout << "executing " << pc << ": "; // function->dumpInstruction(std::cout, pc); // std::cout << "\n"; auto & inst = instructions[pc]; loadTensorsFromRegisters(inst.inputs, stack); pc += 1 + inst.callback(stack); for(int i = inst.outputs.size - 1; i >= 0; i--) { int reg = get(inst.outputs,i); registers[reg] = pop(stack); // std::cout << "pop reg[" << reg << "];\n" << registers[reg].pImpl << "\n"; } } current_pc = pc; current_stage++; } const TensorType & tensorTypeForInput(size_t i) const { return *function->preprocess.stage_input_types.at(current_stage).at(i)->expect(); } int get(const ListHandle & list, int i) { return int_data[list.start + i]; }; bool get(const ListHandle & list, int i) { return bool_data[list.start + i]; } void loadTensorsFromRegisters(const UseList & uses, Stack & stack) { for(int i = 0; i < uses.values.size; i++) { int reg = get(uses.values,i); // std::cout << "push reg[" << reg << "];\n" << registers[reg] << "\n\n"; if(get(uses.free_flags,i)) { stack.push_back(std::move(registers[reg])); } else { stack.push_back(registers[reg]); } } } size_t current_stage = 0; size_t current_pc = 0; std::shared_ptr function; // keep function alive // these are just copies of function to prevent indirections in interpreter int * int_data; const std::vector & bool_data; // this holds all the tensors for this interpreter run // we don't bother minimizing the size of this vector, since the extra // memory used by the pointers in this will be small // instead we are very aggresive about releasing tensors when they become dead // to make sure memory management happens efficiently. 
  // We optimize for the case where derivatives are run with retain_graph=False;
  // in the case where it is true, the interpreter and this array get copied.
  // If this ever becomes a bottleneck then we _should_ consider minimizing the
  // total number of registers.
  std::vector<at::Tensor> registers;

  // single buffer for input/output calls to ATen functions, so that we do not reallocate
  Stack stack;
};

std::ostream & operator<<(std::ostream & out, const Code & code) {
  out << *code.pImpl->graph << "\n";
  code.pImpl->dump(out);
  return out;
}

Code::Code(std::shared_ptr<Graph> & graph, bool constants_are_variables)
: pImpl(new CodeImpl(graph, constants_are_variables)) {}
Code::~Code() {}

InterpreterState::InterpreterState(const Code & function)
: pImpl(new InterpreterStateImpl(function)) {}
InterpreterState::~InterpreterState() {}

void InterpreterState::runOneStage(Stack & stack) {
  return pImpl->runOneStage(stack);
}

const TensorType & InterpreterState::tensorTypeForInput(size_t i) const {
  return pImpl->tensorTypeForInput(i);
}

InterpreterState InterpreterState::clone() const {
  return InterpreterState(new InterpreterStateImpl(*pImpl));
}

InterpreterState::InterpreterState(InterpreterStateImpl * pImpl) : pImpl(pImpl) {}

}}
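// ---------------------------------------------------------------------------
// Illustrative usage sketch (added for clarity; not part of the original file,
// and the exact call sites are an assumption rather than a documented API):
//
//   Code code(graph, /*constants_are_variables=*/true);
//   InterpreterState state(code);
//   Stack stack;
//   stack.insert(stack.end(), inputs.begin(), inputs.end()); // push stage 0 inputs
//   state.runOneStage(stack);            // 'stack' now holds the stage 0 outputs
//   // pushing gradient inputs and calling runOneStage(stack) again would run
//   // the backward stage, if the graph was traced with one.
// ---------------------------------------------------------------------------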