[Static Runtime] Support prim::GetAttr/SetAttr (#61505)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61505

The handling of `self` in static runtime was previously incorrect. This diff fixes that issue; `self` is essential to `prim::GetAttr`/`prim::SetAttr`, since most of the time we are getting and setting attributes on `self`, the TorchScript module.

Reviewed By: ajyu

Differential Revision: D29350173

fbshipit-source-id: 6e62add4cda517ef8cd6c315d4cb0595e7d531fb
Hao Lu authored on 2021-07-10 14:04:48 -07:00; committed by Facebook GitHub Bot
parent f291b1899f
commit ccd0977060
7 changed files with 262 additions and 128 deletions
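For context before the diffs: a minimal sketch (not part of this commit) of the attribute-mutating pattern that static runtime can now handle. `AttrModule` is a hypothetical stand-in modeled on the `TestModule` added in the test file below; the point is that an attribute write in `forward` survives freezing, so the frozen graph keeps `prim::GetAttr`/`prim::SetAttr` nodes that operate on `%self`.

import torch
from torch import nn

class AttrModule(nn.Module):
    # Hypothetical stand-in for the TestModule added in test_static_runtime below.
    def __init__(self):
        super().__init__()
        self.a = 3
        self.b = 4

    def forward(self, x):
        # The attribute write below survives freezing, so the frozen graph
        # keeps prim::SetAttr (and the prim::GetAttr reads) on %self.
        self.b = 20
        return x + self.a + self.b

scripted = torch.jit.script(AttrModule().eval())
frozen = torch.jit.freeze(scripted)
print(frozen.graph)  # expect prim::SetAttr[name="b"](%self, ...) in the IR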

File: test/test_static_runtime.py

@@ -1,11 +1,11 @@
+import unittest
+from typing import Dict, Optional
 import numpy as np
 import torch
-import unittest
 from torch import nn
 from torch.testing._internal.common_utils import TestCase, run_tests
-from typing import Dict, Optional


 class StaticModule:
     def __init__(self, scripted):
@@ -30,7 +30,9 @@ class StaticModule:
     )


-def linear_shim(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+def linear_shim(
+    input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None
+) -> torch.Tensor:
     output = input.matmul(weight.t())
     if bias is not None:
         output += bias
@@ -107,7 +109,8 @@ def trivial_graph(a, b, c):
     s = torch.tensor([[3, 3], [3, 3]])
     return a + b * c + s

-def loop_graph(a, b, iters : int):
+
+def loop_graph(a, b, iters: int):
     c = a + b * 2
     for i in range(iters):
         c = c + b
@@ -115,14 +118,50 @@ def loop_graph(a, b, iters: int):
         c -= a
     return c

-def output_graph(a, b, c, iters : int):
+
+def output_graph(a, b, c, iters: int):
     s = torch.tensor([[3, 3], [3, 3]])
     k = a + b * c + s
-    d : Dict[int, torch.Tensor] = {}
+    d: Dict[int, torch.Tensor] = {}
     for i in range(iters):
         d[i] = k + i
     return d

+
+class SubModule(nn.Module):
+    def __init__(self):
+        super(SubModule, self).__init__()
+        self.a = 11
+        self.b = 2
+
+    def forward(self, x):
+        return self.a + self.b + x
+
+
+class SubModule2(nn.Module):
+    def __init__(self):
+        super(SubModule2, self).__init__()
+        self.a = 12
+        self.b = 2
+
+    def forward(self, x):
+        self.b = 30
+        return self.a + self.b + x
+
+
+class TestModule(nn.Module):
+    def __init__(self):
+        super(TestModule, self).__init__()
+        self.sub1 = SubModule()
+        self.sub2 = SubModule2()
+        self.a = 3
+        self.b = 4
+
+    def forward(self, x):
+        self.b = 20
+        return self.sub1(x) + self.a + self.b + self.sub2(x)
+
+
 class TestStaticModule(TestCase):
     def test_multihead_attention_layer(self):
         HID_DIM = 256
@@ -220,6 +259,46 @@ class TestStaticModule(TestCase):
         o_test = tg_a(s)[0]
         torch.testing.assert_allclose(o_ref, o_test)

+    def test_attr(self):
+        """
+        TorchScript IR of TestModule() after freezing:
+        graph(%self : __torch__.test_static_runtime.___torch_mangle_0.TestModule,
+              %x.1 : Tensor):
+            %18 : int = prim::Constant[value=30]()
+            %30 : int = prim::Constant[value=13]()
+            %3 : int = prim::Constant[value=20]()
+            %2 : int = prim::Constant[value=1]()
+            %self.sub2.a : int = prim::Constant[value=12]()
+            %self.a : int = prim::Constant[value=3]()
+             = prim::SetAttr[name="b"](%self, %3)
+            %17 : Tensor = aten::add(%x.1, %30, %2)
+            %7 : Tensor = aten::add(%17, %self.a, %2)
+            %b.1 : int = prim::GetAttr[name="b"](%self)
+            %9 : Tensor = aten::add(%7, %b.1, %2)
+            %sub2 : __torch__.test_static_runtime.___torch_mangle_2.SubModule2 = prim::GetAttr[name="sub2"](%self)
+             = prim::SetAttr[name="b"](%sub2, %18)
+            %b : int = prim::GetAttr[name="b"](%sub2)
+            %22 : int = aten::add(%self.sub2.a, %b)
+            %23 : Tensor = aten::add(%x.1, %22, %2)
+            %12 : Tensor = aten::add(%9, %23, %2)
+            return (%12)
+        """
+        # test prim::SetAttr and prim::GetAttr impl in Static Runtime
+        m = TestModule()
+        m.eval()
+        input = torch.randn(2, 2)
+        output_s = m.forward(input)
+
+        ms = torch.jit.script(m)
+        sm = StaticModule(ms)
+        output_sm = sm(input)[0]
+        torch.testing.assert_allclose(output_s, output_sm)
+        sm.benchmark([input], {}, 2, 2)
+        sm.benchmark_individual_ops([input], {}, 2, 2)
+        sm.benchmark([], {"x": input}, 2, 2)
+        sm.benchmark_individual_ops([], {"x": input}, 2, 2)
+
     @unittest.skip("Temporarily disabled")
     def test_fusion_trivial_graph(self):
         s = torch.full((2, 2), 2)
@ -281,6 +360,5 @@ class TestStaticModule(TestCase):
torch.testing.assert_allclose(o_ref[i], o_test[i]) torch.testing.assert_allclose(o_ref[i], o_test[i])
if __name__ == "__main__": if __name__ == "__main__":
run_tests() run_tests()

File: torch/csrc/jit/runtime/static/impl.cpp

@@ -45,20 +45,29 @@ void OptimizeGraph(
   ConstantPropagation(graph);
 }

-void CheckGraphEligibility(const std::shared_ptr<torch::jit::Graph>& graph) {
-  for (auto n : graph->nodes()) {
-    if (n->kind() == c10::Symbol::fromQualString("prim::GetAttr")) {
-      throw std::runtime_error("Cannot accelerate unfrozen graphs");
+bool CheckGraphEligibility(const std::shared_ptr<torch::jit::Graph>& graph) {
+  // check for sub-blocks
+  bool can_support = true;
+  for (auto* node : graph->block()->nodes()) {
+    for (Block* sub_block : node->blocks()) {
+      VLOG(1) << "Found nested sub-blocks in graph at node: "
+              << PrintNode(node);
+      can_support = false;
     }
   }
+  return can_support;
 }

 // remove unused input 0 from graph
-void RemoveSelfFromGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
+bool RemoveSelfFromGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
   if (graph->inputs().at(0)->type()->is_module()) {
-    TORCH_CHECK(!graph->inputs().at(0)->hasUses());
+    if (graph->inputs().at(0)->hasUses()) {
+      return false;
+    }
     graph->eraseInput(0);
   }
+  return true;
 }

 // remove "self" from function schema
@@ -443,12 +452,12 @@ GenerateSameStorageValues(
 void PrepareGraphForStaticModule(
     std::shared_ptr<torch::jit::Graph> graph,
     const StaticModuleOptions& opts) {
-  CheckGraphEligibility(graph);
+  // TODO: call CheckGraphEligibility before trying to enable static runtime
+  TORCH_CHECK(CheckGraphEligibility(graph));
   OptimizeGraph(graph, opts);
-  RemoveSelfFromGraphInput(graph);
 }

-std::pair<std::shared_ptr<Graph>, c10::optional<c10::FunctionSchema>>
+std::pair<std::shared_ptr<Graph>, std::shared_ptr<Module>>
 PrepareForStaticModule(
     const torch::jit::Module& m,
     const StaticModuleOptions& opts) {
@@ -461,22 +470,23 @@ PrepareForStaticModule(
   auto module = m.copy();
   module.eval();

-  module = freeze_module(module);
-  Method method = module.get_method("forward");
-  auto graph = module.get_method("forward").graph();
+  auto module_ptr = std::make_shared<Module>(freeze_module(module));
+  Method method = module_ptr->get_method("forward");
+  auto graph = module_ptr->get_method("forward").graph();
+  // graph->dump();

   PrepareGraphForStaticModule(graph, opts);

-  c10::FunctionSchema s = RemoveSelfFromSchema(method.function().getSchema());
-  return std::make_pair(graph, s);
+  return std::make_pair(graph, module_ptr);
 }

-std::pair<std::shared_ptr<Graph>, c10::optional<c10::FunctionSchema>>
+std::pair<std::shared_ptr<Graph>, std::shared_ptr<Module>>
 PrepareForStaticModule(
     std::shared_ptr<torch::jit::Graph> graph,
     const StaticModuleOptions& opts) {
   PrepareGraphForStaticModule(graph, opts);
-  return std::make_pair(graph, c10::nullopt);
+  return std::make_pair(graph, nullptr);
 }

 } // namespace
@@ -492,13 +502,12 @@ StaticModule::StaticModule(
     : StaticModule(PrepareForStaticModule(m, opts), opts) {}

 StaticModule::StaticModule(
-    std::pair<
-        std::shared_ptr<torch::jit::Graph>,
-        c10::optional<c10::FunctionSchema>> graph_and_schema,
+    std::pair<std::shared_ptr<torch::jit::Graph>, std::shared_ptr<Module>>
+        graph_and_module,
     const StaticModuleOptions& opts)
     : opts_(opts),
-      graph_(std::move(graph_and_schema.first)),
-      schema_(std::move(graph_and_schema.second)) {
+      graph_(std::move(graph_and_module.first)),
+      module_(std::move(graph_and_module.second)) {
   // check opt flags
   if (opts.optimize_graph_output_memory) {
     TORCH_CHECK(
@@ -511,6 +520,18 @@ StaticModule::StaticModule(
         "When optimize_memory is true, enable_out_variant must be set to true");
   }

+  // handle schema
+  if (module_) {
+    Method method = module_->get_method("forward");
+    if (RemoveSelfFromGraphInput(graph_)) {
+      schema_ = RemoveSelfFromSchema(method.function().getSchema());
+    } else {
+      first_input_is_self_ = true;
+      schema_ = method.function().getSchema();
+    }
+  }
+
   // map Value* to IValue (from inputs or prim::Constant) or null
   std::unordered_map<Value*, IValue*> value_to_ivalue;
   // map Value* to its SSA definition IR
@@ -620,6 +641,7 @@ StaticRuntime::StaticRuntime(const StaticModule& sm) : static_module_(sm) {
   // NB: create unchanging std::vector<IValue>s we can reference
   inputs_.resize(sm.num_inputs());
   nodes_.resize(sm.nodes().size());
+
   for (const auto idx : c10::irange(sm.nodes().size())) {
     const auto& n_ref = sm.nodes()[idx];
     nodes_[idx] = n_ref; // copy the node
@@ -688,6 +710,43 @@ std::vector<at::Tensor> StaticRuntime::operator()(
   return out;
 }

+void StaticRuntime::set_inputs(
+    const std::vector<c10::IValue>& args,
+    const std::unordered_map<std::string, c10::IValue>& kwargs) {
+  if (!kwargs.empty()) {
+    // This is not ideal
+    TORCH_CHECK(
+        static_module_.schema(),
+        "Schema is not available. Consider creating the Static Runtime "
+        "with StaticModule(const torch::jit::Module& m) instead.");
+    std::vector<c10::IValue> stack;
+    stack.reserve(inputs_.size());
+    if (static_module_.first_input_is_self()) {
+      stack.emplace_back(static_module_.module()._ivalue());
+    }
+    stack.insert(stack.end(), args.begin(), args.end());
+
+    static_module_.schema()->checkAndNormalizeInputs(stack, kwargs);
+    DCHECK_EQ(inputs_.size(), stack.size());
+    for (const auto i : c10::irange(stack.size())) {
+      Input(i) = std::move(stack[i]);
+    }
+  } else {
+    if (static_module_.first_input_is_self()) {
+      Input(0) = static_module_.module()._ivalue();
+      DCHECK_EQ(inputs_.size(), args.size() + 1);
+      for (const auto i : c10::irange(args.size())) {
+        Input(i + 1) = args[i];
+      }
+    } else {
+      DCHECK_EQ(inputs_.size(), args.size());
+      for (const auto i : c10::irange(args.size())) {
+        Input(i) = args[i];
+      }
+    }
+  }
+}
+
 c10::IValue StaticRuntime::operator()(
     const std::vector<c10::IValue>& args,
     const std::unordered_map<std::string, c10::IValue>& kwargs) {
@@ -701,27 +760,13 @@ c10::IValue StaticRuntime::operator()(
     planner_->allocate();
   }

-  if (!kwargs.empty()) {
-    // This is not ideal
-    TORCH_CHECK(
-        static_module_.schema(),
-        "Schema is not available. Consider creating the Static Runtime "
-        "with StaticModule(const torch::jit::Module& m) instead.");
-    std::vector<c10::IValue> s = args;
-    static_module_.schema()->checkAndNormalizeInputs(s, kwargs);
-    for (const auto i : c10::irange(s.size())) {
-      Input(i) = std::move(s[i]);
-    }
-  } else {
-    for (const auto i : c10::irange(args.size())) {
-      Input(i) = args[i];
-    }
-  }
+  set_inputs(args, kwargs);

   // NB: before optimizing the order of execution, ensure that the
   // memory optimization pass (LivenessMap) is
   // aware of the new order!
   for (auto& n : nodes_) {
+    // LOG(INFO) << "Running node: " << PrintNode(n.node());
     n.run();
   }
@@ -739,9 +784,7 @@ c10::IValue StaticRuntime::operator()(
     }
     planner_->deallocate();
     // clean up owning refs of input tensors
-    for (IValue& ival : inputs_) {
-      ival = IValue();
-    }
+    clean_up_input_ivalues();
   }

   // no need to keep references of outputs in static runtime anymore
@@ -829,6 +872,10 @@ void StaticRuntime::benchmark(
             << "%)" << std::endl;
   }
   check_for_memory_leak();
+
+#ifndef NDEBUG
+  display_nodes(args, kwargs);
+#endif
 }

 float StaticRuntime::benchmark_model(
@@ -906,16 +953,36 @@ void display_pnode_info(const ProcessedNode& pnode) {
   }
 }

-void StaticRuntime::display_nodes(const std::vector<c10::IValue>& args) {
+void StaticRuntime::display_nodes(
+    const std::vector<c10::IValue>& args,
+    const std::unordered_map<std::string, c10::IValue>& kwargs) {
   c10::InferenceMode mode;
-  std::vector<IValue> stack(args);
-  for (size_t i = 0; i < stack.size(); i++) {
-    Input(i) = stack[i];
+  if (planner_) {
+    planner_->allocate();
   }
+  set_inputs(args, kwargs);
+
   for (auto& node : nodes_) {
     node.run();
     display_pnode_info(node);
   }
+
+  if (static_module_.opts().cleanup_activations) {
+    // MemoryPlanner is created after the first invocation of `run()`. This is
+    // done intentionally because MemoryPlanner uses `Tensor` sizes of the
+    // previous `run()` for memory planning of subsequent runs
+    if (!planner_) {
+      planner_ = std::make_unique<MemoryPlanner>(
+          this,
+          static_module_.values_share_same_storage(),
+          static_module_.external_values(),
+          static_module_.opts().enable_out_variant,
+          static_module_.opts().optimize_graph_output_memory);
+    }
+    planner_->deallocate();
+    // clean up owning refs of input tensors
+    clean_up_input_ivalues();
+  }
 }

 StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
@@ -934,18 +1001,9 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
   // setup time
   caffe2::Timer timer;
-  std::vector<IValue> stack(args);
-  if (!kwargs.empty()) {
-    // This is not ideal
-    TORCH_CHECK(
-        static_module_.schema(),
-        "Schema is not available. Consider creating the Static Runtime "
-        "with StaticModule(const torch::jit::Module& m) instead.");
-    static_module_.schema()->checkAndNormalizeInputs(stack, kwargs);
-  }
-  for (const auto i : c10::irange(stack.size())) {
-    Input(i) = stack[i];
-  }
+
+  set_inputs(args, kwargs);
+
   results.setup_time = timer.MilliSeconds();

   // warmup runs
@@ -957,9 +1015,9 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
   // main runs
   for (const auto k : c10::irange(main_runs)) {
     (void)k; // Suppress unused variable warning
-    for (const auto i : c10::irange(stack.size())) {
-      Input(i) = stack[i];
-    }
+
+    set_inputs(args, kwargs);
+
     timer.Start();
     if (planner_) {
       planner_->allocate();
@@ -985,9 +1043,7 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
     }
     planner_->deallocate();
     // clean up owning refs of input tensors
-    for (IValue& ival : inputs_) {
-      ival = IValue();
-    }
+    clean_up_input_ivalues();
   }
   millis = timer.MilliSeconds();
   results.memory_dealloc_time += millis;
@@ -1283,16 +1339,11 @@ ProcessedNode::ProcessedNode(
       VLOG(1) << "Switch to out variant for node: " << PrintNode(node);
       return;
     }
-    if (!fn_ && mayRunNatively(node)) {
-      native_fn_ = getNativeOperation(node);
-      if (native_fn_) {
-        VLOG(1) << "Switch to native impl for node: " << PrintNode(node);
-        return;
-      }
+    if (!fn_ && (native_fn_ = getNativeOperation(node))) {
+      VLOG(1) << "Switch to native impl for node: " << PrintNode(node);
+      return;
     }
-    if (node->kind() != prim::ListConstruct &&
-        node->kind() != prim::TupleConstruct &&
-        node->kind() != prim::DictConstruct && node->kind() != prim::ListUnpack) {
+    {
       const Operator& op = node->getOperator();
       TORCH_CHECK(op.hasOperation());
       op_ = op.getOperation(node);

File: torch/csrc/jit/runtime/static/impl.h

@@ -92,9 +92,8 @@ class TORCH_API StaticModule {
  private:
   explicit StaticModule(
-      std::pair<
-          std::shared_ptr<torch::jit::Graph>,
-          c10::optional<c10::FunctionSchema>> graph_and_schema,
+      std::pair<std::shared_ptr<torch::jit::Graph>, std::shared_ptr<Module>>
+          graph_and_module,
       const StaticModuleOptions& opts);

   // for <kind, idx>
@@ -116,6 +115,10 @@ class TORCH_API StaticModule {
     return *graph_;
   }

+  const Module& module() const {
+    return *module_;
+  }
+
   const StaticModuleOptions& opts() const;
   size_t num_inputs() const;
   size_t num_outputs() const;
@@ -149,11 +152,17 @@ class TORCH_API StaticModule {
     return external_values_;
   }

+  bool first_input_is_self() const {
+    return first_input_is_self_;
+  }
+
   StaticRuntime& runtime();

  private:
   StaticModuleOptions opts_;
+  bool first_input_is_self_{false};
   std::shared_ptr<torch::jit::Graph> graph_;
+  std::shared_ptr<torch::jit::Module> module_;
   c10::optional<c10::FunctionSchema> schema_;
   std::unique_ptr<StaticRuntime> cached_runtime_;
@@ -188,7 +197,9 @@ class TORCH_API StaticRuntime {
       const std::vector<c10::IValue>& args,
       const std::unordered_map<std::string, c10::IValue>& kwargs);

-  void display_nodes(const std::vector<c10::IValue>& args);
+  void display_nodes(
+      const std::vector<c10::IValue>& args,
+      const std::unordered_map<std::string, c10::IValue>& kwargs);

   void benchmark(
       const std::vector<c10::IValue>& args,
@@ -254,6 +265,18 @@ class TORCH_API StaticRuntime {
   void check_for_memory_leak(bool output_returned = true);

  private:
+  // helper method for copying input args/kwargs into inputs_
+  void set_inputs(
+      const std::vector<c10::IValue>& args,
+      const std::unordered_map<std::string, c10::IValue>& kwargs);
+
+  // clean up owning refs of input IValues
+  void clean_up_input_ivalues() {
+    for (IValue& ival : inputs_) {
+      ival = IValue();
+    }
+  }
+
   // Memory planning is only enabled if sm->opts().cleanup_activations is true.
   // Otherwise, the memory used by activations is cached inside the static
   // runtime.

File: torch/csrc/jit/runtime/static/ops.cpp

@@ -189,28 +189,6 @@ std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n) {
   return nullptr;
 }

-// TODO: expand to include all view producing ops, mostly in
-// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp
-bool mayRunNatively(Node* n) {
-  // In alphabetical order
-  const static std::unordered_set<std::string> native_nodes{
-      "aten::flatten",
-      "aten::reshape",
-      "aten::slice",
-      "aten::transpose",
-      "aten::to",
-      "prim::ListConstruct",
-      "prim::ListUnpack",
-      "prim::TupleConstruct",
-      "prim::DictConstruct",
-      "aten::__getitem__"};
-  auto str = std::string(n->kind().toQualString());
-  if (!native_nodes.count(str)) {
-    return false;
-  }
-  return true;
-}
-
 // Expensive check, use sparingly.
 // This is needed to make sure that we only switch to out variants for the
 // supported overloads, which is checked in the `Generate` step in
@@ -1302,6 +1280,24 @@ std::function<void(ProcessedNode*)> getNativeOperation(Node* n) {
         p_node->Output(0) = in0_t.clone();
       }
     };
+  } else if (n->kind() == prim::GetAttr) {
+    return [](ProcessedNode* p_node) {
+      auto module = p_node->Input(0).toObject();
+      Node* node = p_node->node();
+      const auto type = node->input()->type()->expect<ClassType>();
+      const auto& field = node->s(attr::name);
+      const auto slot = type->getAttributeSlot(field);
+      p_node->Output(0) = module->getSlot(slot);
+    };
+  } else if (n->kind() == prim::SetAttr) {
+    return [](ProcessedNode* p_node) {
+      auto module = p_node->Input(0).toObject();
+      Node* node = p_node->node();
+      const auto type = node->inputs()[0]->type()->expect<ClassType>();
+      const auto& field = node->s(attr::name);
+      const auto slot = type->getAttributeSlot(field);
+      module->setSlot(slot, p_node->Input(1));
+    };
   }
   return nullptr;
 }

File: torch/csrc/jit/runtime/static/ops.h

@@ -124,7 +124,6 @@ bool isOptimizableContainerType(Node* n);

 std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n);

-bool mayRunNatively(Node* n);
 std::function<void(ProcessedNode*)> getNativeOperation(Node* n);

 inline std::string PrintNode(const Node* node) {

File: torch/csrc/jit/runtime/static/passes.cpp

@@ -366,9 +366,6 @@ void FuseInferenceOpsForSparseNN(std::shared_ptr<torch::jit::Graph>& graph) {
 }

 TORCH_LIBRARY_FRAGMENT(static_runtime, m) {
-  m.def("static_runtime::pure_inputs() -> Tensor", []() -> at::Tensor {
-    return at::randn({1});
-  });
   m.def("static_runtime::permute_copy(Tensor self, int[] dims) -> Tensor");
   m.def(
       "static_runtime::reshape_copy(Tensor(a) self, int[] shape) -> Tensor(a)");
@@ -386,24 +383,10 @@ bool HasInplaceOp(std::shared_ptr<Graph>& graph, const AliasDb& alias_db) {
   return HasInplaceOp(graph->block(), alias_db);
 }

-void ReplaceWithCopy(std::shared_ptr<torch::jit::Graph>& graph) {
-  auto* fake_input =
-      graph->insert(Symbol::fromQualString("static_runtime::pure_inputs"), {});
-  fake_input->node()->moveBefore(*graph->nodes().begin());
-  std::vector<std::pair<Value*, Use>> old_inputs;
-  for (auto* input : graph->inputs()) {
-    for (const auto& use : input->uses()) {
-      old_inputs.emplace_back(std::make_pair(input, use));
-    }
-    input->replaceAllUsesWith(fake_input);
-  }
+void ReplaceWithCopy(
+    std::shared_ptr<torch::jit::Graph>& graph,
+    bool outputs_are_immutable) {
   AliasDb db(graph);
-  for (const auto& p : old_inputs) {
-    p.second.user->replaceInput(p.second.offset, p.first);
-  }
-  fake_input->node()->destroy();

   const std::map<c10::Symbol, c10::Symbol> supported = {
 #ifdef FBCODE_CAFFE2
@@ -474,7 +457,7 @@ void ReplaceWithCopy(std::shared_ptr<torch::jit::Graph>& graph) {
     }

     auto* out = n->output();
-    if (db.mayContainAlias({out}, graph->outputs())) {
+    if (!outputs_are_immutable && db.mayContainAlias({out}, graph->outputs())) {
       continue;
     }
     auto* new_node = graph->create(new_symbol, n->outputs().size());

File: torch/csrc/jit/runtime/static/passes.h

@@ -7,7 +7,11 @@ TORCH_API void FuseInferenceOpsForSparseNN(
     std::shared_ptr<torch::jit::Graph>& graph);

 TORCH_API void FuseListUnpack(std::shared_ptr<torch::jit::Graph>& graph);

-TORCH_API void ReplaceWithCopy(std::shared_ptr<torch::jit::Graph>& graph);
+// If outputs_are_immutable is set to false, don't replace the view ops that
+// produce aliases of graph outputs with the copy version.
+TORCH_API void ReplaceWithCopy(
+    std::shared_ptr<torch::jit::Graph>& graph,
+    bool outputs_are_immutable = true);

 TORCH_API bool HasInplaceOp(
     std::shared_ptr<Graph>& graph,