Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
[JIT] Add optimize_for_inference API (#58193)
Summary: Freezing exists as a pass that partially evaluates your model and applies generic optimizations that should speed it up. Optimize for inference is a counterpart to these optimizations that additionally runs build- and server-specific optimizations.

The interaction with the existing `optimize_frozen_module` is not great; we could probably deprecate that API entirely, since it was never officially released and only existed to document the `optimize_numerics` keyword. Eventually I would like to add a way of providing example inputs, but I didn't add that here because they are not being used at all yet. I also have not yet included a way to blacklist individual optimizations, and would like to wait until we move this to Beta and have a little more clarity on how everything will fit together. I also think blacklisting will be an uncommon use case for the current optimizations.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/58193
Reviewed By: bertmaher, navahgar
Differential Revision: D28443714
Pulled By: eellison
fbshipit-source-id: b032355bb2585720a6d2f00c89d0d9a7ef60e649
This commit is contained in: parent fad2ce439e, commit 211bac53ef
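As a rough usage sketch of the API this commit adds (mirroring the Python test added below; the Linear shape comes from that test, while the input tensor and the allclose comparison are illustrative additions, not part of the diff):

import torch
import torch.nn as nn

# optimize_for_inference expects a ScriptModule, so script an eval-mode module first.
mod = nn.Linear(20, 30).eval()
scripted = torch.jit.script(mod)

# freeze() applies the generic, machine-independent optimizations.
frozen = torch.jit.freeze(scripted)

# optimize_for_inference() additionally bakes in build/server-specific choices
# (e.g. MKLDNN or CUDNN rewrites) and freezes the module itself if needed.
optimized = torch.jit.optimize_for_inference(scripted)

# Both variants should still agree numerically on ordinary dense inputs.
x = torch.rand(2, 20)
print(torch.allclose(frozen(x), optimized(x), atol=1e-4))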
@@ -365,7 +365,10 @@ TEST(ModuleAPITest, Freezing) {
   auto frozen_mod = torch::jit::freeze(m);
   auto forward_g = frozen_mod.get_method("forward").graph();
   testing::FileCheck().check_not("GetAttr")->run(*forward_g);
+  auto frozen_mod2 = torch::jit::optimize_for_inference(m);
+  forward_g = frozen_mod.get_method("forward").graph();
+  testing::FileCheck().check_not("GetAttr")->run(*forward_g);
 }
 
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
@@ -1590,14 +1590,14 @@ class TestFrozenOptimizations(JitTestCase):
         conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=True)
         bn = torch.nn.BatchNorm2d(out_channels, eps=.001)
         mod = torch.nn.Sequential(conv, bn)
-        # set optimize to False here, by default freezing runs optimize_frozen_module
+        # set optimize to False here, by default freezing runs run_frozen_optimizations
         frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize_numerics=False)
         # inspect frozen mod
         FileCheck().check("batch_norm").run(frozen_mod.graph)
-        torch.jit.optimize_frozen_module(frozen_mod)
+        torch.jit.run_frozen_optimizations(frozen_mod)
         FileCheck().check_not("batch_norm").run(frozen_mod.graph)
 
-        # optimize_frozen_module should be run
+        # run_frozen_optimizations should be run
         frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()))
         FileCheck().check_not("batch_norm").run(frozen_mod.graph)
@@ -1849,7 +1849,7 @@ class TestFrozenOptimizations(JitTestCase):
         else:
             scripted_mod = torch.jit.script(mod_eager)
 
-        frozen_mod = torch.jit.freeze(scripted_mod)
+        frozen_mod = torch.jit.optimize_for_inference(scripted_mod)
         if add_z:
             FileCheck().check("aten::cudnn_convolution_add_relu").run(frozen_mod.graph)
         else:
@@ -1993,6 +1993,19 @@ class TestFrozenOptimizations(JitTestCase):
             # and we aren't testing aten impls anyways
             self.assertTrue(torch.allclose(aten_op(x, inplace=False), m(x).to_dense()))
 
+    @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
+    def test_optimize_for_inference(self):
+        with set_default_dtype(torch.float):
+            mod = nn.Linear(20, 30).eval()
+            scripted_mod = torch.jit.script(mod)
+
+            optimized = torch.jit.optimize_for_inference(scripted_mod)
+            FileCheck().check("to_mkldnn").run(optimized.graph)
+
+            frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()))
+            optimized = torch.jit.optimize_for_inference(scripted_mod)
+            FileCheck().check("to_mkldnn").run(optimized.graph)
+
 @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
 class TestMKLDNNReinplacing(JitTestCase):
     def setUp(self):
@@ -182,6 +182,7 @@ def _jit_pass_fold_frozen_conv_bn(graph: Graph): ...
 def _jit_pass_fold_frozen_conv_add_or_sub(graph: Graph): ...
 def _jit_pass_fold_frozen_conv_mul_or_div(graph: Graph): ...
 def _jit_pass_fuse_frozen_conv_add_relu(graph: Graph): ...
+def _jit_pass_convert_frozen_ops_to_mkldnn(graph: Graph): ...
 def _jit_pass_remove_dropout(module: 'torch.jit.ScriptModule'): ...
 
 def _is_tracing() -> _bool: ...
@@ -10,7 +10,9 @@
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/freeze_module.h>
+#include <torch/csrc/jit/passes/frozen_conv_add_relu_fusion.h>
 #include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
+#include <torch/csrc/jit/passes/frozen_ops_to_mkldnn.h>
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/runtime/operator.h>
@@ -484,6 +486,18 @@ Module freeze(
   return out_mod;
 }
 
+Module optimize_for_inference(Module& module) {
+  // not frozen yet
+  if (module._ivalue()->type()->hasAttribute("training")) {
+    auto mod = freeze(module, {}, true);
+  }
+
+  auto graph = module.get_method("forward").graph();
+  FuseFrozenConvAddRelu(graph);
+  ConvertFrozenOpsToMKLDNN(graph);
+  return module;
+}
+
 buffer_list Module::buffers(bool recurse) const {
   return buffer_list(*this, recurse, /*return_module=*/false);
 }
@@ -295,6 +295,10 @@ TORCH_API Module freeze(
     c10::optional<std::vector<std::string>> preserved_attrs = c10::nullopt,
     bool optimize_numerics = true);
 
+// C++ equivalent api of `torch.jit.optimize_for_inference`. See documentation
+// there for details.
+TORCH_API Module optimize_for_inference(Module& module);
+
 namespace detail {
 
 struct TORCH_API SlotCursor {
@@ -1,6 +1,5 @@
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/ir_views.h>
-#include <torch/csrc/jit/passes/frozen_conv_add_relu_fusion.h>
 #include <torch/csrc/jit/passes/frozen_conv_folding.h>
 #include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
 #include <torch/csrc/jit/passes/remove_dropout.h>
@@ -22,7 +21,6 @@ void OptimizeFrozenGraph(
       FoldFrozenConvMulOrDiv(graph);
     }
   }
-  FuseFrozenConvAddRelu(graph);
 }
 
 } // namespace jit
@@ -49,7 +49,7 @@ from torch.jit._async import fork, wait
 from torch.jit._serialization import save, load
 from torch.jit._fuser import optimized_execution, fuser, last_executed_optimized_graph
 
-from torch.jit._freeze import freeze, optimize_frozen_module
+from torch.jit._freeze import freeze, optimize_for_inference, run_frozen_optimizations
 
 # For backwards compatibility
 _fork = fork
@@ -20,6 +20,10 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics:
 
     Freezing currently only accepts ScriptModules that are in eval mode.
 
+    Freezing applies generic optimization that will speed up your model regardless of machine.
+    To further optimize using server-specific settings, run `optimize_for_inference` after
+    freezing.
+
     Args:
         mod (:class:`ScriptModule`): a module to be frozen
@@ -27,7 +31,7 @@
             Attributes modified in preserved methods will also be preserved.
 
         optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly
-            preserve numerics. Full details of optimization can be found at `torch.jit.optimize_frozen_module`.
+            preserve numerics. Full details of optimization can be found at `torch.jit.run_frozen_optimizations`.
 
     Returns:
         Frozen :class:`ScriptModule`.
@@ -83,6 +87,12 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics:
         If you're not sure why an attribute is not being inlined as a constant, you can run
         `dump_alias_db` on frozen_module.forward.graph to see if freezing has detected the
         attribute is being modified.
 
+    Note:
+        Because freezing makes weights constants and removes module hierarchy, `to` and other
+        nn.Module methods to manipulate device or dtype no longer work. As a workaround,
+        You can remap devices by specifying `map_location` in `torch.jit.load`, however
+        device-specific logic may have been baked into the model.
+
     """
     if not isinstance(mod, ScriptModule):
         raise RuntimeError(
@@ -100,12 +110,11 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics:
 
     out = RecursiveScriptModule(torch._C._freeze_module(mod._c, preserved_attrs))
     RecursiveScriptModule._finalize_scriptmodule(out)
-    optimize_frozen_module(out, optimize_numerics)
+    run_frozen_optimizations(out, optimize_numerics)
 
     return out
 
-def optimize_frozen_module(mod, optimize_numerics: bool = True):
+def run_frozen_optimizations(mod, optimize_numerics: bool = True):
     r"""
     Runs a series of optimizations looking for patterns that occur in frozen graphs.
     The current set of optimizations is:
@@ -136,11 +145,11 @@ def optimize_frozen_module(mod, optimize_numerics: bool = True):
         conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=True)
         bn = torch.nn.BatchNorm2d(out_channels, eps=.001)
         mod = torch.nn.Sequential(conv, bn)
-        # set optimize to False here, by default freezing runs optimize_frozen_module
+        # set optimize to False here, by default freezing runs run_frozen_optimizations
        frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize=False)
         # inspect frozen mod
         assert "batch_norm" in str(frozen_mod.graph)
-        torch.jit.optimize_frozen_module(frozen_mod)
+        torch.jit.run_frozen_optimizations(frozen_mod)
         assert "batch_norm" not in str(frozen_mod.graph)
 
     """
@@ -153,4 +162,32 @@ def optimize_frozen_module(mod, optimize_numerics: bool = True):
         torch._C._jit_pass_fold_frozen_conv_bn(mod.graph)
         torch._C._jit_pass_fold_frozen_conv_add_or_sub(mod.graph)
         torch._C._jit_pass_fold_frozen_conv_mul_or_div(mod.graph)
+
+def optimize_for_inference(mod: ScriptModule) -> ScriptModule:
+    """
+    Performs a set of optimization passes to optimize a model for the
+    purposes of inference. If the model is not already frozen, optimize_for_inference
+    will invoke `torch.jit.freeze` automatically.
+
+    In addition to generic optimizations that should speed up your model regardless
+    of environment, prepare for inference will also bake in build specific settings
+    such as the presence of CUDNN or MKLDNN, and may in the future make transformations
+    which speed things up on one machine but slow things down on another. Accordingly,
+    serialization is not implemented following invoking `optimize_for_inference` and
+    is not guaranteed.
+
+    This is still in prototype, and may have the potential to slow down your model.
+    Primary use cases that have been targeted so far have been vision models on cpu
+    and gpu to a lesser extent.
+    """
+    if not isinstance(mod, ScriptModule):
+        raise RuntimeError(
+            "optimize_for_inference expects a ScriptModule as input. "
+            "Please use torch.jit.script or torch.jit.trace to script your 'nn.Module'.")
+
+    if hasattr(mod, "training"):
+        mod = freeze(mod.eval())
+
+    torch._C._jit_pass_convert_frozen_ops_to_mkldnn(mod.graph)
     torch._C._jit_pass_fuse_frozen_conv_add_relu(mod.graph)
+    return mod