Test reland "AOTAutograd: gate view-replay behind config, not the def… (#124948)
A parallel attempt at landing https://github.com/pytorch/pytorch/pull/124945, but attempting to land through fbcode first.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124948
Approved by: https://github.com/albanD
This commit is contained in:
parent fc13c1c850
commit fc2aa23c1e
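For reference, a minimal sketch (not part of this commit) of how the flag introduced here can be inspected and overridden from user code; plain attribute assignment on torch._functorch.config is assumed as the override mechanism:

# Minimal sketch (not from this commit): inspecting and overriding the new flag.
import torch._functorch.config as functorch_config

# Default after this change: enabled in OSS builds, disabled in fbcode builds.
print(functorch_config.view_replay_for_aliased_outputs)

# Opt in explicitly; the tests below patch the same attribute via
# unittest.mock.patch("functorch.compile.config.view_replay_for_aliased_outputs", True).
functorch_config.view_replay_for_aliased_outputs = True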
@@ -3261,6 +3261,7 @@ def forward(self, tangents_1):
         return lambda f: aot_function(f, fw_compiler=lambda g, _: partial(wrapper, g))
 
+    @patch("functorch.compile.config.view_replay_for_aliased_outputs", True)
     def test_output_aliases_input_view_meta_replay(self):
         @self._compile_and_erase_bases(0)
         def f(a):
@@ -3274,6 +3275,7 @@ def forward(self, tangents_1):
             str(out.grad_fn.__class__), """<class 'ViewBackward0'>"""
         )
 
+    @patch("functorch.compile.config.view_replay_for_aliased_outputs", True)
     def test_output_aliases_intermediate_view_meta_replay(self):
         @self._compile_and_erase_bases(0, 1)
         def f(a):
@@ -3293,6 +3295,7 @@ def forward(self, tangents_1):
             str(out2.grad_fn.__class__), """<class 'ViewBackward0'>"""
         )
 
+    @patch("functorch.compile.config.view_replay_for_aliased_outputs", True)
     def test_output_aliases_output_view_meta_replay(self):
         @self._compile_and_erase_bases(1)
         def f(a):
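The three tests above patch the flag on and assert that the regenerated aliased output carries a ViewBackward0 node. A hedged standalone sketch of the same check, using functorch.compile.aot_function with the nop compiler instead of the _compile_and_erase_bases helper (so the exact grad_fn can differ depending on which regeneration path AOTAutograd takes):

# Standalone sketch of what the patched tests assert (simplified; not the
# _compile_and_erase_bases helper used in the diff above).
from unittest.mock import patch

import torch
from functorch.compile import aot_function, nop

def f(a):
    return a.view(-1)  # the output aliases the input

with patch("functorch.compile.config.view_replay_for_aliased_outputs", True):
    compiled_f = aot_function(f, fw_compiler=nop)
    out = compiled_f(torch.ones(2, 2, requires_grad=True))
    # With view replay enabled (and applicable), the alias is rebuilt from
    # recorded view ops, so a view backward node is expected here rather than
    # AsStridedBackward0.
    print(out.grad_fn.__class__)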
@@ -257,6 +257,19 @@ intentionally_not_handled = {
     "resize_": {b8, f16, f32, f64, i32, i64},
     "resize_as_": {b8, f16, f32, f64, i32, i64},
 }
+# This is only fixed when this config is set
+# We should eventually always turn it on
+import torch._functorch.config as functorch_config
+
+if not functorch_config.view_replay_for_aliased_outputs:
+    intentionally_not_handled['("as_strided", "partial_views")'] = {
+        b8,
+        f16,
+        f32,
+        f64,
+        i32,
+        i64,
+    }
 
 inductor_expected_failures_single_sample["cuda"].update(intentionally_not_handled)
 
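A hedged sketch of the same gating idea outside the opinfo harness, using the standard-library unittest.skipUnless; the class and test names here are hypothetical:

# Hedged sketch of the same config gating as a plain unittest skip.
# PartialViewTests and test_partial_view_reconstruction are hypothetical names.
import unittest

import torch._functorch.config as functorch_config

class PartialViewTests(unittest.TestCase):
    @unittest.skipUnless(
        functorch_config.view_replay_for_aliased_outputs,
        "as_strided partial-view cases are only expected to pass with view replay on",
    )
    def test_partial_view_reconstruction(self):
        self.assertTrue(True)  # placeholder body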
@@ -18,6 +18,7 @@ from torch.utils._python_dispatch import (
     is_traceable_wrapper_subclass,
     transform_subclass,
 )
+from .. import config
 
 aot_joint_log = getArtifactLogger(__name__, "aot_joint_graph")
 
@@ -219,7 +220,7 @@ def gen_alias_from_base(
     # In summary, we use the fact that FunctionalTensorWrapper saves the view
     # functions applied to itself (collected during functionalization) so as
     # to replay them (view functions) on the aliased_base_tensor.
-    if target_functional_tensor is not None:
+    if config.view_replay_for_aliased_outputs and target_functional_tensor is not None:
         from .schemas import FunctionalTensorMetadataEq
 
         assert isinstance(target_functional_tensor, FunctionalTensorMetadataEq)
@@ -237,11 +238,10 @@ def gen_alias_from_base(
             #
             # In order for this to work, we should have a way to replace those
             # symbolic shapes with concrete numbers.
-            aot_joint_log.warning(
+            aot_joint_log.info(
                 "could not reconstruct view by re-applying a ViewMeta sequence. "
-                "This error is possibly caused by dynamic shapes. "
                 "Fallbacking to reconstruction using as_strided. "
-                "Error message: %s",
+                "Reason: %s",
                 str(e),
             )
     else:
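The comments in gen_alias_from_base describe two ways of rebuilding an aliased output from its base: replaying the recorded view functions, or falling back to a single as_strided call. A conceptual sketch of the difference (not the AOTAutograd implementation):

# Conceptual sketch only: rebuild an alias of `base` either by replaying the
# view ops that produced it or with a single as_strided call using the
# target's metadata.
import torch

base = torch.randn(4, 4)
target = base.t()[1:]  # alias produced by a chain of two view ops

# (a) view replay: re-apply the recorded view functions to the base
replayed = base.t()[1:]

# (b) as_strided fallback: one call matching the target's size/stride/offset
strided = base.as_strided(target.size(), target.stride(), target.storage_offset())

assert torch.equal(replayed, target) and torch.equal(strided, target)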
@@ -41,6 +41,26 @@ static_weight_shapes = True
 # Applies CSE to the graph before partitioning
 cse = True
 
+# When AOTAutograd regenerates aliased graph outputs,
+# attempt to use functionalization's view-replay logic
+# before falling back to the autograd engine's view replay or as_strided.
+# This can have some perf implications
+# (although for many models this will not matter).
+# (1) If you have many view ops chained together, replaying all of them
+#     at runtime can have more overhead compared to a single as_strided call
+# (2) If you are doing training, AsStridedBackward is quite slow,
+#     and the individual view op backward formulas will likely be faster.
+# (3) Some backends like XLA do not support as_strided
+
+# Temporary hack: disable this flag for internal
+# (needed to fix an internal issue while avoiding bumping the XLA pin).
+# Eventually: either default this config to false completely
+# once the XLA pin update works,
+# or default the config to true and fix the relevant bugs.
+from torch._inductor.config import is_fbcode
+
+view_replay_for_aliased_outputs = not is_fbcode()
+
 # Restricts the amount of computation AOTAutograd can do.
 # NB: We have essentially disabled this heuristic now. However, this is kept
 # here for now in case it's useful. Setting it low can artificially reduce the
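Point (2) in the new config comment can be illustrated directly in eager mode; this is a hedged illustration independent of the diff:

# Illustration of point (2): the same alias built as a view vs. via as_strided
# gets different backward nodes; AsStridedBackward0 is the slower one.
import torch

x = torch.randn(8, requires_grad=True)

view_out = x.view(2, 4)
strided_out = x.as_strided((2, 4), (4, 1))

print(type(view_out.grad_fn).__name__)     # ViewBackward0
print(type(strided_out.grad_fn).__name__)  # AsStridedBackward0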