Revert "[inductor] add a threshold for membw saving during fusion (#136782)"

This reverts commit 6647320de2.

Reverted https://github.com/pytorch/pytorch/pull/136782 on behalf of https://github.com/huydhn due to Sorry for reverting your change but test_memory starts to fail after this lands in trunk ([comment](https://github.com/pytorch/pytorch/pull/136782#issuecomment-2423549196))
PyTorch MergeBot 2024-10-19 03:43:42 +00:00
parent fecd370ea1
commit ac7f52b301
4 changed files with 2 additions and 61 deletions


@@ -358,7 +358,6 @@ def make_test(
     device="cuda",
     **kwargs,
 ):
-    @config.patch("score_fusion_memory_threshold", 1)
     def test_fn(self):
         stack = ExitStack()
         try:

@@ -443,7 +442,6 @@ def make_test(
 def make_recompile_test(optim_cls, closure=None, kernel_count=2, **kwargs):
-    @config.patch("score_fusion_memory_threshold", 1)
     @requires_gpu
     def test_fn(self):
         torch._dynamo.reset()
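
The two hunks above drop @config.patch("score_fusion_memory_threshold", 1) from the compiled-optimizer test helpers, since the key it patches is deleted by this revert. For readers unfamiliar with the pattern, torch._inductor.config.patch scopes a config override to a single test when used as a decorator. A minimal sketch of that pattern, using loop_ordering_after_fusion only because that key still exists (it appears later in this diff); the test class and function are illustrative:

from unittest import TestCase

import torch
from torch._inductor import config


class FusionSmokeTest(TestCase):
    # Scope an inductor config override to this one test, the same pattern
    # the removed decorators used. "loop_ordering_after_fusion" stands in for
    # "score_fusion_memory_threshold", which no longer exists after this revert.
    @config.patch("loop_ordering_after_fusion", True)
    def test_pointwise_chain(self):
        def f(x):
            return (x + 1).relu() * 2

        x = torch.randn(64, 64)
        self.assertTrue(torch.allclose(torch.compile(f)(x), f(x)))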


@@ -412,46 +412,6 @@ class LoopOrderingTest(TestCase):
         self.do_acc_test(f, x, scale)
         self.assertEqual(1, metrics.generated_kernel_count)

-    @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "FP8 requires H100+ and MI300+")
-    def test_fp8_pattern_2(self):
-        """
-        This test repros the fp8 fusion relation issue here:
-        https://github.com/pytorch/pytorch/issues/133242
-        """
-        ref_dtype = torch.bfloat16
-        M, K = 4096, 4096
-        input_tensor = torch.randn(
-            M, K, device="cuda", dtype=ref_dtype, requires_grad=False
-        )
-        scale = torch.Tensor([10.0]).to("cuda")
-
-        E4M3_MAX_POS = torch.finfo(torch.float8_e4m3fn).max
-        E5M2_MAX_POS = torch.finfo(torch.float8_e5m2).max
-
-        def test_pattern2(tensor_x_inp, scale_x):
-            tensor_x = tensor_x_inp * scale_x
-            tensor_x = tensor_x.clamp(min=-1 * E4M3_MAX_POS, max=E4M3_MAX_POS)
-            tensor_fp8 = tensor_x.to(torch.float8_e4m3fn)
-
-            tensor_x_t = (tensor_x_inp * scale_x).t()
-            tensor_x_t = tensor_x_t.clamp(min=-1 * E4M3_MAX_POS, max=E4M3_MAX_POS)
-            tensor_fp8_t = tensor_x_t.to(torch.float8_e4m3fn)
-
-            tensor_fp8_t = tensor_fp8_t.contiguous().t()
-            return (tensor_fp8, tensor_fp8_t)
-
-        test_pattern = torch.compile(test_pattern2)
-        tensor_fp8, tensor_fp8_t = test_pattern(input_tensor, scale)
-
-        self.assertEqual(1, metrics.generated_kernel_count)
-        expected_numbytes = scale.nbytes  # scalar
-        expected_numbytes += input_tensor.nbytes  # input
-        expected_numbytes += tensor_fp8.nbytes + tensor_fp8_t.nbytes  # output
-        self.assertEqual(expected_numbytes, metrics.num_bytes_accessed)

 if __name__ == "__main__":
     if HAS_GPU:
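
For reference, the byte accounting behind the deleted expected_numbytes assertions works out as follows, assuming everything fuses into a single kernel that touches each tensor exactly once (shapes and dtypes taken from the removed test; the scale tensor defaults to float32):

# Byte math behind the removed num_bytes_accessed check: a 4096x4096 bf16
# input, a 1-element fp32 scale, and two 4096x4096 float8_e4m3fn outputs.
M = K = 4096
scale_bytes = 1 * 4            # torch.Tensor([10.0]) is float32
input_bytes = M * K * 2        # bfloat16: 2 bytes per element
output_bytes = 2 * M * K * 1   # two fp8 tensors: 1 byte per element
print(scale_bytes + input_bytes + output_bytes)  # 67108868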


@@ -438,17 +438,6 @@ loop_ordering_after_fusion = (
     os.environ.get("TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION", "0") == "1"
 )

-# If fusing two nodes saves less than score_fusion_memory_threshold memory,
-# we should not bother fusing them.
-#
-# This is especially helpful to resolve https://github.com/pytorch/pytorch/issues/133242
-# Previously we fused two nodes because of a common read of a scalar tensor.
-# If we skip that fusion, the loop-ordering-after-fusion mechanism kicks in
-# and can bring more savings.
-#
-# For the cases where loop ordering after fusion does not help, we don't lose much.
-score_fusion_memory_threshold = 10

 # For Triton Templates, select fastest of best template + epilogue vs best template + separate epilogue kernel
 benchmark_epilogue_fusion = (
     os.environ.get("TORCHINDUCTOR_BENCHMARK_EPILOGUE_FUSION", "1") == "1"

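The scheduler hunks below restore the pre-#136782 checks: a pair of nodes counts as fusable again whenever it saves any nonzero number of bytes, and the loop-reordering fallback is consulted only when the pair shares no data at all under the current loop order. A hedged sketch of that restored control flow, with stand-in callables in place of the real Scheduler methods:

from typing import Callable


# Only the control flow mirrors the post-revert code below; the two callables
# stand in for Scheduler.score_fusion_memory and
# Scheduler.has_shared_data_after_reordering_loop.
def pair_shares_data(
    score_fusion_memory: Callable[[object, object], int],
    shares_data_after_reordering: Callable[[object, object], bool],
    node1: object,
    node2: object,
) -> bool:
    no_shared_data = score_fusion_memory(node1, node2) == 0
    if no_shared_data:
        # Only a zero-byte overlap makes the scheduler ask whether reordering
        # loops after fusion would expose shared data.
        no_shared_data = not shares_data_after_reordering(node1, node2)
    return not no_shared_data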

@@ -2922,10 +2922,7 @@ class Scheduler:
                 node2.get_name(),
             )
-        return (
-            self.score_fusion_memory(node1, node2)
-            >= config.score_fusion_memory_threshold
-        )
+        return self.score_fusion_memory(node1, node2) > 0

     def unfusable_node(self, node: BaseSchedulerNode) -> bool:
         """

@@ -2993,10 +2990,7 @@ class Scheduler:
             return False
         del device2

-        no_shared_data = (
-            self.score_fusion_memory(node1, node2)
-            < config.score_fusion_memory_threshold
-        )
+        no_shared_data = self.score_fusion_memory(node1, node2) == 0
         if no_shared_data:
             no_shared_data = not self.has_shared_data_after_reordering_loop(
                 node1, node2