Revert "Still run TritonBundler with BundledAOTAutogradCache, save autotune results (#158048)"

This reverts commit 8e57cdb746.

Reverted https://github.com/pytorch/pytorch/pull/158048 on behalf of https://github.com/jeffdaily due to ROCm failures caused by a unit test introduced in this PR, for which no pre-merge signal was available ([comment](https://github.com/pytorch/pytorch/pull/158048#issuecomment-3098746624))
PyTorch MergeBot 2025-07-21 20:45:21 +00:00
parent b1a0c34dd3
commit bc379aebe2
4 changed files with 3 additions and 79 deletions
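
For context, tests that exercise CUDA/Triton autotuning are usually kept off ROCm runners with an explicit skip. A minimal, illustrative sketch (not the reverted test; it assumes the standard skipIfRocm helper from torch.testing._internal.common_utils):

# Illustrative only: a minimal guard pattern for keeping a CUDA/Triton
# autotune test off ROCm runners. Not the actual test removed by this revert.
import unittest

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TestCase


class ExampleAutotuneTest(TestCase):
    @skipIfRocm  # keep the test off ROCm runners that have no pre-merge signal
    def test_max_autotune_compile(self):
        if not torch.cuda.is_available():
            raise unittest.SkipTest("Requires CUDA/Triton")

        def fn(x, y):
            return x.sin() + y

        x = torch.randn(3, 3, device="cuda")
        y = torch.randn(3, 3, device="cuda")
        compiled = torch.compile(fn, mode="max-autotune")
        self.assertEqual(fn(x, y), compiled(x, y))


if __name__ == "__main__":
    run_tests()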

View File

@@ -15,7 +15,6 @@ import torch.utils.cpp_extension
 from torch._dynamo.package import CompilePackage, DiskDynamoStore, DynamoCache
 from torch._dynamo.precompile_context import PrecompileContext
 from torch._functorch import config as functorch_config
-from torch._inductor.mock_cache import global_stats, PatchCaches, Stats
 from torch._inductor.runtime.runtime_utils import cache_dir
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -429,39 +428,6 @@ def add(x, y):
         self.assertEqual(expected, [result1, result2])
         self.assertEqual(torch._dynamo.convert_frame.FRAME_COUNTER, total_frames)

-    @parametrize("device", ("cuda", "xpu"))
-    @torch._dynamo.config.patch(caching_precompile=True)
-    def test_automatic_dynamo_autotune_cache(self, device):
-        if device == "cuda" and not HAS_CUDA:
-            raise unittest.SkipTest("Requires CUDA/Triton")
-        if device == "xpu" and not HAS_XPU:
-            raise unittest.SkipTest("Requires XPU/Triton")
-
-        def fn(x, y):
-            return x.sin() + y
-
-        arg1 = torch.randn(3, 3, device=device)
-        arg2 = torch.randn(3, 3, device=device)
-        expected = fn(arg1, arg2).clone()
-
-        with PatchCaches():
-            compiled_fn1 = torch.compile(fn, mode="max-autotune")
-            result = compiled_fn1(arg1, arg2).clone()
-            self.assertEqual(expected, result)
-            self.assertEqual(global_stats.autotune_local, Stats(1, 0, 1))
-
-            DynamoCache.clear()
-            total_frames = torch._dynamo.convert_frame.FRAME_COUNTER
-            self._save_and_reload(
-                expected_backends=1, expected_dynamo=1, expected_autotune=1
-            )
-            compiled_fn1 = torch.compile(fn, mode="max-autotune")
-            with torch.compiler.set_stance("fail_on_recompile"):
-                result1 = compiled_fn1(arg1, arg2).clone()
-            self.assertEqual(expected, result1)
-            self.assertEqual(torch._dynamo.convert_frame.FRAME_COUNTER, total_frames)
-            self.assertEqual(global_stats.autotune_local, Stats(2, 1, 1))
-
     @parametrize("device", ("cpu", "cuda", "xpu"))
     @torch._dynamo.config.patch(caching_precompile=True)
     def test_automatic_dynamo_recompiles(self, device):
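
The removed test drives the mock cache counters from torch._inductor.mock_cache. A hedged sketch of that pattern, assuming the Stats fields are ordered (puts, hits, misses), so Stats(1, 0, 1) means one write, zero hits, one miss on the first compile:

# Sketch of the PatchCaches/global_stats pattern the removed test uses.
# Assumption: Stats(num_put, num_get_hit, num_get_miss).
import torch
from torch._inductor.mock_cache import global_stats, PatchCaches, Stats


def fn(x, y):
    return x.sin() + y


x = torch.randn(3, 3, device="cuda")
y = torch.randn(3, 3, device="cuda")

with PatchCaches():
    compiled = torch.compile(fn, mode="max-autotune")
    compiled(x, y)  # first run: autotune result misses, then is written locally
    assert global_stats.autotune_local == Stats(1, 0, 1)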

View File

@@ -70,8 +70,7 @@ class PrecompileContext(CacheArtifactManager):
     The following artifact types are supported by PrecompileContext:
      - BundledAOTAutogradCacheArtifact
-     - DynamoCodeStateArtifact
-     - AutotuneCacheArtifact (regular autotune results, same as Megacache)
+     - CodeStateArtifact (from torch._dynamo.package once available)

     """

     # Protected by the compile_lock
@@ -150,12 +149,8 @@ class PrecompileContext(CacheArtifactManager):
         artifacts_by_key = {}
         cache_info = CacheInfo()
         for artifact in chain(*artifacts.values()):
-            if artifact.type() == "autotune":
-                # Populate autotune cache artifacts
-                artifact.populate_cache()
-            else:
-                artifacts_by_key[artifact.key] = artifact
+            artifacts_by_key[artifact.key] = artifact
             cache_info.add(artifact)

         from torch._dynamo.package import _BackendId, DynamoCache
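
For readability, the pre-revert loop body removed above, restated with comments (variable shapes taken from the surrounding hunk):

# Simplified sketch of the pre-revert dispatch inside the artifact loop.
for artifact in chain(*artifacts.values()):
    if artifact.type() == "autotune":
        artifact.populate_cache()  # pre-revert: flush autotune results immediately
    else:
        artifacts_by_key[artifact.key] = artifact  # keep for keyed lookup
    cache_info.add(artifact)  # record in CacheInfo either way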

View File

@@ -909,37 +909,10 @@ def _compile_fx_inner(
         else:
             log.debug("Failed to generate FX cache key")

-        if torch._functorch.config.bundled_autograd_cache:
-            assert mb_compiled_graph is None
-            assert cache_info is None
-            # When using bundled autograd cache, we still want
-            # to use the TritonBundler, but we don't want to save
-            # the results here. The results will get saved directly
-            # to AOTAutogradCache.
-            TritonBundler.begin_compile()
-            try:
-                mb_compiled_graph = fx_codegen_and_compile(
-                    gm, example_inputs, inputs_to_check, **graph_kwargs
-                )
-                assert mb_compiled_graph is not None
-                (
-                    triton_bundle,
-                    triton_bundler_meta,
-                ) = TritonBundler.collect()
-                mb_compiled_graph.set_triton_bundle(triton_bundle)
-            except (ShortenTraceback, SkipFrame):
-                raise
-            except Exception as e:
-                raise InductorError(e, currentframe()).with_traceback(
-                    e.__traceback__
-                ) from None
-            finally:
-                TritonBundler.end_compile()
-
         # CACHE BYPASS: Compile the graph, don't save it to the cache
         # (this can happen either because cache was disabled, or we
         # determined the input is uncacheable)
-        elif cache_info is None or cache_info["cache_state"] == "bypass":
+        if cache_info is None or cache_info["cache_state"] == "bypass":
             assert mb_compiled_graph is None
             log.debug(
                 "FX cache bypass reason: %s",

View File

@ -35,7 +35,6 @@ from typing import Any, Optional, TYPE_CHECKING
from typing_extensions import override from typing_extensions import override
import torch import torch
from torch._dynamo.precompile_context import PrecompileContext
from torch._inductor.runtime.runtime_utils import cache_dir from torch._inductor.runtime.runtime_utils import cache_dir
from torch.compiler._cache import ( from torch.compiler._cache import (
CacheArtifact, CacheArtifact,
@@ -126,7 +125,6 @@ class AutotuneCache:
     ) -> Optional[AutotuneCache]:
         cache = AutotuneCache(configs_hash)
         key = AutotuneCache._prepare_key(filename)
-
         cache._setup_local_cache(inductor_meta, os.path.dirname(filename), key)
         cache._setup_remote_autotune_cache(inductor_meta, key)
         if cache.local_cache or cache.remote_cache:
@@ -302,10 +300,6 @@ class AutotuneCache:
             CacheArtifactManager.record_artifact(
                 AutotuneCacheArtifact.type(), autotune_artifact_key, data
             )
-            if torch._dynamo.config.caching_precompile:
-                PrecompileContext.record_artifact(
-                    AutotuneCacheArtifact.type(), autotune_artifact_key, data
-                )

         if log.isEnabledFor(logging.DEBUG):
             type_str = "coordesc" if found_by_coordesc else "heuristic"
@@ -631,10 +625,6 @@ class LocalAutotuneCache(RemoteCache[JsonDataTy]):
             CacheArtifactManager.record_artifact(
                 AutotuneCacheArtifact.type(), autotune_artifact_key, result
             )
-            if torch._dynamo.config.caching_precompile:
-                PrecompileContext.record_artifact(
-                    AutotuneCacheArtifact.type(), autotune_artifact_key, result
-                )
         return result

     @override
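
Both hunks above remove the same pattern: when caching_precompile is enabled, autotune results were mirrored into PrecompileContext in addition to the regular CacheArtifactManager (Megacache) record. A condensed sketch of that pre-revert behavior (record_autotune_result is an illustrative helper; "autotune" stands in for AutotuneCacheArtifact.type()):

# Condensed sketch of the pre-revert dual recording removed in the two hunks
# above. record_autotune_result is illustrative, not a function in the codebase.
import torch
from torch._dynamo.precompile_context import PrecompileContext
from torch.compiler._cache import CacheArtifactManager


def record_autotune_result(autotune_artifact_key: str, data) -> None:
    # Always recorded for Megacache.
    CacheArtifactManager.record_artifact("autotune", autotune_artifact_key, data)
    # Pre-revert: also mirrored into PrecompileContext when precompile caching
    # is enabled, so saved precompile bundles carry autotune results.
    if torch._dynamo.config.caching_precompile:
        PrecompileContext.record_artifact("autotune", autotune_artifact_key, data)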