[cutlass backend] turn autotuning logs off by default + rename log to autotuning log (#147922)

Things we did:
* Turn off the autotuning logs by default.
* Rename the autotuning logger from log to autotuning_log, so that people are aware it is a special artifact log (see the example below for how to re-enable it).
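
With this change the autotuning output stays silent unless the artifact log is enabled explicitly. A minimal sketch of turning it back on, assuming the "autotuning" artifact behaves like other artifacts registered via register_artifact (i.e. it can be enabled through TORCH_LOGS or torch._logging.set_logs):

```python
# Minimal sketch: re-enable the autotuning artifact log, which is off by default
# after this PR. Assumes "autotuning" is accepted by the usual TORCH_LOGS /
# set_logs machinery, like other artifacts registered via register_artifact.

# Option 1: via the environment, before launching the workload:
#   TORCH_LOGS="autotuning" python my_script.py

# Option 2: programmatically:
import torch._logging

torch._logging.set_logs(autotuning=True)
# From here on, the autotuning_log.debug(...) calls in the files below emit output.
```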

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147922
Approved by: https://github.com/eellison
henrylhtsang 2025-02-25 19:04:38 -08:00 committed by PyTorch MergeBot
parent 976ff5cf01
commit 201666d77d
3 changed files with 21 additions and 20 deletions

View File

@@ -51,7 +51,8 @@ from .virtualized import V
CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
EXIT_HANDLER_REGISTERED = False
log = getArtifactLogger(__name__, "autotuning")
autotuning_log = getArtifactLogger(__name__, "autotuning")
log = logging.getLogger(__name__)
# Used to synchronize between parent and child processes
@@ -109,14 +110,14 @@ class TuningProcess:
"""
Entry point for the child process.
"""
log.debug(
autotuning_log.debug(
"Entering TuningProcess child. Visible devices = %s",
os.environ.get(CUDA_VISIBLE_DEVICES),
)
try:
TuningProcess.workloop(request_queue, response_queue)
except Exception:
log.exception("Exception in TuningProcess")
autotuning_log.exception("Exception in TuningProcess")
@staticmethod
def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
@@ -262,14 +263,14 @@ class TuningProcess:
self.terminate()
self.process.join(timeout=graceful_timeout)
if self.process.is_alive():
log.warning(
autotuning_log.warning(
"Sending SIGTERM to process with PID %d",
self.process.pid,
)
self.process.terminate()
self.process.join(timeout=terminate_timeout)
if self.process.is_alive():
log.error(
autotuning_log.error(
"Sending SIGKILL to process with PID %d",
self.process.pid,
)
@@ -526,7 +527,7 @@ class BenchmarkRequest:
*input_tensors: torch.Tensor,
output_tensor: Optional[torch.Tensor] = None,
) -> float:
debug = log.isEnabledFor(logging.DEBUG)
debug = autotuning_log.isEnabledFor(logging.DEBUG)
if debug:
start_ts = time.time()
@@ -543,7 +544,7 @@ class BenchmarkRequest:
fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)
except NonzeroWorkspaceNotSupportedError:
# Skipping all ops with nonzero workspace requirements
log.info("Skipping op due to nonzero workspace requirement")
autotuning_log.info("Skipping op due to nonzero workspace requirement")
return float("inf")
if debug:
@@ -554,7 +555,7 @@ class BenchmarkRequest:
if debug:
bench_elapse = time.time() - start_ts # type: ignore[possibly-undefined]
log.debug(
autotuning_log.debug(
"InChildProcess %s: load %f, create tensor %f, bench %f",
str(self),
load_elapse, # type: ignore[possibly-undefined]
@@ -657,7 +658,7 @@ class TritonBenchmarkRequest(BenchmarkRequest):
self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
) -> Callable[[], None]:
mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
log.debug(
autotuning_log.debug(
"benchmark module key: %s, path: %s",
self.module_cache_key,
self.module_path,
@@ -781,9 +782,9 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
def precompile(self):
# Prepopulate CUDACodeCache
# may happen in separate Threadpool
log.debug("Precompiling %s", self)
autotuning_log.debug("Precompiling %s", self)
CUDACodeCache.compile(self.source_code, "so")
log.debug("Done precompiling %s", self)
autotuning_log.debug("Done precompiling %s", self)
def make_run_fn(
self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -794,7 +795,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
c_void_p(tensor.data_ptr())
for tensor in list(input_tensors) + [output_tensor]
]
log.debug(
autotuning_log.debug(
"make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
self.kernel_name,
self.source_file,
@@ -848,7 +849,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
)
torch.cuda.synchronize() # shake out any CUDA errors
self.workspace_size = c_workspace_size.value
log.debug(
autotuning_log.debug(
"update_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s", # noqa: B950
self.workspace_size,
self.kernel_name,
@@ -895,9 +896,9 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
def precompile(self):
# Prepopulate CppCodeCache
# may happen in separate Threadpool
log.debug("Precompiling %s", self)
autotuning_log.debug("Precompiling %s", self)
CppCodeCache.load(self.source_code, device_type="cpu")
log.debug("Done precompiling %s", self)
autotuning_log.debug("Done precompiling %s", self)
def make_run_fn(
self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -905,7 +906,7 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
# TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf
self.DLL = CppCodeCache.load(self.source_code, device_type="cpu")
args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]]
log.debug(
autotuning_log.debug(
"make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s",
self.kernel_name,
self.DLL,

View File

@@ -19,7 +19,7 @@ from ..common import KernelTemplate
from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel
log = getArtifactLogger(__name__, "autotuning")
autotuning_log = getArtifactLogger(__name__, "autotuning")
@dataclass(frozen=True)
@@ -80,8 +80,8 @@ class CUDATemplate(KernelTemplate):
) as kernel:
code = self.render(kernel=kernel, **kwargs)
_, call_args, _, _ = kernel.args.python_argdefs()
log.debug("Generated Code:\n%s", code)
log.debug(
autotuning_log.debug("Generated Code:\n%s", code)
autotuning_log.debug(
"Args: cpp_argdefs: %s, python_argdefs: %s",
kernel.args.cpp_argdefs(),
kernel.args.python_argdefs(),

View File

@@ -209,8 +209,8 @@ register_artifact(
register_artifact(
"autotuning",
"Autotuning choice logs, such as kernel source, perf, and tuning parameters.",
off_by_default=True,
)
register_artifact(
"graph_region_expansion",
"Logs detailed steps of the duplicate graph region tracker expansion algorithm",