From 201666d77dd980d71e392f705d7fac6256ee28f9 Mon Sep 17 00:00:00 2001
From: henrylhtsang <henrylhtsang@meta.com>
Date: Tue, 25 Feb 2025 19:04:38 -0800
Subject: [PATCH] [cutlass backend] turn autotuning logs off by default +
 rename log to autotuning log (#147922)

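This change:
* Turns the "autotuning" artifact logs off by default (`off_by_default=True` in `register_artifact`).
* Renames the autotuning artifact logger from `log` to `autotuning_log`, so that readers are aware it is a special artifact log rather than the regular module logger. In `autotune_process.py`, `log` is now a plain `logging.getLogger(__name__)` logger.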

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147922
Approved by: https://github.com/eellison
---
 torch/_inductor/autotune_process.py           | 33 ++++++++++---------
 torch/_inductor/codegen/cuda/cuda_template.py |  6 ++--
 torch/_logging/_registrations.py              |  2 +-
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py
index 61a5b959e12..dee86941666 100644
--- a/torch/_inductor/autotune_process.py
+++ b/torch/_inductor/autotune_process.py
@@ -51,7 +51,8 @@ from .virtualized import V
 CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
 EXIT_HANDLER_REGISTERED = False
 
-log = getArtifactLogger(__name__, "autotuning")
+autotuning_log = getArtifactLogger(__name__, "autotuning")
+log = logging.getLogger(__name__)
 
 
 # Used to synchronize between parent and child processes
@@ -109,14 +110,14 @@ class TuningProcess:
         """
         Entry point for the child process.
         """
-        log.debug(
+        autotuning_log.debug(
             "Entering TuningProcess child. Visible devices = %s",
             os.environ.get(CUDA_VISIBLE_DEVICES),
         )
         try:
             TuningProcess.workloop(request_queue, response_queue)
         except Exception:
-            log.exception("Exception in TuningProcess")
+            autotuning_log.exception("Exception in TuningProcess")
 
     @staticmethod
     def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
@@ -262,14 +263,14 @@ class TuningProcess:
             self.terminate()
             self.process.join(timeout=graceful_timeout)
             if self.process.is_alive():
-                log.warning(
+                autotuning_log.warning(
                     "Sending SIGTERM to process with PID %d",
                     self.process.pid,
                 )
                 self.process.terminate()
                 self.process.join(timeout=terminate_timeout)
                 if self.process.is_alive():
-                    log.error(
+                    autotuning_log.error(
                         "Sending SIGKILL to process with PID %d",
                         self.process.pid,
                     )
@@ -526,7 +527,7 @@ class BenchmarkRequest:
         *input_tensors: torch.Tensor,
         output_tensor: Optional[torch.Tensor] = None,
     ) -> float:
-        debug = log.isEnabledFor(logging.DEBUG)
+        debug = autotuning_log.isEnabledFor(logging.DEBUG)
         if debug:
             start_ts = time.time()
 
@@ -543,7 +544,7 @@ class BenchmarkRequest:
             fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)
         except NonzeroWorkspaceNotSupportedError:
             # Skipping all ops with nonzero workspace requirements
-            log.info("Skipping op due to nonzero workspace requirement")
+            autotuning_log.info("Skipping op due to nonzero workspace requirement")
             return float("inf")
 
         if debug:
@@ -554,7 +555,7 @@ class BenchmarkRequest:
 
         if debug:
             bench_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
-            log.debug(
+            autotuning_log.debug(
                 "InChildProcess %s: load %f, create tensor %f, bench %f",
                 str(self),
                 load_elapse,  # type: ignore[possibly-undefined]
@@ -657,7 +658,7 @@ class TritonBenchmarkRequest(BenchmarkRequest):
         self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
     ) -> Callable[[], None]:
         mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
-        log.debug(
+        autotuning_log.debug(
             "benchmark module key: %s, path: %s",
             self.module_cache_key,
             self.module_path,
@@ -781,9 +782,9 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
     def precompile(self):
         # Prepopulate CUDACodeCache
         # may happen in separate Threadpool
-        log.debug("Precompiling %s", self)
+        autotuning_log.debug("Precompiling %s", self)
         CUDACodeCache.compile(self.source_code, "so")
-        log.debug("Done precompiling %s", self)
+        autotuning_log.debug("Done precompiling %s", self)
 
     def make_run_fn(
         self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -794,7 +795,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
             c_void_p(tensor.data_ptr())
             for tensor in list(input_tensors) + [output_tensor]
         ]
-        log.debug(
+        autotuning_log.debug(
             "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
             self.kernel_name,
             self.source_file,
@@ -848,7 +849,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
         )
         torch.cuda.synchronize()  # shake out any CUDA errors
         self.workspace_size = c_workspace_size.value
-        log.debug(
+        autotuning_log.debug(
             "update_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",  # noqa: B950
             self.workspace_size,
             self.kernel_name,
@@ -895,9 +896,9 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
     def precompile(self):
         # Prepopulate CppCodeCache
         # may happen in separate Threadpool
-        log.debug("Precompiling %s", self)
+        autotuning_log.debug("Precompiling %s", self)
         CppCodeCache.load(self.source_code, device_type="cpu")
-        log.debug("Done precompiling %s", self)
+        autotuning_log.debug("Done precompiling %s", self)
 
     def make_run_fn(
         self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -905,7 +906,7 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
         # TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf
         self.DLL = CppCodeCache.load(self.source_code, device_type="cpu")
         args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]]
-        log.debug(
+        autotuning_log.debug(
             "make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s",
             self.kernel_name,
             self.DLL,
diff --git a/torch/_inductor/codegen/cuda/cuda_template.py b/torch/_inductor/codegen/cuda/cuda_template.py
index 2f369e86bc8..2e4281065aa 100644
--- a/torch/_inductor/codegen/cuda/cuda_template.py
+++ b/torch/_inductor/codegen/cuda/cuda_template.py
@@ -19,7 +19,7 @@ from ..common import KernelTemplate
 from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel
 
 
-log = getArtifactLogger(__name__, "autotuning")
+autotuning_log = getArtifactLogger(__name__, "autotuning")
 
 
 @dataclass(frozen=True)
@@ -80,8 +80,8 @@ class CUDATemplate(KernelTemplate):
         ) as kernel:
             code = self.render(kernel=kernel, **kwargs)
             _, call_args, _, _ = kernel.args.python_argdefs()
-            log.debug("Generated Code:\n%s", code)
-            log.debug(
+            autotuning_log.debug("Generated Code:\n%s", code)
+            autotuning_log.debug(
                 "Args: cpp_argdefs: %s, python_argdefs: %s",
                 kernel.args.cpp_argdefs(),
                 kernel.args.python_argdefs(),
diff --git a/torch/_logging/_registrations.py b/torch/_logging/_registrations.py
index 3d805b639ab..a015a5be269 100644
--- a/torch/_logging/_registrations.py
+++ b/torch/_logging/_registrations.py
@@ -209,8 +209,8 @@ register_artifact(
 register_artifact(
     "autotuning",
     "Autotuning choice logs, such as kernel source, perf, and tuning parameters.",
+    off_by_default=True,
 )
-
 register_artifact(
     "graph_region_expansion",
     "Logs detailed steps of the duplicate graph region tracker expansion algorithm",