From 201666d77dd980d71e392f705d7fac6256ee28f9 Mon Sep 17 00:00:00 2001 From: henrylhtsang Date: Tue, 25 Feb 2025 19:04:38 -0800 Subject: [PATCH] [cutlass backend] turn autotuning logs off by default + rename log to autotuning log (#147922) things we did: * turn off autotuning logs by default * rename autotuning logs from log to autotuning_log, so people are aware that it is a special artifact log. Pull Request resolved: https://github.com/pytorch/pytorch/pull/147922 Approved by: https://github.com/eellison --- torch/_inductor/autotune_process.py | 33 ++++++++++--------- torch/_inductor/codegen/cuda/cuda_template.py | 6 ++-- torch/_logging/_registrations.py | 2 +- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py index 61a5b959e12..dee86941666 100644 --- a/torch/_inductor/autotune_process.py +++ b/torch/_inductor/autotune_process.py @@ -51,7 +51,8 @@ from .virtualized import V CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES" EXIT_HANDLER_REGISTERED = False -log = getArtifactLogger(__name__, "autotuning") +autotuning_log = getArtifactLogger(__name__, "autotuning") +log = logging.getLogger(__name__) # Used to synchronize between parent and child processes @@ -109,14 +110,14 @@ class TuningProcess: """ Entry point for the child process. """ - log.debug( + autotuning_log.debug( "Entering TuningProcess child. Visible devices = %s", os.environ.get(CUDA_VISIBLE_DEVICES), ) try: TuningProcess.workloop(request_queue, response_queue) except Exception: - log.exception("Exception in TuningProcess") + autotuning_log.exception("Exception in TuningProcess") @staticmethod def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None: @@ -262,14 +263,14 @@ class TuningProcess: self.terminate() self.process.join(timeout=graceful_timeout) if self.process.is_alive(): - log.warning( + autotuning_log.warning( "Sending SIGTERM to process with PID %d", self.process.pid, ) self.process.terminate() self.process.join(timeout=terminate_timeout) if self.process.is_alive(): - log.error( + autotuning_log.error( "Sending SIGKILL to process with PID %d", self.process.pid, ) @@ -526,7 +527,7 @@ class BenchmarkRequest: *input_tensors: torch.Tensor, output_tensor: Optional[torch.Tensor] = None, ) -> float: - debug = log.isEnabledFor(logging.DEBUG) + debug = autotuning_log.isEnabledFor(logging.DEBUG) if debug: start_ts = time.time() @@ -543,7 +544,7 @@ class BenchmarkRequest: fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor) except NonzeroWorkspaceNotSupportedError: # Skipping all ops with nonzero workspace requirements - log.info("Skipping op due to nonzero workspace requirement") + autotuning_log.info("Skipping op due to nonzero workspace requirement") return float("inf") if debug: @@ -554,7 +555,7 @@ class BenchmarkRequest: if debug: bench_elapse = time.time() - start_ts # type: ignore[possibly-undefined] - log.debug( + autotuning_log.debug( "InChildProcess %s: load %f, create tensor %f, bench %f", str(self), load_elapse, # type: ignore[possibly-undefined] @@ -657,7 +658,7 @@ class TritonBenchmarkRequest(BenchmarkRequest): self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor ) -> Callable[[], None]: mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path) - log.debug( + autotuning_log.debug( "benchmark module key: %s, path: %s", self.module_cache_key, self.module_path, @@ -781,9 +782,9 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): def precompile(self): # Prepopulate CUDACodeCache # may happen in separate Threadpool - log.debug("Precompiling %s", self) + autotuning_log.debug("Precompiling %s", self) CUDACodeCache.compile(self.source_code, "so") - log.debug("Done precompiling %s", self) + autotuning_log.debug("Done precompiling %s", self) def make_run_fn( self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor @@ -794,7 +795,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): c_void_p(tensor.data_ptr()) for tensor in list(input_tensors) + [output_tensor] ] - log.debug( + autotuning_log.debug( "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s", self.kernel_name, self.source_file, @@ -848,7 +849,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): ) torch.cuda.synchronize() # shake out any CUDA errors self.workspace_size = c_workspace_size.value - log.debug( + autotuning_log.debug( "update_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s", # noqa: B950 self.workspace_size, self.kernel_name, @@ -895,9 +896,9 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): def precompile(self): # Prepopulate CppCodeCache # may happen in separate Threadpool - log.debug("Precompiling %s", self) + autotuning_log.debug("Precompiling %s", self) CppCodeCache.load(self.source_code, device_type="cpu") - log.debug("Done precompiling %s", self) + autotuning_log.debug("Done precompiling %s", self) def make_run_fn( self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor @@ -905,7 +906,7 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): # TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf self.DLL = CppCodeCache.load(self.source_code, device_type="cpu") args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]] - log.debug( + autotuning_log.debug( "make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s", self.kernel_name, self.DLL, diff --git a/torch/_inductor/codegen/cuda/cuda_template.py b/torch/_inductor/codegen/cuda/cuda_template.py index 2f369e86bc8..2e4281065aa 100644 --- a/torch/_inductor/codegen/cuda/cuda_template.py +++ b/torch/_inductor/codegen/cuda/cuda_template.py @@ -19,7 +19,7 @@ from ..common import KernelTemplate from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel -log = getArtifactLogger(__name__, "autotuning") +autotuning_log = getArtifactLogger(__name__, "autotuning") @dataclass(frozen=True) @@ -80,8 +80,8 @@ class CUDATemplate(KernelTemplate): ) as kernel: code = self.render(kernel=kernel, **kwargs) _, call_args, _, _ = kernel.args.python_argdefs() - log.debug("Generated Code:\n%s", code) - log.debug( + autotuning_log.debug("Generated Code:\n%s", code) + autotuning_log.debug( "Args: cpp_argdefs: %s, python_argdefs: %s", kernel.args.cpp_argdefs(), kernel.args.python_argdefs(), diff --git a/torch/_logging/_registrations.py b/torch/_logging/_registrations.py index 3d805b639ab..a015a5be269 100644 --- a/torch/_logging/_registrations.py +++ b/torch/_logging/_registrations.py @@ -209,8 +209,8 @@ register_artifact( register_artifact( "autotuning", "Autotuning choice logs, such as kernel source, perf, and tuning parameters.", + off_by_default=True, ) - register_artifact( "graph_region_expansion", "Logs detailed steps of the duplicate graph region tracker expansion algorithm",