[cutlass backend] turn autotuning logs off by default + rename log to autotuning log (#147922)

Things we did:
* Turn off the autotuning logs by default.
* Rename the autotuning logger from log to autotuning_log, so that people are aware it is a special artifact log (see the example below for how to re-enable it).
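
With this change the autotuning output stays silent unless the artifact log is enabled explicitly. A minimal sketch of turning it back on, assuming the "autotuning" artifact behaves like other artifacts registered via register_artifact (i.e. it can be enabled through TORCH_LOGS or torch._logging.set_logs):

```python
# Minimal sketch: re-enable the autotuning artifact log, which is off by default
# after this PR. Assumes "autotuning" is accepted by the usual TORCH_LOGS /
# set_logs machinery, like other artifacts registered via register_artifact.

# Option 1: via the environment, before launching the workload:
#   TORCH_LOGS="autotuning" python my_script.py

# Option 2: programmatically:
import torch._logging

torch._logging.set_logs(autotuning=True)
# From here on, the autotuning_log.debug(...) calls in the files below emit output.
```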

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147922
Approved by: https://github.com/eellison
henrylhtsang 2025-02-25 19:04:38 -08:00 committed by PyTorch MergeBot
parent 976ff5cf01
commit 201666d77d
3 changed files with 21 additions and 20 deletions

View File

@@ -51,7 +51,8 @@ from .virtualized import V
CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
EXIT_HANDLER_REGISTERED = False
log = getArtifactLogger(__name__, "autotuning")
autotuning_log = getArtifactLogger(__name__, "autotuning")
log = logging.getLogger(__name__)
# Used to synchronize between parent and child processes
@@ -109,14 +110,14 @@ class TuningProcess:
"""
Entry point for the child process.
"""
log.debug(
autotuning_log.debug(
"Entering TuningProcess child. Visible devices = %s",
os.environ.get(CUDA_VISIBLE_DEVICES),
)
try:
TuningProcess.workloop(request_queue, response_queue)
except Exception:
log.exception("Exception in TuningProcess")
autotuning_log.exception("Exception in TuningProcess")
@staticmethod
def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
@@ -262,14 +263,14 @@ class TuningProcess:
self.terminate()
self.process.join(timeout=graceful_timeout)
if self.process.is_alive():
log.warning(
autotuning_log.warning(
"Sending SIGTERM to process with PID %d",
self.process.pid,
)
self.process.terminate()
self.process.join(timeout=terminate_timeout)
if self.process.is_alive():
log.error(
autotuning_log.error(
"Sending SIGKILL to process with PID %d",
self.process.pid,
)
@@ -526,7 +527,7 @@ class BenchmarkRequest:
*input_tensors: torch.Tensor,
output_tensor: Optional[torch.Tensor] = None,
) -> float:
debug = log.isEnabledFor(logging.DEBUG)
debug = autotuning_log.isEnabledFor(logging.DEBUG)
if debug:
start_ts = time.time()
@@ -543,7 +544,7 @@ class BenchmarkRequest:
fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)
except NonzeroWorkspaceNotSupportedError:
# Skipping all ops with nonzero workspace requirements
log.info("Skipping op due to nonzero workspace requirement")
autotuning_log.info("Skipping op due to nonzero workspace requirement")
return float("inf")
if debug:
@@ -554,7 +555,7 @@ class BenchmarkRequest:
if debug:
bench_elapse = time.time() - start_ts # type: ignore[possibly-undefined]
log.debug(
autotuning_log.debug(
"InChildProcess %s: load %f, create tensor %f, bench %f",
str(self),
load_elapse, # type: ignore[possibly-undefined]
@@ -657,7 +658,7 @@ class TritonBenchmarkRequest(BenchmarkRequest):
self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
) -> Callable[[], None]:
mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
log.debug(
autotuning_log.debug(
"benchmark module key: %s, path: %s",
self.module_cache_key,
self.module_path,
@@ -781,9 +782,9 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
def precompile(self):
# Prepopulate CUDACodeCache
# may happen in separate Threadpool
log.debug("Precompiling %s", self)
autotuning_log.debug("Precompiling %s", self)
CUDACodeCache.compile(self.source_code, "so")
log.debug("Done precompiling %s", self)
autotuning_log.debug("Done precompiling %s", self)
def make_run_fn(
self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -794,7 +795,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
c_void_p(tensor.data_ptr())
for tensor in list(input_tensors) + [output_tensor]
]
log.debug(
autotuning_log.debug(
"make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
self.kernel_name,
self.source_file,
@@ -848,7 +849,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
)
torch.cuda.synchronize() # shake out any CUDA errors
self.workspace_size = c_workspace_size.value
log.debug(
autotuning_log.debug(
"update_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s", # noqa: B950
self.workspace_size,
self.kernel_name,
@@ -895,9 +896,9 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
def precompile(self):
# Prepopulate CppCodeCache
# may happen in separate Threadpool
log.debug("Precompiling %s", self)
autotuning_log.debug("Precompiling %s", self)
CppCodeCache.load(self.source_code, device_type="cpu")
log.debug("Done precompiling %s", self)
autotuning_log.debug("Done precompiling %s", self)
def make_run_fn(
self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -905,7 +906,7 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
# TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf
self.DLL = CppCodeCache.load(self.source_code, device_type="cpu")
args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]]
log.debug(
autotuning_log.debug(
"make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s",
self.kernel_name,
self.DLL,

View File

@@ -19,7 +19,7 @@ from ..common import KernelTemplate
from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel
log = getArtifactLogger(__name__, "autotuning")
autotuning_log = getArtifactLogger(__name__, "autotuning")
@dataclass(frozen=True)
@@ -80,8 +80,8 @@ class CUDATemplate(KernelTemplate):
) as kernel:
code = self.render(kernel=kernel, **kwargs)
_, call_args, _, _ = kernel.args.python_argdefs()
log.debug("Generated Code:\n%s", code)
log.debug(
autotuning_log.debug("Generated Code:\n%s", code)
autotuning_log.debug(
"Args: cpp_argdefs: %s, python_argdefs: %s",
kernel.args.cpp_argdefs(),
kernel.args.python_argdefs(),

View File

@@ -209,8 +209,8 @@ register_artifact(
register_artifact(
"autotuning",
"Autotuning choice logs, such as kernel source, perf, and tuning parameters.",
off_by_default=True,
)
register_artifact(
"graph_region_expansion",
"Logs detailed steps of the duplicate graph region tracker expansion algorithm",