[cutlass backend] turn autotuning logs off by default + rename log to autotuning log (#147922)
things we did:
* turn autotuning logs off by default
* rename the autotuning logger from log to autotuning_log, so people are aware that it is a special artifact log

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147922
Approved by: https://github.com/eellison
parent 976ff5cf01
commit 201666d77d
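Because the artifact is now off by default, these logs must be opted into explicitly. Below is a minimal sketch of re-enabling them, assuming the standard TORCH_LOGS artifact controls; the enabling mechanism itself is not part of this commit.

    # Sketch: opt back into the "autotuning" artifact log, which this change turns off by default.
    # Assumes the standard TORCH_LOGS artifact mechanism; not introduced by this commit.
    import os

    os.environ["TORCH_LOGS"] = "autotuning"  # equivalent to running: TORCH_LOGS="autotuning" python script.py

    import torch  # noqa: E402  # imported after TORCH_LOGS is set so the artifact filter sees it

    # If torch._logging.set_logs exposes the artifact as a keyword argument (an assumption,
    # not verified here), the programmatic form would be: torch._logging.set_logs(autotuning=True)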
@@ -51,7 +51,8 @@ from .virtualized import V
 CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
 EXIT_HANDLER_REGISTERED = False
 
-log = getArtifactLogger(__name__, "autotuning")
+autotuning_log = getArtifactLogger(__name__, "autotuning")
+log = logging.getLogger(__name__)
 
 
 # Used to synchronize between parent and child processes
@@ -109,14 +110,14 @@ class TuningProcess:
         """
         Entry point for the child process.
         """
-        log.debug(
+        autotuning_log.debug(
             "Entering TuningProcess child. Visible devices = %s",
             os.environ.get(CUDA_VISIBLE_DEVICES),
         )
         try:
             TuningProcess.workloop(request_queue, response_queue)
         except Exception:
-            log.exception("Exception in TuningProcess")
+            autotuning_log.exception("Exception in TuningProcess")
 
     @staticmethod
     def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
@@ -262,14 +263,14 @@ class TuningProcess:
         self.terminate()
         self.process.join(timeout=graceful_timeout)
         if self.process.is_alive():
-            log.warning(
+            autotuning_log.warning(
                 "Sending SIGTERM to process with PID %d",
                 self.process.pid,
             )
             self.process.terminate()
             self.process.join(timeout=terminate_timeout)
             if self.process.is_alive():
-                log.error(
+                autotuning_log.error(
                     "Sending SIGKILL to process with PID %d",
                     self.process.pid,
                 )
@@ -526,7 +527,7 @@ class BenchmarkRequest:
         *input_tensors: torch.Tensor,
         output_tensor: Optional[torch.Tensor] = None,
     ) -> float:
-        debug = log.isEnabledFor(logging.DEBUG)
+        debug = autotuning_log.isEnabledFor(logging.DEBUG)
         if debug:
             start_ts = time.time()
 
@@ -543,7 +544,7 @@ class BenchmarkRequest:
             fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)
         except NonzeroWorkspaceNotSupportedError:
             # Skipping all ops with nonzero workspace requirements
-            log.info("Skipping op due to nonzero workspace requirement")
+            autotuning_log.info("Skipping op due to nonzero workspace requirement")
             return float("inf")
 
         if debug:
@@ -554,7 +555,7 @@ class BenchmarkRequest:
 
         if debug:
             bench_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
-            log.debug(
+            autotuning_log.debug(
                 "InChildProcess %s: load %f, create tensor %f, bench %f",
                 str(self),
                 load_elapse,  # type: ignore[possibly-undefined]
@@ -657,7 +658,7 @@ class TritonBenchmarkRequest(BenchmarkRequest):
         self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
     ) -> Callable[[], None]:
         mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
-        log.debug(
+        autotuning_log.debug(
             "benchmark module key: %s, path: %s",
             self.module_cache_key,
             self.module_path,
@@ -781,9 +782,9 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
     def precompile(self):
         # Prepopulate CUDACodeCache
         # may happen in separate Threadpool
-        log.debug("Precompiling %s", self)
+        autotuning_log.debug("Precompiling %s", self)
         CUDACodeCache.compile(self.source_code, "so")
-        log.debug("Done precompiling %s", self)
+        autotuning_log.debug("Done precompiling %s", self)
 
     def make_run_fn(
         self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -794,7 +795,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
             c_void_p(tensor.data_ptr())
             for tensor in list(input_tensors) + [output_tensor]
         ]
-        log.debug(
+        autotuning_log.debug(
            "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
            self.kernel_name,
            self.source_file,
@@ -848,7 +849,7 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
         )
         torch.cuda.synchronize()  # shake out any CUDA errors
         self.workspace_size = c_workspace_size.value
-        log.debug(
+        autotuning_log.debug(
            "update_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",  # noqa: B950
            self.workspace_size,
            self.kernel_name,
@@ -895,9 +896,9 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
     def precompile(self):
         # Prepopulate CppCodeCache
         # may happen in separate Threadpool
-        log.debug("Precompiling %s", self)
+        autotuning_log.debug("Precompiling %s", self)
         CppCodeCache.load(self.source_code, device_type="cpu")
-        log.debug("Done precompiling %s", self)
+        autotuning_log.debug("Done precompiling %s", self)
 
     def make_run_fn(
         self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
@@ -905,7 +906,7 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
         # TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf
         self.DLL = CppCodeCache.load(self.source_code, device_type="cpu")
         args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]]
-        log.debug(
+        autotuning_log.debug(
             "make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s",
             self.kernel_name,
             self.DLL,
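For reference, the first hunk above leaves the module with two loggers: the artifact logger for autotuning output and a plain module logger for everything else. A minimal sketch of that split, using the names from the diff; the surrounding module details are assumptions.

    # Sketch of the two-logger pattern introduced above; names taken from the diff,
    # everything else is illustrative.
    import logging

    from torch._logging import getArtifactLogger

    # Artifact logger: emits only when the "autotuning" artifact is enabled (e.g. via TORCH_LOGS).
    autotuning_log = getArtifactLogger(__name__, "autotuning")

    # Regular module logger: follows the normal logging configuration.
    log = logging.getLogger(__name__)

    autotuning_log.debug("kernel source, perf numbers, and tuning parameters go here")
    log.warning("ordinary diagnostics stay on the module logger")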
@@ -19,7 +19,7 @@ from ..common import KernelTemplate
 from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel
 
 
-log = getArtifactLogger(__name__, "autotuning")
+autotuning_log = getArtifactLogger(__name__, "autotuning")
 
 
 @dataclass(frozen=True)
@@ -80,8 +80,8 @@ class CUDATemplate(KernelTemplate):
         ) as kernel:
             code = self.render(kernel=kernel, **kwargs)
             _, call_args, _, _ = kernel.args.python_argdefs()
-            log.debug("Generated Code:\n%s", code)
-            log.debug(
+            autotuning_log.debug("Generated Code:\n%s", code)
+            autotuning_log.debug(
                 "Args: cpp_argdefs: %s, python_argdefs: %s",
                 kernel.args.cpp_argdefs(),
                 kernel.args.python_argdefs(),
@@ -209,8 +209,9 @@ register_artifact(
 register_artifact(
     "autotuning",
     "Autotuning choice logs, such as kernel source, perf, and tuning parameters.",
+    off_by_default=True,
 )
 
 register_artifact(
     "graph_region_expansion",
     "Logs detailed steps of the duplicate graph region tracker expansion algorithm",