Lint rule to forbid direct use of logging.info/etc APIs (#90907)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90907
Approved by: https://github.com/jansel
This commit is contained in:
Edward Z. Yang 2022-12-16 08:56:17 +08:00 committed by PyTorch MergeBot
parent 82a191313e
commit eef019c14a
11 changed files with 79 additions and 35 deletions

View File

@@ -615,6 +615,37 @@ command = [
'@{{PATHSFILE}}'
]
[[linter]]
code = 'ROOT_LOGGING'
include_patterns = [
'**/*.py',
]
# These are not library code, but scripts in their own right, and so
# therefore are permitted to use logging
exclude_patterns = [
'tools/**',
'test/**',
'benchmarks/**',
'torch/distributed/run.py',
'functorch/benchmarks/**',
# Grandfathered in
'caffe2/**',
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=logging\.(debug|info|warn|warning|error|critical|log|exception)\(',
'--replace-pattern=s/logging\.(debug|info|warn|warning|error|critical|log|exception)\(/log.\1(/',
'--linter-name=ROOT_LOGGING',
'--error-name=use of root logger',
"""--error-description=\
Do not use root logger (logging.info, etc) directly; instead \
define 'log = logging.getLogger(__name__)' and call, e.g., log.info().
""",
'--',
'@{{PATHSFILE}}'
]
[[linter]]
code = 'CMAKE'
include_patterns = [

View File

@@ -481,11 +481,11 @@ def convert_frame(compiler_fn: CompilerFn, hooks: Hooks):
counters["frames"]["ok"] += 1
return result
except (NotImplementedError, Unsupported):
logging.info("converting frame raised unsupported, leaving it unconverted")
log.info("converting frame raised unsupported, leaving it unconverted")
except Exception:
if not config.suppress_errors:
raise
logging.info("converting frame raised error, suppressing error")
log.info("converting frame raised error, suppressing error")
return None
_convert_frame._torchdynamo_orig_callable = compiler_fn # type: ignore[attr-defined]

View File

@@ -1264,7 +1264,7 @@ def aot_wrapper_dedupe(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig,
flat_fn
)(*flat_args)
except RuntimeError as e:
logging.warning(
log.warning(
"Failed to collect metadata on function, produced code may be suboptimal. "
"Known situations this can occur are inference mode only compilation involving "
"resize_ or prims (!schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED); "

View File

@@ -22,6 +22,8 @@ from .partitioners import (
import torch.utils._pytree as pytree
log = logging.getLogger(__name__)
# These canonicalizations are needed here (and not decompositions), as the ops
# we're trying to canonicalize to CompositeImplicitAutograd.
@@ -333,7 +335,7 @@ def _save_fx_default(current_name, folder_name, dump_example_input, gm, example_
def graph_saver_helper(gm_to_save, args, type_name):
global graph_index
if len(gm_to_save.graph.nodes) == 0:
logging.log(
log.log(
logging.WARNING,
f"No nodes in graph {current_name}_{type_name}_{graph_index}.",
)

View File

@@ -6,12 +6,14 @@ from abc import ABC
from contextlib import contextmanager
from typing import Callable, Generic, List, NamedTuple, Optional, Set, TypeVar
log = logging.getLogger(__name__)
# TODO(voz): Stolen pattern, not sure why this is the case,
# but mypy complains.
try:
import sympy # type: ignore[import]
except ImportError:
logging.warning("No sympy found")
log.warning("No sympy found")
"""
torch._guards is the definitional source of truth for general purpose guard structures.

View File

@@ -13,6 +13,8 @@ from typing import Any, Dict, List, Optional, Set
__all__ = ['TimerRequest', 'TimerClient', 'RequestQueue', 'TimerServer', 'configure', 'expires']
log = logging.getLogger(__name__)
class TimerRequest:
"""
Data object representing a countdown timer acquisition and release
@@ -168,7 +170,7 @@ class TimerServer(abc.ABC):
try:
return self._reap_worker(worker_id)
except Exception as e:
logging.error(
log.error(
"Uncaught exception thrown from _reap_worker(), "
"check that the implementation correctly catches exceptions",
exc_info=e,
@@ -180,7 +182,7 @@ class TimerServer(abc.ABC):
try:
self._run_watchdog()
except Exception as e:
logging.error("Error running watchdog", exc_info=e)
log.error("Error running watchdog", exc_info=e)
def _run_watchdog(self):
batch_size = max(1, self._request_queue.size())
@@ -189,15 +191,15 @@ class TimerServer(abc.ABC):
now = time.time()
reaped_worker_ids = set()
for worker_id, expired_timers in self.get_expired_timers(now).items():
logging.info(
log.info(
f"Reaping worker_id=[{worker_id}]."
f" Expired timers: {self._get_scopes(expired_timers)}"
)
if self._reap_worker_no_throw(worker_id):
logging.info(f"Successfully reaped worker=[{worker_id}]")
log.info(f"Successfully reaped worker=[{worker_id}]")
reaped_worker_ids.add(worker_id)
else:
logging.error(
log.error(
f"Error reaping worker=[{worker_id}]. Will retry on next watchdog."
)
self.clear_timers(reaped_worker_ids)
@@ -206,7 +208,7 @@ class TimerServer(abc.ABC):
return [r.scope_id for r in timer_requests]
def start(self) -> None:
logging.info(
log.info(
f"Starting {type(self).__name__}..."
f" max_interval={self._max_interval},"
f" daemon={self._daemon}"
@@ -214,18 +216,18 @@ class TimerServer(abc.ABC):
self._watchdog_thread = threading.Thread(
target=self._watchdog_loop, daemon=self._daemon
)
logging.info("Starting watchdog thread...")
log.info("Starting watchdog thread...")
self._watchdog_thread.start()
def stop(self) -> None:
logging.info(f"Stopping {type(self).__name__}")
log.info(f"Stopping {type(self).__name__}")
self._stop_signaled = True
if self._watchdog_thread:
logging.info("Stopping watchdog thread...")
log.info("Stopping watchdog thread...")
self._watchdog_thread.join(self._max_interval)
self._watchdog_thread = None
else:
logging.info("No watchdog thread running, doing nothing")
log.info("No watchdog thread running, doing nothing")
_timer_client = None
@@ -237,7 +239,7 @@ def configure(timer_client: TimerClient):
"""
global _timer_client
_timer_client = timer_client
logging.info(f"Timer client configured to: {type(_timer_client).__name__}")
log.info(f"Timer client configured to: {type(_timer_client).__name__}")
@contextmanager

View File

@@ -19,6 +19,8 @@ from torch.distributed.elastic.timer.api import TimerClient, TimerRequest
__all__ = ["FileTimerClient", "FileTimerRequest", "FileTimerServer"]
log = logging.getLogger(__name__)
class FileTimerRequest(TimerRequest):
"""
Data object representing a countdown timer acquisition and release
@@ -175,25 +177,25 @@ class FileTimerServer:
def start(self) -> None:
logging.info(
log.info(
f"Starting {type(self).__name__}..."
f" max_interval={self._max_interval},"
f" daemon={self._daemon}"
)
self._watchdog_thread = threading.Thread(target=self._watchdog_loop, daemon=self._daemon)
logging.info("Starting watchdog thread...")
log.info("Starting watchdog thread...")
self._watchdog_thread.start()
self._log_event("watchdog started", None)
def stop(self) -> None:
logging.info(f"Stopping {type(self).__name__}")
log.info(f"Stopping {type(self).__name__}")
self._stop_signaled = True
if self._watchdog_thread:
logging.info("Stopping watchdog thread...")
log.info("Stopping watchdog thread...")
self._watchdog_thread.join(self._max_interval)
self._watchdog_thread = None
else:
logging.info("No watchdog thread running, doing nothing")
log.info("No watchdog thread running, doing nothing")
if os.path.exists(self._file_path):
os.remove(self._file_path)
self._log_event("watchdog stopped", None)
@@ -201,11 +203,11 @@ class FileTimerServer:
def run_once(self) -> None:
self._run_once = True
if self._watchdog_thread:
logging.info("Stopping watchdog thread...")
log.info("Stopping watchdog thread...")
self._watchdog_thread.join()
self._watchdog_thread = None
else:
logging.info("No watchdog thread running, doing nothing")
log.info("No watchdog thread running, doing nothing")
if os.path.exists(self._file_path):
os.remove(self._file_path)
@@ -223,7 +225,7 @@ class FileTimerServer:
if run_once:
break
except Exception as e:
logging.error("Error running watchdog", exc_info=e)
log.error("Error running watchdog", exc_info=e)
def _run_watchdog(self, fd: io.TextIOWrapper) -> None:
timer_requests = self._get_requests(fd, self._max_interval)
@@ -231,7 +233,7 @@ class FileTimerServer:
now = time.time()
reaped_worker_pids = set()
for worker_pid, expired_timers in self.get_expired_timers(now).items():
logging.info(f"Reaping worker_pid=[{worker_pid}]." f" Expired timers: {self._get_scopes(expired_timers)}")
log.info(f"Reaping worker_pid=[{worker_pid}]." f" Expired timers: {self._get_scopes(expired_timers)}")
reaped_worker_pids.add(worker_pid)
# In case we have multiple expired timers, we find the first timer
# with a valid signal (>0) in the expiration time order.
@@ -245,13 +247,13 @@ class FileTimerServer:
expired_timer = timer
break
if signal <= 0:
logging.info(f"No signal specified with worker=[{worker_pid}]. Do not reap it.")
log.info(f"No signal specified with worker=[{worker_pid}]. Do not reap it.")
continue
if self._reap_worker(worker_pid, signal):
logging.info(f"Successfully reaped worker=[{worker_pid}] with signal={signal}")
log.info(f"Successfully reaped worker=[{worker_pid}] with signal={signal}")
self._log_event("kill worker process", expired_timer)
else:
logging.error(f"Error reaping worker=[{worker_pid}]. Will retry on next watchdog.")
log.error(f"Error reaping worker=[{worker_pid}]. Will retry on next watchdog.")
self.clear_timers(reaped_worker_pids)
def _get_scopes(self, timer_requests: List[FileTimerRequest]) -> List[str]:
@@ -323,8 +325,8 @@ class FileTimerServer:
os.kill(worker_pid, signal)
return True
except ProcessLookupError:
logging.info(f"Process with pid={worker_pid} does not exist. Skipping")
log.info(f"Process with pid={worker_pid} does not exist. Skipping")
return True
except Exception as e:
logging.error(f"Error terminating pid={worker_pid}", exc_info=e)
log.error(f"Error terminating pid={worker_pid}", exc_info=e)
return False

View File

@@ -15,6 +15,8 @@ from .api import RequestQueue, TimerClient, TimerRequest, TimerServer
__all__ = ['LocalTimerClient', 'MultiprocessingRequestQueue', 'LocalTimerServer']
log = logging.getLogger(__name__)
class LocalTimerClient(TimerClient):
"""
Client side of ``LocalTimerServer``. This client is meant to be used
@@ -116,8 +118,8 @@ class LocalTimerServer(TimerServer):
os.kill(worker_id, signal.SIGKILL)
return True
except ProcessLookupError:
logging.info(f"Process with pid={worker_id} does not exist. Skipping")
log.info(f"Process with pid={worker_id} does not exist. Skipping")
return True
except Exception as e:
logging.error(f"Error terminating pid={worker_id}", exc_info=e)
log.error(f"Error terminating pid={worker_id}", exc_info=e)
return False

View File

@@ -9,6 +9,7 @@ from torch.cuda.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptSt
from torch.distributed.distributed_c10d import ProcessGroup
from torch.optim.sgd import SGD
log = logging.getLogger(__name__)
def _refresh_per_optimizer_state():
return {"stage": OptState.READY, "found_inf_per_device": {}}
@@ -158,7 +159,7 @@ class ShardedGradScaler(GradScaler):
for grad in grads:
for tensor in grad:
if tensor.device != expected_device:
logging.error(
log.error(
"tensor device is %s and expected device is %s"
% (tensor.device, expected_device)
)

View File

@@ -32,6 +32,7 @@ from torch.profiler import _utils
TensorAndID = Tuple["TensorKey", int]
log = logging.getLogger(__name__)
class Category(enum.Enum):
INPUT = enum.auto()
@@ -348,7 +349,7 @@ class SizeMap:
# the core PyTorch codebase.
if prior_size != new_size:
delta = f"{prior_size} vs. {new_size}"
logging.warn(f"Mismatch between allocation and free: {delta}")
log.warn(f"Mismatch between allocation and free: {delta}")
self._values.update(allocations)

View File

@@ -13,6 +13,7 @@ from caffe2.python import core, workspace
from typing import Set, Dict, Tuple, List
log = logging.getLogger(__name__)
def _make_unique_name(seen: Set[str], name: str, min_version: int = 0):
"""
@@ -754,7 +755,7 @@ def _try_get_shapes(nets):
shapes, _ = workspace.InferShapesAndTypes(nets)
return shapes
except Exception as e:
logging.warning("Failed to compute shapes: %s", e)
log.warning("Failed to compute shapes: %s", e)
return {}