diff --git a/.flake8 b/.flake8
index 5413776f199..636165863b0 100644
--- a/.flake8
+++ b/.flake8
@@ -16,7 +16,7 @@ ignore =
     # these ignores are from flake8-comprehensions; please fix!
     C407
     # these ignores are from flake8-logging-format; please fix!
-    G001,G002,G003,G004,G100,G101,G200,G201,G202,
+    G004,G100,G101,G200,G201,G202
     # these ignores are from flake8-simplify. please fix or ignore with commented reason
     SIM105,SIM108,SIM109,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
     # flake8-simplify code styles
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 368e44a7b36..d30f6d291f2 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -891,7 +891,7 @@ def read_batch_size_from_file(args, filename, model_name):
             if model_name == cur_name:
                 batch_size = int(b)
     if batch_size is None:
-        log.warning("Could not find batch size for {}".format(model_name))
+        log.warning("Could not find batch size for %s", model_name)
     elif batch_size == -1:
         raise RuntimeError(
             f"Batch size is unset for {model_name} in {args.batch_size_file}"
diff --git a/test/simulate_nccl_errors.py b/test/simulate_nccl_errors.py
index 91b1531a28c..6b7d3cec1bd 100644
--- a/test/simulate_nccl_errors.py
+++ b/test/simulate_nccl_errors.py
@@ -31,7 +31,7 @@ if __name__ == "__main__":
         work = process_group.allreduce(torch.rand(10).cuda(rank))
         logging.info('Waiting for allreduce to complete...')
         work.wait()
-        logging.info('Second allreduce successful: {}'.format(work.is_success()))
+        logging.info('Second allreduce successful: %s', work.is_success())
     else:
         logging.info('Aborting all other ranks.')
         os.abort()
diff --git a/torch/_dynamo/backends/distributed.py b/torch/_dynamo/backends/distributed.py
index a9d1a45389b..020302a1a64 100644
--- a/torch/_dynamo/backends/distributed.py
+++ b/torch/_dynamo/backends/distributed.py
@@ -45,9 +45,11 @@ def pretty_print_buckets(buckets: List[Bucket]):
     try:
         from tabulate import tabulate

+        # TODO: Do you really want to log.info this? It would get
+        # suppressed if log level is too low
         log.info(
-            "\nDDPOptimizer bucket assignments\n"
-            + tabulate(rows, headers=headers, tablefmt="simple_grid")
+            "\nDDPOptimizer bucket assignments\n%s",
+            tabulate(rows, headers=headers, tablefmt="simple_grid"),
         )
     except ImportError:
         log.info(
@@ -318,9 +320,7 @@ class DDPOptimizer:
                 else:
                     curr_submod = real_mod

-                log.debug(
-                    f"\n---{n.target} graph---\n" + str(curr_submod.graph)
-                )
+                log.debug(f"\n---{n.target} graph---\n{curr_submod.graph}")

                 # When calling the compiler on the submod, inputs (new_args) are expected to
                 # be FakeTensors already since Dynamo would have made them FakeTensors in the
@@ -348,5 +348,5 @@ class DDPOptimizer:
         submod_compiler.run(*example_inputs)
         split_gm.recompile()

-        log.debug("\n---final graph---\n" + str(split_gm.graph) + "\n---------------\n")
+        log.debug(f"\n---final graph---\n{split_gm.graph}\n---------------\n")
         return split_gm
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index fd0ab5c3f89..321e541a7fa 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -262,9 +262,9 @@ def convert_frame_assert(
             assert code in guard_failures, "TODO(whc) any other recompile reasons?"
             log.warning(
                 f"torch._dynamo hit config.cache_size_limit ({config.cache_size_limit})\n"
-                + f" function: {format_func_info(code)}\n"
-                + f" reasons: {format_guard_failures(code)}\n"
-                + f"to diagnose recompilation issues, see {troubleshooting_url}."
+                f" function: {format_func_info(code)}\n"
+                f" reasons: {format_guard_failures(code)}\n"
+                f"to diagnose recompilation issues, see {troubleshooting_url}."
             )
             unimplemented("cache_size_limit reached")
diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py
index 88c6de1656e..38ffd19d561 100644
--- a/torch/backends/xeon/run_cpu.py
+++ b/torch/backends/xeon/run_cpu.py
@@ -307,7 +307,7 @@ or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib6
             find_tc = self.add_lib_preload(lib_type="tcmalloc")
             if not find_tc:
                 msg = f"{self.msg_lib_notfound} you can use \"conda install -c conda-forge gperftools\" to install {{0}}"
-                logger.warning(msg.format("TCmalloc", "tcmalloc"))
+                logger.warning(msg.format("TCmalloc", "tcmalloc"))  # noqa: G001
             else:
                 logger.info("Use TCMalloc memory allocator")

@@ -315,7 +315,7 @@ or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib6
             find_je = self.add_lib_preload(lib_type="jemalloc")
             if not find_je:
                 msg = f"{self.msg_lib_notfound} you can use \"conda install -c conda-forge jemalloc\" to install {{0}}"
-                logger.warning(msg.format("Jemalloc", "jemalloc"))
+                logger.warning(msg.format("Jemalloc", "jemalloc"))  # noqa: G001
             else:
                 logger.info("Use JeMalloc memory allocator")
                 self.set_env("MALLOC_CONF", "oversize_threshold:1,background_thread:true,metadata_thp:auto")
@@ -371,7 +371,7 @@ Value applied: {os.environ[env_name]}. Value ignored: {env_value}")
             find_iomp = self.add_lib_preload(lib_type="iomp5")
             if not find_iomp:
                 msg = f"{self.msg_lib_notfound} you can use \"conda install mkl\" to install {{0}}"
-                logger.warning(msg.format("iomp", "iomp5"))
+                logger.warning(msg.format("iomp", "iomp5"))  # noqa: G001
             else:
                 logger.info("Using Intel OpenMP")
                 if set_kmp_affinity:
@@ -429,9 +429,9 @@ please make sure ninstances <= total_cores)")
             num_leftover_cores = ncore_per_node % args.ncores_per_instance
             if args.ncores_per_instance > ncore_per_node:
                 # too many ncores_per_instance to skip cross-node cores
-                logger.warning("there are {} core(s) per socket, but you specify {} ncores_per_instance and \
-skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
-socket".format(ncore_per_node, args.ncores_per_instance))
+                logger.warning("there are %s core(s) per socket, but you specify %s ncores_per_instance and \
+skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
+socket", ncore_per_node, args.ncores_per_instance)
                 exit(-1)
             elif num_leftover_cores == 0:
                 # aren't any cross-node cores
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
index 36eeb85c599..71d0c0ccaf0 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
@@ -36,7 +36,7 @@ class PostLocalSGDState:
         post_local_gradient_allreduce=True,
     ):
         logger.info(
-            "Local SGD will be started after {} iterations".format(start_localSGD_iter)
+            "Local SGD will be started after %s iterations", start_localSGD_iter
         )

         # The group used for all-reducing gradients globally.
@@ -58,7 +58,7 @@ class PostLocalSGDState:

         if self.iter == self.start_localSGD_iter:
             logger.info(
-                "Start to apply local SGD after {} iterations.".format(self.iter)
+                "Start to apply local SGD after %s iterations.", self.iter
             )
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index 0b75fe3b38b..adc9f54b0c6 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -106,8 +106,8 @@ def _report_compression_stats(bucket, state):
     ):
         stats = state.compression_stats()
         logger.info(
-            "Compression stats: iter {}, total before compression {}, total after compression {}, "
-            "rate {}".format(state.iter, stats[1], stats[2], stats[0])
+            "Compression stats: iter %s, total before compression %s, total after compression %s, "
+            "rate %s", state.iter, stats[1], stats[2], stats[0]
         )
         state.next_stats_report = state.iter + state.compression_stats_logging_frequency

@@ -183,19 +183,18 @@ class PowerSGDState:
         batch_tensors_with_same_shape: bool = False,
     ):
         logger.info(
-            "PowerSGD config: matrix_approximation_rank = {}; start_powerSGD_iter = {}; "
-            "min_compression_rate = {}; orthogonalization_epsilon = {}; use_error_feedback = {}; warm_start = {}; "
-            "random_seed = {}; compression_stats_logging_frequency = {}; batch_tensors_with_same_shape = {}".format(
-                matrix_approximation_rank,
-                start_powerSGD_iter,
-                min_compression_rate,
-                orthogonalization_epsilon,
-                use_error_feedback,
-                warm_start,
-                random_seed,
-                compression_stats_logging_frequency,
-                batch_tensors_with_same_shape,
-            )
+            "PowerSGD config: matrix_approximation_rank = %s; start_powerSGD_iter = %s; "
+            "min_compression_rate = %s; orthogonalization_epsilon = %s; use_error_feedback = %s; warm_start = %s; "
+            "random_seed = %s; compression_stats_logging_frequency = %s; batch_tensors_with_same_shape = %s",
+            matrix_approximation_rank,
+            start_powerSGD_iter,
+            min_compression_rate,
+            orthogonalization_epsilon,
+            use_error_feedback,
+            warm_start,
+            random_seed,
+            compression_stats_logging_frequency,
+            batch_tensors_with_same_shape,
         )

         self.process_group = process_group
@@ -300,7 +299,7 @@ class PowerSGDState:

         if self.iter == self.start_powerSGD_iter:
             logger.info(
-                "Start to apply PowerSGD after {} iterations.".format(self.iter)
+                "Start to apply PowerSGD after %s iterations.", self.iter
             )

     def compression_stats(self):
@@ -409,9 +408,8 @@ def powerSGD_hook(
             input_tensor.add_(state.error_dict[bucket_index])
         else:
             logger.info(
-                "A zero tensor of length {} that represents local error is created.".format(
-                    total_length
-                )
+                "A zero tensor of length %s that represents local error is created.",
+                total_length
             )
             state.error_dict[bucket_index] = torch.zeros(
                 total_length, device=device, dtype=dtype
@@ -468,9 +466,8 @@ def powerSGD_hook(
         # Only log this if warm-start to avoid spamming.
         if state.warm_start:
             logger.info(
-                "Allocating contiguous memory of length {} for Ps, and of length {} for Qs, respectively.".format(
-                    total_Ps_size, total_Qs_size
-                )
+                "Allocating contiguous memory of length %s for Ps, and of length %s for Qs, respectively.",
+                total_Ps_size, total_Qs_size
             )
         state.p_memory_dict[bucket_index] = torch.empty(
             total_Ps_size, device=device, dtype=dtype
@@ -728,9 +725,8 @@ def batched_powerSGD_hook(
             input_tensor.add_(state.error_dict[bucket_index])
         else:
             logger.info(
-                "A zero tensor of length {} that represents local error is created.".format(
-                    padded_total_length
-                )
+                "A zero tensor of length %s that represents local error is created.",
+                padded_total_length
             )
             state.error_dict[bucket_index] = torch.zeros(
                 padded_total_length, device=device, dtype=input_tensor.dtype
@@ -749,9 +745,8 @@ def batched_powerSGD_hook(
     # Only log this if warm-start to avoid spamming.
    if state.warm_start:
        logger.info(
-            "Initializing low-rank tensors P and Q, each of which has a shape of {} x {}.".format(
-                square_side_length, state.matrix_approximation_rank
-            )
+            "Initializing low-rank tensors P and Q, each of which has a shape of %s x %s.",
+            square_side_length, state.matrix_approximation_rank
        )

    def create_low_rank_tensor(fill_random_values, rng):
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 7c48d45f4f1..765ea56784f 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -476,7 +476,7 @@ def _store_based_barrier(rank, store, timeout):
     """
     store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _world.group_count)
     store.add(store_key, 1)
-    logger.info("Added key: {} to store for rank: {}".format(store_key, rank))
+    logger.info("Added key: %s to store for rank: %s", store_key, rank)

     # Now wait for all workers to check in with the store.
     world_size = get_world_size()
@@ -496,9 +496,8 @@ def _store_based_barrier(rank, store, timeout):
        if timedelta(seconds=(time.time() - log_time)) > timedelta(seconds=10):
            logger.info(
                "Waiting in store based barrier to initialize process group for "
-                "rank: {}, key: {} (world_size={}, worker_count={}, timeout={})".format(
-                    rank, store_key, world_size, worker_count, timeout
-                )
+                "rank: %s, key: %s (world_size=%s, worker_count=%s, timeout=%s)",
+                rank, store_key, world_size, worker_count, timeout
            )
            log_time = time.time()
@@ -3716,7 +3715,8 @@ def new_subgroups(
         if rank in ranks_in_subgroup:
             cur_subgroup = subgroup
             logger.info(
-                "Rank {} is assigned to subgroup {}".format(rank, ranks_in_subgroup)
+                "Rank %s is assigned to subgroup %s",
+                rank, ranks_in_subgroup
             )

     return cur_subgroup, subgroups
@@ -3828,7 +3828,7 @@ def new_subgroups_by_enumeration(
             rank_to_ranks_dict[rank] = ranks
             if my_rank == rank:
                 cur_subgroup = subgroup
-                logger.info("Rank {} is assigned to subgroup {}".format(rank, ranks))
+                logger.info("Rank %s is assigned to subgroup %s", rank, ranks)

     return cur_subgroup, subgroups
diff --git a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
index 4bcc3ad9048..55603fe266b 100644
--- a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -211,7 +211,7 @@ class EtcdRendezvous:
         last_call_timeout,
     ):
         self.client = client
-        log.info("Etcd machines: " + str(self.client.machines))
+        log.info("Etcd machines: %s", self.client.machines)

         self._prefix = prefix
         self._run_id = run_id
@@ -310,7 +310,7 @@ class EtcdRendezvous:
                 # to avoid spamming etcd
                 # FIXME: there are a few things that fall under this like
                 # etcd.EtcdKeyNotFound, etc, which could be handled more explicitly.
-                log.info("Rendezvous attempt failed, will retry. Reason: " + str(e))
+                log.info("Rendezvous attempt failed, will retry. Reason: %s", e)
                 time.sleep(1)

     def init_phase(self):
@@ -335,12 +335,12 @@ class EtcdRendezvous:
         try:
             active_version = self.try_create_rendezvous()
             state = json.loads(active_version.value)
-            log.info("New rendezvous state created: " + str(state))
+            log.info("New rendezvous state created: %s", state)
         except etcd.EtcdAlreadyExist:
             active_version, state = self.get_rdzv_state()
             # Note: it is possible for above query to fail (etcd.EtcdKeyNotFound),
             # but this is ok for us - just means we'll restart from beginning.
-            log.info("Observed existing rendezvous state: " + str(state))
+            log.info("Observed existing rendezvous state: %s", state)

         if state["status"] == "closed":
             raise RendezvousClosedError()
@@ -365,9 +365,8 @@ class EtcdRendezvous:
         active_version, this_rank = self.join_rendezvous(expected_version)
         state = json.loads(active_version.value)
         log.info(
-            "Joined rendezvous version {} as rank {}. Full state: {}".format(
-                state["version"], this_rank, state
-            )
+            "Joined rendezvous version %s as rank %s. Full state: %s",
+            state["version"], this_rank, state
         )

         # If this worker was first to reach num_min_workers requirement,
@@ -380,10 +379,10 @@ class EtcdRendezvous:
         # when min_num_workers is reached.
         if this_rank == self._num_min_workers - 1 and state["status"] == "joinable":
-            log.info("Rank {} is responsible for join last call.".format(this_rank))
+            log.info("Rank %s is responsible for join last call.", this_rank)
             last_call_deadline = time.time() + self._last_call_timeout
             self.handle_join_last_call(expected_version, last_call_deadline)
-            log.info("Rank {} finished join last call.".format(this_rank))
+            log.info("Rank %s finished join last call.", this_rank)

         # Wait for rendezvous state to be frozen, which means a fixed set of peers
         log.info("Waiting for remaining peers.")
@@ -412,9 +411,8 @@ class EtcdRendezvous:
         state = json.loads(active_version.value)
         log.info(
-            "Rendezvous version {} is complete. Final state: {}".format(
-                state["version"], state
-            )
+            "Rendezvous version %s is complete. Final state: %s",
+            state["version"], state
         )

         # Rendezvous version number; our rank in it; world size
@@ -433,9 +431,8 @@ class EtcdRendezvous:
         # 2. if keep alives are missing, destroy it and bail out.
         active_state = self.announce_self_waiting(expected_version)
         log.info(
-            "Added self to waiting list. Rendezvous full state: {}".format(
-                active_state.value
-            )
+            "Added self to waiting list. Rendezvous full state: %s",
+            active_state.value
         )

         self.wait_for_rendezvous_to_free(expected_version)
@@ -698,9 +695,10 @@ class EtcdRendezvous:
                    if key not in keep_alive_keys:
                        # This participant didn't renew their lease. We'll declare this
                        # rendezvous version as dead (but only if it hadn't changed)
-                        log.info("Keep-alive key {} is not renewed.".format(key))
+                        log.info("Keep-alive key %s is not renewed.", key)
                        log.info(
-                            "Rendevous version {} is incomplete. ".format(expected_version)
+                            "Rendevous version %s is incomplete. ",
+                            expected_version
                        )
                        log.info("Attempting to destroy it.")
@@ -713,9 +711,8 @@ class EtcdRendezvous:
                        )

                        log.info(
-                            "Destroyed rendezvous version {} successfully.".format(
-                                expected_version
-                            )
+                            "Destroyed rendezvous version %s successfully.",
+                            expected_version
                        )

                        # We can return (and retry) immediately
diff --git a/torch/distributed/fsdp/sharded_grad_scaler.py b/torch/distributed/fsdp/sharded_grad_scaler.py
index edf0efcd602..342ec29088e 100644
--- a/torch/distributed/fsdp/sharded_grad_scaler.py
+++ b/torch/distributed/fsdp/sharded_grad_scaler.py
@@ -161,8 +161,9 @@ class ShardedGradScaler(GradScaler):
            for tensor in grad:
                if tensor.device != expected_device:
                    log.error(
-                        "tensor device is %s and expected device is %s"
-                        % (tensor.device, expected_device)
+                        "tensor device is %s and expected device is %s",
+                        tensor.device,
+                        expected_device,
                    )
                    raise ValueError("Gradients must be on the same device.")
diff --git a/torch/distributed/nn/jit/instantiator.py b/torch/distributed/nn/jit/instantiator.py
index 7b78ee085a6..d66b651869f 100644
--- a/torch/distributed/nn/jit/instantiator.py
+++ b/torch/distributed/nn/jit/instantiator.py
@@ -73,10 +73,10 @@ def _write(out_path, text):
     old_text = None
     if old_text != text:
         with open(out_path, "w") as f:
-            logger.info("Writing {}".format(out_path))
+            logger.info("Writing %s", out_path)
             f.write(text)
     else:
-        logger.info("Skipped writing {}".format(out_path))
+        logger.info("Skipped writing %s", out_path)


 def _do_instantiate_remote_module_template(
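Note on the pattern applied throughout this patch: flake8-logging-format's G001-G003 flag logging calls that build the message eagerly (str.format, %-interpolation, or string concatenation). Passing a %s-style format string plus separate arguments lets the logging module defer interpolation until a handler actually emits the record, so calls at disabled levels cost almost nothing and large objects are never stringified unnecessarily. A minimal standalone sketch of the before/after shape (the logger and values below are illustrative, not taken from the patch):

import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

model_name = "resnet50"  # illustrative value only

# Eager: the message is built even though INFO is disabled (flagged as G001).
logger.info("Could not find batch size for {}".format(model_name))

# Lazy: the format string and argument are attached to the LogRecord and only
# interpolated if a handler actually emits the record.
logger.info("Could not find batch size for %s", model_name)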