diff --git a/.flake8 b/.flake8
index 5413776f199..636165863b0 100644
--- a/.flake8
+++ b/.flake8
@@ -16,7 +16,7 @@ ignore =
     # these ignores are from flake8-comprehensions; please fix!
     C407
     # these ignores are from flake8-logging-format; please fix!
-    G001,G002,G003,G004,G100,G101,G200,G201,G202,
+    G004,G100,G101,G200,G201,G202
     # these ignores are from flake8-simplify. please fix or ignore with commented reason
     SIM105,SIM108,SIM109,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
     # flake8-simplify code styles
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 368e44a7b36..d30f6d291f2 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -891,7 +891,7 @@ def read_batch_size_from_file(args, filename, model_name):
             if model_name == cur_name:
                 batch_size = int(b)
     if batch_size is None:
-        log.warning("Could not find batch size for {}".format(model_name))
+        log.warning("Could not find batch size for %s", model_name)
     elif batch_size == -1:
         raise RuntimeError(
             f"Batch size is unset for {model_name} in {args.batch_size_file}"
diff --git a/test/simulate_nccl_errors.py b/test/simulate_nccl_errors.py
index 91b1531a28c..6b7d3cec1bd 100644
--- a/test/simulate_nccl_errors.py
+++ b/test/simulate_nccl_errors.py
@@ -31,7 +31,7 @@ if __name__ == "__main__":
         work = process_group.allreduce(torch.rand(10).cuda(rank))
         logging.info('Waiting for allreduce to complete...')
         work.wait()
-        logging.info('Second allreduce successful: {}'.format(work.is_success()))
+        logging.info('Second allreduce successful: %s', work.is_success())
     else:
         logging.info('Aborting all other ranks.')
         os.abort()
diff --git a/torch/_dynamo/backends/distributed.py b/torch/_dynamo/backends/distributed.py
index a9d1a45389b..020302a1a64 100644
--- a/torch/_dynamo/backends/distributed.py
+++ b/torch/_dynamo/backends/distributed.py
@@ -45,9 +45,11 @@ def pretty_print_buckets(buckets: List[Bucket]):
     try:
         from tabulate import tabulate

+        # TODO: Do you really want to log.info this? It would get
+        # suppressed if log level is too low
         log.info(
-            "\nDDPOptimizer bucket assignments\n"
-            + tabulate(rows, headers=headers, tablefmt="simple_grid")
+            "\nDDPOptimizer bucket assignments\n%s",
+            tabulate(rows, headers=headers, tablefmt="simple_grid"),
         )
     except ImportError:
         log.info(
@@ -318,9 +320,7 @@ class DDPOptimizer:
                 else:
                     curr_submod = real_mod

-                log.debug(
-                    f"\n---{n.target} graph---\n" + str(curr_submod.graph)
-                )
+                log.debug(f"\n---{n.target} graph---\n{curr_submod.graph}")

                 # When calling the compiler on the submod, inputs (new_args) are expected to
                 # be FakeTensors already since Dynamo would have made them FakeTensors in the
@@ -348,5 +348,5 @@ class DDPOptimizer:
         submod_compiler.run(*example_inputs)
         split_gm.recompile()

-        log.debug("\n---final graph---\n" + str(split_gm.graph) + "\n---------------\n")
+        log.debug(f"\n---final graph---\n{split_gm.graph}\n---------------\n")
         return split_gm
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index fd0ab5c3f89..321e541a7fa 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -262,9 +262,9 @@ def convert_frame_assert(
             assert code in guard_failures, "TODO(whc) any other recompile reasons?"
             log.warning(
                 f"torch._dynamo hit config.cache_size_limit ({config.cache_size_limit})\n"
-                + f" function: {format_func_info(code)}\n"
-                + f" reasons: {format_guard_failures(code)}\n"
-                + f"to diagnose recompilation issues, see {troubleshooting_url}."
+                f" function: {format_func_info(code)}\n"
+                f" reasons: {format_guard_failures(code)}\n"
+                f"to diagnose recompilation issues, see {troubleshooting_url}."
             )
             unimplemented("cache_size_limit reached")
diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py
index 88c6de1656e..38ffd19d561 100644
--- a/torch/backends/xeon/run_cpu.py
+++ b/torch/backends/xeon/run_cpu.py
@@ -307,7 +307,7 @@ or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib6
             find_tc = self.add_lib_preload(lib_type="tcmalloc")
             if not find_tc:
                 msg = f"{self.msg_lib_notfound} you can use \"conda install -c conda-forge gperftools\" to install {{0}}"
-                logger.warning(msg.format("TCmalloc", "tcmalloc"))
+                logger.warning(msg.format("TCmalloc", "tcmalloc"))  # noqa: G001
             else:
                 logger.info("Use TCMalloc memory allocator")

@@ -315,7 +315,7 @@ or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib6
             find_je = self.add_lib_preload(lib_type="jemalloc")
             if not find_je:
                 msg = f"{self.msg_lib_notfound} you can use \"conda install -c conda-forge jemalloc\" to install {{0}}"
-                logger.warning(msg.format("Jemalloc", "jemalloc"))
+                logger.warning(msg.format("Jemalloc", "jemalloc"))  # noqa: G001
             else:
                 logger.info("Use JeMalloc memory allocator")
                 self.set_env("MALLOC_CONF", "oversize_threshold:1,background_thread:true,metadata_thp:auto")
@@ -371,7 +371,7 @@ Value applied: {os.environ[env_name]}. Value ignored: {env_value}")
             find_iomp = self.add_lib_preload(lib_type="iomp5")
             if not find_iomp:
                 msg = f"{self.msg_lib_notfound} you can use \"conda install mkl\" to install {{0}}"
-                logger.warning(msg.format("iomp", "iomp5"))
+                logger.warning(msg.format("iomp", "iomp5"))  # noqa: G001
             else:
                 logger.info("Using Intel OpenMP")
                 if set_kmp_affinity:
@@ -429,9 +429,9 @@ please make sure ninstances <= total_cores)")
             num_leftover_cores = ncore_per_node % args.ncores_per_instance
             if args.ncores_per_instance > ncore_per_node:
                 # too many ncores_per_instance to skip cross-node cores
-                logger.warning("there are {} core(s) per socket, but you specify {} ncores_per_instance and \
-skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
-socket".format(ncore_per_node, args.ncores_per_instance))
+                logger.warning("there are %s core(s) per socket, but you specify %s ncores_per_instance and \
+skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
+socket", ncore_per_node, args.ncores_per_instance)
                 exit(-1)
             elif num_leftover_cores == 0:
                 # aren't any cross-node cores
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
index 36eeb85c599..71d0c0ccaf0 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
@@ -36,7 +36,7 @@ class PostLocalSGDState:
         post_local_gradient_allreduce=True,
     ):
         logger.info(
-            "Local SGD will be started after {} iterations".format(start_localSGD_iter)
+            "Local SGD will be started after %s iterations", start_localSGD_iter
         )

         # The group used for all-reducing gradients globally.
@@ -58,7 +58,7 @@ class PostLocalSGDState:

         if self.iter == self.start_localSGD_iter:
             logger.info(
-                "Start to apply local SGD after {} iterations.".format(self.iter)
+                "Start to apply local SGD after %s iterations.", self.iter
             )
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index 0b75fe3b38b..adc9f54b0c6 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -106,8 +106,8 @@ def _report_compression_stats(bucket, state):
     ):
         stats = state.compression_stats()
         logger.info(
-            "Compression stats: iter {}, total before compression {}, total after compression {}, "
-            "rate {}".format(state.iter, stats[1], stats[2], stats[0])
+            "Compression stats: iter %s, total before compression %s, total after compression %s, "
+            "rate %s", state.iter, stats[1], stats[2], stats[0]
         )
         state.next_stats_report = state.iter + state.compression_stats_logging_frequency

@@ -183,19 +183,18 @@ class PowerSGDState:
         batch_tensors_with_same_shape: bool = False,
     ):
         logger.info(
-            "PowerSGD config: matrix_approximation_rank = {}; start_powerSGD_iter = {}; "
-            "min_compression_rate = {}; orthogonalization_epsilon = {}; use_error_feedback = {}; warm_start = {}; "
-            "random_seed = {}; compression_stats_logging_frequency = {}; batch_tensors_with_same_shape = {}".format(
-                matrix_approximation_rank,
-                start_powerSGD_iter,
-                min_compression_rate,
-                orthogonalization_epsilon,
-                use_error_feedback,
-                warm_start,
-                random_seed,
-                compression_stats_logging_frequency,
-                batch_tensors_with_same_shape,
-            )
+            "PowerSGD config: matrix_approximation_rank = %s; start_powerSGD_iter = %s; "
+            "min_compression_rate = %s; orthogonalization_epsilon = %s; use_error_feedback = %s; warm_start = %s; "
+            "random_seed = %s; compression_stats_logging_frequency = %s; batch_tensors_with_same_shape = %s",
+            matrix_approximation_rank,
+            start_powerSGD_iter,
+            min_compression_rate,
+            orthogonalization_epsilon,
+            use_error_feedback,
+            warm_start,
+            random_seed,
+            compression_stats_logging_frequency,
+            batch_tensors_with_same_shape,
         )

         self.process_group = process_group
@@ -300,7 +299,7 @@ class PowerSGDState:

         if self.iter == self.start_powerSGD_iter:
             logger.info(
-                "Start to apply PowerSGD after {} iterations.".format(self.iter)
+                "Start to apply PowerSGD after %s iterations.", self.iter
             )

     def compression_stats(self):
@@ -409,9 +408,8 @@ def powerSGD_hook(
             input_tensor.add_(state.error_dict[bucket_index])
         else:
             logger.info(
-                "A zero tensor of length {} that represents local error is created.".format(
-                    total_length
-                )
+                "A zero tensor of length %s that represents local error is created.",
+                total_length
             )
             state.error_dict[bucket_index] = torch.zeros(
                 total_length, device=device, dtype=dtype
@@ -468,9 +466,8 @@ def powerSGD_hook(
         # Only log this if warm-start to avoid spamming.
         if state.warm_start:
             logger.info(
-                "Allocating contiguous memory of length {} for Ps, and of length {} for Qs, respectively.".format(
-                    total_Ps_size, total_Qs_size
-                )
+                "Allocating contiguous memory of length %s for Ps, and of length %s for Qs, respectively.",
+                total_Ps_size, total_Qs_size
             )
         state.p_memory_dict[bucket_index] = torch.empty(
             total_Ps_size, device=device, dtype=dtype
@@ -728,9 +725,8 @@ def batched_powerSGD_hook(
             input_tensor.add_(state.error_dict[bucket_index])
         else:
             logger.info(
-                "A zero tensor of length {} that represents local error is created.".format(
-                    padded_total_length
-                )
+                "A zero tensor of length %s that represents local error is created.",
+                padded_total_length
             )
             state.error_dict[bucket_index] = torch.zeros(
                 padded_total_length, device=device, dtype=input_tensor.dtype
@@ -749,9 +745,8 @@ def batched_powerSGD_hook(
     # Only log this if warm-start to avoid spamming.
    if state.warm_start:
        logger.info(
-            "Initializing low-rank tensors P and Q, each of which has a shape of {} x {}.".format(
-                square_side_length, state.matrix_approximation_rank
-            )
+            "Initializing low-rank tensors P and Q, each of which has a shape of %s x %s.",
+            square_side_length, state.matrix_approximation_rank
        )

    def create_low_rank_tensor(fill_random_values, rng):
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 7c48d45f4f1..765ea56784f 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -476,7 +476,7 @@ def _store_based_barrier(rank, store, timeout):
     """
     store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _world.group_count)
     store.add(store_key, 1)
-    logger.info("Added key: {} to store for rank: {}".format(store_key, rank))
+    logger.info("Added key: %s to store for rank: %s", store_key, rank)

     # Now wait for all workers to check in with the store.
     world_size = get_world_size()
@@ -496,9 +496,8 @@ def _store_based_barrier(rank, store, timeout):
        if timedelta(seconds=(time.time() - log_time)) > timedelta(seconds=10):
            logger.info(
                "Waiting in store based barrier to initialize process group for "
-                "rank: {}, key: {} (world_size={}, worker_count={}, timeout={})".format(
-                    rank, store_key, world_size, worker_count, timeout
-                )
+                "rank: %s, key: %s (world_size=%s, worker_count=%s, timeout=%s)",
+                rank, store_key, world_size, worker_count, timeout
            )
            log_time = time.time()
@@ -3716,7 +3715,8 @@ def new_subgroups(
         if rank in ranks_in_subgroup:
             cur_subgroup = subgroup
             logger.info(
-                "Rank {} is assigned to subgroup {}".format(rank, ranks_in_subgroup)
+                "Rank %s is assigned to subgroup %s",
+                rank, ranks_in_subgroup
             )

     return cur_subgroup, subgroups
@@ -3828,7 +3828,7 @@ def new_subgroups_by_enumeration(
             rank_to_ranks_dict[rank] = ranks
             if my_rank == rank:
                 cur_subgroup = subgroup
-                logger.info("Rank {} is assigned to subgroup {}".format(rank, ranks))
+                logger.info("Rank %s is assigned to subgroup %s", rank, ranks)

     return cur_subgroup, subgroups
diff --git a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
index 4bcc3ad9048..55603fe266b 100644
--- a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -211,7 +211,7 @@ class EtcdRendezvous:
         last_call_timeout,
     ):
         self.client = client
-        log.info("Etcd machines: " + str(self.client.machines))
+        log.info("Etcd machines: %s", self.client.machines)

         self._prefix = prefix
         self._run_id = run_id
@@ -310,7 +310,7 @@ class EtcdRendezvous:
                 # to avoid spamming etcd
                 # FIXME: there are a few things that fall under this like
                 # etcd.EtcdKeyNotFound, etc, which could be handled more explicitly.
-                log.info("Rendezvous attempt failed, will retry. Reason: " + str(e))
+                log.info("Rendezvous attempt failed, will retry. Reason: %s", e)
                 time.sleep(1)

     def init_phase(self):
@@ -335,12 +335,12 @@ class EtcdRendezvous:
         try:
             active_version = self.try_create_rendezvous()
             state = json.loads(active_version.value)
-            log.info("New rendezvous state created: " + str(state))
+            log.info("New rendezvous state created: %s", state)
         except etcd.EtcdAlreadyExist:
             active_version, state = self.get_rdzv_state()
             # Note: it is possible for above query to fail (etcd.EtcdKeyNotFound),
             # but this is ok for us - just means we'll restart from beginning.
-            log.info("Observed existing rendezvous state: " + str(state))
+            log.info("Observed existing rendezvous state: %s", state)

         if state["status"] == "closed":
             raise RendezvousClosedError()
@@ -365,9 +365,8 @@ class EtcdRendezvous:
         active_version, this_rank = self.join_rendezvous(expected_version)
         state = json.loads(active_version.value)
         log.info(
-            "Joined rendezvous version {} as rank {}. Full state: {}".format(
-                state["version"], this_rank, state
-            )
+            "Joined rendezvous version %s as rank %s. Full state: %s",
+            state["version"], this_rank, state
         )

         # If this worker was first to reach num_min_workers requirement,
@@ -380,10 +379,10 @@ class EtcdRendezvous:
         # when min_num_workers is reached.
         if this_rank == self._num_min_workers - 1 and state["status"] == "joinable":
-            log.info("Rank {} is responsible for join last call.".format(this_rank))
+            log.info("Rank %s is responsible for join last call.", this_rank)
             last_call_deadline = time.time() + self._last_call_timeout
             self.handle_join_last_call(expected_version, last_call_deadline)
-            log.info("Rank {} finished join last call.".format(this_rank))
+            log.info("Rank %s finished join last call.", this_rank)

         # Wait for rendezvous state to be frozen, which means a fixed set of peers
         log.info("Waiting for remaining peers.")
@@ -412,9 +411,8 @@ class EtcdRendezvous:
         state = json.loads(active_version.value)
         log.info(
-            "Rendezvous version {} is complete. Final state: {}".format(
-                state["version"], state
-            )
+            "Rendezvous version %s is complete. Final state: %s",
+            state["version"], state
         )

         # Rendezvous version number; our rank in it; world size
@@ -433,9 +431,8 @@ class EtcdRendezvous:
         # 2. if keep alives are missing, destroy it and bail out.
         active_state = self.announce_self_waiting(expected_version)
         log.info(
-            "Added self to waiting list. Rendezvous full state: {}".format(
-                active_state.value
-            )
+            "Added self to waiting list. Rendezvous full state: %s",
+            active_state.value
         )

         self.wait_for_rendezvous_to_free(expected_version)
@@ -698,9 +695,10 @@ class EtcdRendezvous:
                    if key not in keep_alive_keys:
                        # This participant didn't renew their lease. We'll declare this
                        # rendezvous version as dead (but only if it hadn't changed)
-                        log.info("Keep-alive key {} is not renewed.".format(key))
+                        log.info("Keep-alive key %s is not renewed.", key)
                        log.info(
-                            "Rendevous version {} is incomplete. ".format(expected_version)
+                            "Rendevous version %s is incomplete. ",
+                            expected_version
                        )
                        log.info("Attempting to destroy it.")
@@ -713,9 +711,8 @@ class EtcdRendezvous:
                        )

                        log.info(
-                            "Destroyed rendezvous version {} successfully.".format(
-                                expected_version
-                            )
+                            "Destroyed rendezvous version %s successfully.",
+                            expected_version
                        )

                        # We can return (and retry) immediately
diff --git a/torch/distributed/fsdp/sharded_grad_scaler.py b/torch/distributed/fsdp/sharded_grad_scaler.py
index edf0efcd602..342ec29088e 100644
--- a/torch/distributed/fsdp/sharded_grad_scaler.py
+++ b/torch/distributed/fsdp/sharded_grad_scaler.py
@@ -161,8 +161,9 @@ class ShardedGradScaler(GradScaler):
            for tensor in grad:
                if tensor.device != expected_device:
                    log.error(
-                        "tensor device is %s and expected device is %s"
-                        % (tensor.device, expected_device)
+                        "tensor device is %s and expected device is %s",
+                        tensor.device,
+                        expected_device,
                    )
                    raise ValueError("Gradients must be on the same device.")
diff --git a/torch/distributed/nn/jit/instantiator.py b/torch/distributed/nn/jit/instantiator.py
index 7b78ee085a6..d66b651869f 100644
--- a/torch/distributed/nn/jit/instantiator.py
+++ b/torch/distributed/nn/jit/instantiator.py
@@ -73,10 +73,10 @@ def _write(out_path, text):
     old_text = None
     if old_text != text:
         with open(out_path, "w") as f:
-            logger.info("Writing {}".format(out_path))
+            logger.info("Writing %s", out_path)
             f.write(text)
     else:
-        logger.info("Skipped writing {}".format(out_path))
+        logger.info("Skipped writing %s", out_path)


 def _do_instantiate_remote_module_template(
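Note on the pattern applied throughout this patch: flake8-logging-format's G001-G003 flag logging calls that build the message eagerly (str.format, %-interpolation, or string concatenation). Passing a %s-style format string plus separate arguments lets the logging module defer interpolation until a handler actually emits the record, so calls at disabled levels cost almost nothing and large objects are never stringified unnecessarily. A minimal standalone sketch of the before/after shape (the logger and values below are illustrative, not taken from the patch):

import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

model_name = "resnet50"  # illustrative value only

# Eager: the message is built even though INFO is disabled (flagged as G001).
logger.info("Could not find batch size for {}".format(model_name))

# Lazy: the format string and argument are attached to the LogRecord and only
# interpolated if a handler actually emits the record.
logger.info("Could not find batch size for %s", model_name)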