Fix unused Python variables in test/[a-d]* (#134665)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134665
Approved by: https://github.com/albanD
Tom Ritchford, 2024-12-13 18:35:20 +00:00, committed by PyTorch MergeBot
parent e19f493f02
commit d25e6e623f
120 changed files with 410 additions and 522 deletions
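The hunks below apply a handful of recurring mechanical fixes: loop indices and unpacked tuple elements that are never read are renamed to `_`, bindings whose values are never used are dropped while the call is kept for its side effects, and a few intentionally kept assignments gain `# noqa: F841` suppressions. A minimal sketch of the pattern, using placeholder names that are not taken from the test suite:

# Sketch of the recurring patterns in this diff (placeholder names, not from the tests).

# Before: the loop index, an unpacked element, and a local binding are all unused.
def before(model, optimizer, batch):
    for iter_idx in range(10):
        optimizer.zero_grad()
        loss = model(batch).sum()
        loss.backward()
        optimizer.step()
    rows, cols = batch.size()   # only `cols` is read below
    output = model(batch)       # F841: assigned but never used
    return cols

# After: rename unused names to `_` and keep calls only for their side effects.
def after(model, optimizer, batch):
    for _ in range(10):
        optimizer.zero_grad()
        loss = model(batch).sum()
        loss.backward()
        optimizer.step()
    _, cols = batch.size()
    model(batch)                # call kept, dead binding dropped
    return cols

Locally, similar findings can be reproduced with a linter run such as `ruff check --select F841 test/` (assuming Ruff is installed); the `# noqa: F841` and `# ruff: noqa: F841` comments added in a few hunks suppress the rule where an assignment is kept on purpose.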

View File

@ -147,7 +147,6 @@ def _sparse_layer_test_helper(
W_zp = 0
X_fp32 = torch.randn(batch_size, input_channels, dtype=torch.float32)
float_bias = torch.randn(output_channels, dtype=torch.float32)
# generate a weight which we'll insert into the model
W_fp32 = torch.randn(output_channels, input_channels, dtype=torch.float32)

View File

@ -30,7 +30,6 @@ class TestQlinearPackedParams(TestCase):
row_block_size = 1
col_block_size = 4
out_features = weight_fp32.shape[0]
in_features = weight_fp32.shape[1]
scales = [2.0, 6.0, 12.0]
zero_points = [
@ -201,14 +200,11 @@ class TestQlinearPackedParams(TestCase):
row_block_size = 1
col_block_size = 4
out_features = weight_fp32.shape[0]
in_features = weight_fp32.shape[1]
scales = [2.0, 3.0, 7.0]
zero_points = [0 for _ in range(out_features)]
dtype = torch.qint8
x = torch.rand(size=(1, weight_fp32.shape[1]))
def make_lin_get_state_weight_bias_and_save():
weight = torch.quantize_per_tensor(
weight_fp32,

View File

@ -86,7 +86,7 @@ class TestBaseSparsifier(TestCase):
sparsifier0.prepare(model0, [{"tensor_fqn": "linear1.weight"}])
mask = model0.linear1.parametrizations["weight"][0].mask
mask.data = torch.arange(mask.shape[0] * mask.shape[1]).reshape(mask.shape)
for step in range(step_count):
for _ in range(step_count):
sparsifier0.step()
state_dict = sparsifier0.state_dict()

View File

@ -124,7 +124,7 @@ class TestSparsityUtilFunctions(TestCase):
list_of_modules = [m for _, m in model.named_modules()] + [model]
for module in list_of_modules:
module_fqn = module_to_fqn(model, module)
for tensor_name, tensor in module.named_parameters(recurse=False):
for tensor_name, _ in module.named_parameters(recurse=False):
tensor_fqn = (
module_fqn + ("." if module_fqn != "" else "") + tensor_name
)

View File

@ -269,7 +269,6 @@ class TestBaseStructuredSparsifier(TestCase):
def _test_step_linear_on_device(self, model, device):
model = model.to(device)
x = torch.ones(7, 7, device=device)
pruner = SimplePruner(None)
pruner.prepare(model, None)
pruner.enable_mask_update = True
@ -808,7 +807,7 @@ class TestBaseStructuredSparsifier(TestCase):
pruned_model = fx_pruner.prune()
pruned_model.eval()
out_pruned, lstm_out_pruned = pruned_model(lstm_input)
r, c = lstm_out_expected.size()
_, c = lstm_out_expected.size()
# We cannot check that y_expected == y_pruned as usual because
# zeros vs. missing elements yield different numerical results.
@ -891,7 +890,7 @@ class TestBaseStructuredSparsifier(TestCase):
pruned_model = fx_pruner.prune()
pruned_model.eval()
out_pruned, lstm_out_pruned = pruned_model(lstm_input)
r, c = lstm_out_expected.size()
_, c = lstm_out_expected.size()
# We cannot check that y_expected == y_pruned as usual because
# zeros vs. missing elements yield different numerical results.

View File

@ -670,7 +670,7 @@ class TestAutogradFunctional(TestCase):
x = ctors.randn(3)
with warnings.catch_warnings(record=True) as wa:
result = api(foo, x, vectorize=True)
api(foo, x, vectorize=True)
self.assertEqual(len(wa), 0)
@base_and_logging_tensor
@ -762,7 +762,7 @@ class TestAutogradFunctional(TestCase):
inp = ctors.rand(4)
with self.assertRaisesRegex(RuntimeError, "not supported together"):
res = autogradF.jacobian(foo, inp, strict=True, vectorize=True)
autogradF.jacobian(foo, inp, strict=True, vectorize=True)
@base_and_logging_tensor
def test_jacobian_no_grad(self, ctors):
@ -1122,7 +1122,7 @@ class TestAutogradFunctional(TestCase):
inp = ctors.rand(4)
with self.assertRaisesRegex(RuntimeError, "not supported together"):
res = autogradF.hessian(foo, inp, strict=True, vectorize=True)
autogradF.hessian(foo, inp, strict=True, vectorize=True)
@base_and_logging_tensor
def test_hessian_no_grad(self, ctors):

View File

@ -18,7 +18,7 @@ def main():
data = torch.randn(10, 50).cuda()
model = Model().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
for i in range(10):
for _ in range(10):
optimizer.zero_grad()
loss = model(data)
loss.backward()

View File

@ -78,9 +78,9 @@ def forward(self, arg0_1):
x = torch.randn(3, device="meta")
self.assertNotIn("my_custom_ops2", sys.modules.keys())
with self.assertRaisesRegex(NotImplementedError, r"'my_custom_ops2'"):
y = torch.ops.custom.sin.default(x)
torch.ops.custom.sin.default(x)
torch.ops.import_module("my_custom_ops2")
y = torch.ops.custom.sin.default(x)
torch.ops.custom.sin.default(x)
def test_calling_custom_op_string(self):
output = ops.custom.op2("abc", "def")

View File

@ -35,7 +35,7 @@ class _TestClipGradNormBase(FSDPTest):
vector_norm_fn = functools.partial(torch.linalg.vector_norm, ord=norm_type)
dp_mesh = dp_mesh or init_device_mesh("cuda", (self.world_size,))
torch.manual_seed(42 + dp_mesh.get_local_rank() + 1)
for iter_idx in range(10):
for _ in range(10):
ref_optim.zero_grad()
ref_model(inp).sum().backward()
optim.zero_grad()

View File

@ -250,8 +250,8 @@ class TestFullyShardCollectiveOps(FSDPTestMultiThread):
self.assertEqual(group.size(), self.world_size)
all_reduce_stream = torch.cuda.Stream()
(
reduce_scatter_input,
reduce_scatter_event,
_,
_,
post_reduce_event,
_,
_,
@ -406,7 +406,7 @@ class TestFullyShardCommunication(FSDPTest):
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 16), device="cuda")
for iter_idx in range(10):
for _ in range(10):
ref_loss = ref_model(inp).sum()
ref_loss.backward()
for param in ref_model.parameters():
@ -501,7 +501,7 @@ class TestFullyShardPrefetch(FSDPTest):
self, reshard_after_forward: Union[bool, int], checkpoint_impl: Optional[str]
):
n_layers = 3
model, optim, inp = self._init_transformer(
model, _, inp = self._init_transformer(
n_layers, reshard_after_forward, checkpoint_impl
)
events: List[EventType] = []
@ -843,7 +843,7 @@ class TestFullyShardPrefetch(FSDPTest):
with patch_unshard(unshard_with_record), patch_post_backward(
post_backward_with_record
):
for iter_idx in range(3):
for _ in range(3):
loss = model(inp)
expected_events = [
(
@ -922,7 +922,7 @@ class TestFullyShardPrefetch(FSDPTest):
with patch_unshard(unshard_with_record), patch_post_backward(
post_backward_with_record
):
for iter_idx in range(3):
for _ in range(3):
loss = model(inp)
expected_events = [
("unshard", "", TrainingState.FORWARD),

View File

@ -662,7 +662,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
def __init__(self, n_layers):
super().__init__()
self.layers = torch.nn.ModuleList()
for layer_id in range(n_layers):
for _ in range(n_layers):
self.layers.append(TestSubmodule(hidden_dim))
def forward(self, x):
@ -684,7 +684,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
fsdp_config = {}
mesh = init_device_mesh("cuda", (self.world_size,))
model = TestModule(n_layers=3)
for layer_id, mod in enumerate(model.layers):
for mod in model.layers:
fully_shard(mod, mesh=mesh, reshard_after_forward=True, **fsdp_config)
model = fully_shard(
model, mesh=mesh, reshard_after_forward=True, **fsdp_config
@ -871,7 +871,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
else:
v.requires_grad_(False)
assert requires_grad_param_count == n_layers * len(requires_grad_params)
for layer_id, mod in enumerate(model.layers):
for _, mod in enumerate(model.layers):
fully_shard(mod, mesh=mesh, reshard_after_forward=True, **fsdp_config)
model = fully_shard(
model, mesh=mesh, reshard_after_forward=True, **fsdp_config
@ -1087,7 +1087,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
setattr(m.encoder, name, new_child)
m = FSDP(m, sharding_strategy=ShardingStrategy.FULL_SHARD, use_orig_params=True)
inp = torch.randn(32, 784, device="cuda")
out = m(inp)
m(inp)
if __name__ == "__main__":

View File

@ -241,7 +241,7 @@ class TestFullyShardAllGatherExtensionsMultiProcess(
losses.append(_model(inp).sum())
losses[-1].backward()
if _model is ref_model:
for param_name, param in _model.named_parameters():
for _, param in _model.named_parameters():
dist.all_reduce(param.grad)
param.grad.detach().div_(self.world_size)
self.assertEqual(losses[0], losses[1])

View File

@ -904,7 +904,7 @@ class TestFullyShardProcessGroupInit(FSDPTestMultiThread):
)
self.assertEqual(mesh.mesh, ref_mesh.mesh)
self.assertEqual(mesh._coordinate_on_dim, ref_mesh._coordinate_on_dim)
for (tag, ranks, group_name), (ref_tag, ref_ranks, ref_group_name) in zip(
for (_, ranks, _), (_, ref_ranks, _) in zip(
mesh._dim_group_infos, ref_mesh._dim_group_infos
):
# Since we manually constructed new subgroups, the test and ref

View File

@ -26,7 +26,7 @@ class LoggingTests(LoggingTestCase):
env["WORLD_SIZE"] = "1"
env["MASTER_PORT"] = "34715"
env["MASTER_ADDR"] = "localhost"
stdout, stderr = self.run_process_no_exception(
_, stderr = self.run_process_no_exception(
"""\
import logging
import torch

View File

@ -590,7 +590,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 8), device="cuda")
for iter_idx in range(10):
for _ in range(10):
losses: List[torch.Tensor] = []
for _model, _optim in ((ref_model, ref_optim), (model, optim)):
_optim.zero_grad()
@ -624,12 +624,12 @@ class TestFullyShard1DTrainingCore(FSDPTest):
# sync point after each iteration
ref_losses: List[torch.Tensor] = []
losses: List[torch.Tensor] = []
for iter_idx in range(10):
for _ in range(10):
ref_optim.zero_grad()
ref_losses.append(ref_model(inp).sum())
ref_losses[-1].backward()
ref_optim.step()
for iter_idx in range(10):
for _ in range(10):
optim.zero_grad()
losses.append(model(inp).sum())
losses[-1].backward()
@ -1185,7 +1185,7 @@ class TestFullyShardNDTraining(FSDPTest):
foreach: bool,
):
global_mesh = self.init_global_mesh()
pp_mesh, dp_mesh, tp_mesh = (
_, dp_mesh, tp_mesh = (
global_mesh["pp"],
global_mesh["dp"],
global_mesh["tp"],
@ -1217,7 +1217,7 @@ class TestFullyShardNDTraining(FSDPTest):
_optim.step()
self.assertEqual(losses[0], losses[1])
for n, p in model.named_parameters():
for _, p in model.named_parameters():
self.assertIsInstance(p, DTensor)
self.assertEqual(p.device_mesh.ndim, 2)
self.assertEqual(len(p.placements), 2)
@ -1288,7 +1288,7 @@ class TestFullyShardHSDP3DTraining(FSDPTest):
_optim.step()
self.assertEqual(losses[0], losses[1])
for n, p in model.named_parameters():
for _, p in model.named_parameters():
self.assertIsInstance(p, DTensor)
self.assertEqual(p.device_mesh.ndim, 3)
self.assertEqual(len(p.placements), 3)

View File

@ -119,7 +119,6 @@ class TestCheckpoint(TestCase):
# no checkpoint
with MemoryDelta(x.device) as mem1:
loss1 = net1(x1).sum()
graph_size1 = self._get_graph_size(loss1)
loss1.backward()
# with checkpoint

View File

@ -244,7 +244,6 @@ class TestFullyShard2DTraining(FSDPTest):
ref_model.parameters(), model.named_parameters()
):
full_grad = param.grad.full_tensor()
ref_grad = ref_param.grad
self.assertEqual(ref_param.grad, full_grad)
ref_optim.step()
@ -285,7 +284,7 @@ class TestFullyShard2DTraining(FSDPTest):
# called, but they will just be no-ops without issuing any kernels.
# We prefer to keep the no-op check at the c10d level, not in FSDP.
inp = torch.randn((4, mlp_dim), device="cuda") # same on all ranks
for iter_idx in range(10):
for _ in range(10):
ref_optim.zero_grad()
optim.zero_grad()
@ -583,9 +582,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
model_2d = parallelize_module(
SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan
)
parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)
@with_comms
@skip_if_lt_x_gpu(4)
@ -833,7 +830,6 @@ class TestNew2dParallelStateDict(DTensorTestBase):
# Create a model without wrapper
torch.manual_seed(0)
no_wrap_model = simple_model().cuda(self.rank)
no_wrap_state_dict = no_wrap_model.state_dict()
no_wrap_optim = torch.optim.Adam(no_wrap_model.parameters(), lr=0.01)
no_wrap_model(no_wrap_model.get_input().cuda(self.rank)).sum().backward()
no_wrap_optim.step()
@ -890,8 +886,6 @@ class TestNew2dParallelStateDict(DTensorTestBase):
set_optimizer_state_dict(
model_2d, optimizers=optim_2d, optim_state_dict=ref_optim_2d_osd
)
new_optim_2d_osd = get_optimizer_state_dict(model_2d, optimizers=optim_2d)
ref_optim_2d_osd_states = ref_optim_2d_osd["state"]
new_optim_2d_osd_states = optim_2d_osd["state"]

View File

@ -119,7 +119,7 @@ class ComposabilityTest(MultiProcessTestCase):
)
@parametrize("use_new_runtime", [False, True])
def test_manual_with_data_parallel(self, dp_type, ScheduleClass, use_new_runtime):
device = torch.device("cuda", self.device)
_device_raii = torch.device("cuda", self.device)
torch.cuda.set_device(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
torch.distributed.init_process_group(
@ -398,7 +398,7 @@ class ComposabilityTest(MultiProcessTestCase):
],
)
def test_3d_with_tp_dp_pp(self, ScheduleClass, MixedPrecisionParam):
device = torch.device("cuda", self.device)
_device_raii = torch.device("cuda", self.device)
torch.cuda.set_device(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
torch.distributed.init_process_group(

View File

@ -329,11 +329,11 @@ class ReplicateTest(MultiProcessInductorTestCase):
code = self._test_bucketing()
self.assertEqual(counters["inductor"]["ddp_buckets"], 3)
fc = FileCheck()
for i in range(3):
for _ in range(3):
fc.check("cpp_fused_").check(
"torch.ops._c10d_functional.all_reduce_coalesced_.default("
)
for i in range(3):
for _ in range(3):
fc.check("torch.ops._c10d_functional.wait_tensor.default")
fc.run(code)
@ -342,11 +342,11 @@ class ReplicateTest(MultiProcessInductorTestCase):
code = self._test_bucketing(init_process_group=False, loop=2)
self.assertEqual(counters["inductor"]["ddp_buckets"], 3)
fc = FileCheck()
for i in range(3):
for _ in range(3):
fc.check("cpp_fused_").check(
"torch.ops._c10d_functional.all_reduce_coalesced_.default("
)
for i in range(3):
for _ in range(3):
fc.check("torch.ops._c10d_functional.wait_tensor.default")
fc.run(code)
@ -371,11 +371,11 @@ class ReplicateTest(MultiProcessInductorTestCase):
code = self._test_bucketing()
self.assertEqual(counters["inductor"]["ddp_buckets"], 3)
fc = FileCheck()
for i in range(3):
for _ in range(3):
fc.check("aten.flatten.using_ints(").check("cpp_fused_").check(
"torch.ops._c10d_functional.all_reduce_.default("
)
for i in range(3):
for _ in range(3):
fc.check("torch.ops._c10d_functional.wait_tensor.default")
fc.run(code)
@ -383,11 +383,11 @@ class ReplicateTest(MultiProcessInductorTestCase):
code = self._test_bucketing(init_process_group=False, loop=2)
self.assertEqual(counters["inductor"]["ddp_buckets"], 3)
fc = FileCheck()
for i in range(3):
for _ in range(3):
fc.check("aten.flatten.using_ints(").check("cpp_fused_").check(
"torch.ops._c10d_functional.all_reduce_.default("
)
for i in range(3):
for _ in range(3):
fc.check("torch.ops._c10d_functional.wait_tensor.default")
fc.run(code)

View File

@ -129,7 +129,7 @@ class TestShardedTensorBinaryOps(ShardedTensorTestBase):
def test_torch_equal(self):
"""Test torch.equal(ShardedTensor, ShardedTensor)"""
spec, alt_spec = self.get_gpu_specs()
spec, _ = self.get_gpu_specs()
st1, st2 = self.get_random_tensors(spec, spec, 10, 10)
self.assertTrue(torch.equal(st1, st2))
@ -145,7 +145,7 @@ class TestShardedTensorBinaryOps(ShardedTensorTestBase):
def test_torch_allclose(self):
"""Test torch.allclose(ShardedTensor, ShardedTensor)"""
spec, alt_spec = self.get_gpu_specs()
spec, _ = self.get_gpu_specs()
st1, st2 = self.get_random_tensors(spec, spec, 10, 10)
self.assertTrue(torch.allclose(st1, st2))

View File

@ -40,8 +40,6 @@ class TestShardedTensorNNInit(ShardedTensorTestBase):
],
)
h, w = 8, 2
expected_h = 2
expected_device = torch.device(f"cuda:{self.rank}")
a, b = 10, 20
seed = 1234
@ -75,8 +73,6 @@ class TestShardedTensorNNInit(ShardedTensorTestBase):
],
)
h, w = 8, 2
expected_h = 2
expected_device = torch.device(f"cuda:{self.rank}")
mean, std = 10, 5
seed = 1234
@ -110,8 +106,6 @@ class TestShardedTensorNNInit(ShardedTensorTestBase):
],
)
h, w = 8, 2
expected_h = 2
expected_device = torch.device(f"cuda:{self.rank}")
a, mode, nonlinearity = 0, "fan_in", "leaky_relu"
seed = 1234

View File

@ -456,7 +456,7 @@ class TestLocalTensor(ShardedTensorTestBase):
with self.assertRaisesRegex(
NotImplementedError, "Only single local shard is supported."
):
local_shard = st.local_tensor()
st.local_tensor()
class TestShardedTensorChunked(ShardedTensorTestBase):
@ -981,7 +981,6 @@ class TestShardedTensorChunked(ShardedTensorTestBase):
# Validate remote shards.
remote_shards = st.remote_shards()
self.assertEqual(3, len(remote_shards))
owners = {}
for rpc_rank, shards in remote_shards.items():
self.assertEqual(2, len(shards))
for remote_shard in shards:
@ -1364,14 +1363,14 @@ class TestShardedTensorChunked(ShardedTensorTestBase):
with self.assertRaisesRegex(RuntimeError, "Local rank at save time was"):
with load_with_process_group(pg):
# ShardedTensor weights_only is already tested in TestFSDPStateDict.test_torch_save_load
state_dict_deser = torch.load(buffer, weights_only=False)
torch.load(buffer, weights_only=False)
else:
with self.assertRaisesRegex(
RuntimeError, "Local world size at save time was"
):
with load_with_process_group(pg):
# ShardedTensor weights_only is already tested in TestFSDPStateDict.test_torch_save_load
state_dict_deser = torch.load(buffer, weights_only=False)
torch.load(buffer, weights_only=False)
dist.destroy_process_group()
buffer.seek(0)
@ -1379,7 +1378,7 @@ class TestShardedTensorChunked(ShardedTensorTestBase):
RuntimeError, "Need to initialize default process group"
):
# ShardedTensor weights_only is already tested in TestFSDPStateDict.test_torch_save_load
state_dict_deser = torch.load(buffer, weights_only=False)
torch.load(buffer, weights_only=False)
rpc.shutdown()
@with_comms
@ -1396,8 +1395,8 @@ class TestShardedTensorChunked(ShardedTensorTestBase):
"rank:3/cuda:3",
],
)
st1 = sharded_tensor.empty(spec, 10, 20, init_rrefs=True)
st2 = sharded_tensor.empty(spec, 10, 20)
sharded_tensor.empty(spec, 10, 20, init_rrefs=True)
sharded_tensor.empty(spec, 10, 20)
create_tensors()
self.assertEqual(0, len(sharded_tensor.api._sharded_tensor_map))
@ -2204,7 +2203,6 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase):
else:
self.assertEqual(2, len(remote_shards))
owners = {}
for rpc_rank, shards in remote_shards.items():
self.assertEqual(2, len(shards))
for remote_shard in shards:
@ -2418,10 +2416,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
placement=f"rank:{self.rank}/cuda:{self.rank}",
)
with self.assertRaisesRegex(ValueError, "Shard tensor size does not match"):
local_shard_from_wrong_meta = sharded_tensor.Shard(
local_tensor,
metadata=wrong_local_shard_metadata,
)
sharded_tensor.Shard(local_tensor, metadata=wrong_local_shard_metadata)
@with_comms
@skip_if_lt_x_gpu(4)
@ -2696,7 +2691,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
empty_local_shards = []
with self.assertRaisesRegex(ValueError, "have no local shards on all ranks"):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
empty_local_shards, [10, 10], init_rrefs=True
)
@ -2706,7 +2701,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
with self.assertRaisesRegex(
ValueError, "Only torch.strided layout is currently supported"
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_layout_shards, [10, 10], init_rrefs=True
)
@ -2719,23 +2714,19 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
ValueError,
"Only torch.contiguous_format memory_format is currently supported",
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_memory_format_shards, [10, 10], init_rrefs=True
)
with self.assertRaisesRegex(ValueError, "Shard tensor size does not match"):
wrong_size_shards = [
sharded_tensor.Shard(
torch.randn(2, 3, device=f"cuda:{self.rank}"), local_shard_metadata
)
]
with self.assertRaisesRegex(
ValueError, "Local shard tensor device does not match"
):
wrong_device_shards = [
sharded_tensor.Shard(torch.randn(5, 5), local_shard_metadata)
]
@with_comms
@skip_if_lt_x_gpu(4)
@ -2756,7 +2747,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
ValueError,
"ShardedTensor global_size property does not match from different ranks!",
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_dtype_shards, tensor_overall_size, init_rrefs=True
)
@ -2771,7 +2762,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
ValueError,
"ShardedTensor dtype property does not match from different ranks!",
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_dtype_shards, [10, 10], init_rrefs=True
)
@ -2788,7 +2779,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
ValueError,
"ShardedTensor requires_grad property does not match from different ranks!",
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_requires_grad_shards, [10, 10], init_rrefs=True
)
@ -2818,7 +2809,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
with self.assertRaisesRegex(
ValueError, "Local shards' tensor pin_memory property need to be the same"
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_pin_memory_local_shards, [10, 10], init_rrefs=True
)
@ -2832,7 +2823,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
ValueError,
"ShardedTensor pin_memory property does not match from different ranks!",
):
st = sharded_tensor.init_from_local_shards(
sharded_tensor.init_from_local_shards(
wrong_pin_memory_shards_cross_ranks, [10, 10], init_rrefs=True
)
@ -2945,19 +2936,15 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
with self.assertRaisesRegex(
ValueError, "Shard tensor size does not match with metadata.shard_lengths"
):
wrong_size_shards = [
sharded_tensor.Shard(
torch.randn(2, 3, device=f"cuda:{self.rank}"), local_shard_metadata
)
]
with self.assertRaisesRegex(
ValueError,
"Local shard tensor device does not match with local Shard's placement",
):
wrong_device_shards = [
sharded_tensor.Shard(torch.randn(5, 5), local_shard_metadata)
]
wrong_dtype_shards = [
sharded_tensor.Shard(

View File

@ -42,7 +42,7 @@ class ChunkAllShardingPlanner(ShardingPlanner):
def build_plan(self, module: nn.Module) -> ShardingPlan:
named_params = module.named_parameters()
plan = {}
for name, param in named_params:
for name, _ in named_params:
plan[name] = ChunkShardingSpec(self.dim, placements=self.devices)
return ShardingPlan(plan=plan)

View File

@ -92,7 +92,6 @@ class TestCommMode(TestCase):
self.assertEqual(comm_counts[c10d_functional.reduce_scatter_tensor], 1)
def test_comm_mode_with_dtensor(self):
world_pg = self.world_pg
mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
def f(x, y):
@ -118,8 +117,6 @@ class TestCommMode(TestCase):
if not torch.cuda.is_available():
return
world_pg = self.world_pg
inp = torch.rand(2, 8, 16).cuda()
all_gather_out = inp.new_empty(self.world_size * 2, 8, 16)
@ -202,7 +199,7 @@ class TestCommMode(TestCase):
self.checksAssert(comm_mode, c10d_ops.reduce_scatter_, 1, 1)
# tests c10d reduce_scatter_tensor_coalesced
with comm_mode as A, dist._coalescing_manager() as B:
with comm_mode, dist._coalescing_manager():
dist.reduce_scatter_tensor(all_gather_out, inp)
self.checksAssert(comm_mode, c10d_ops.reduce_scatter_tensor_coalesced_, 1, 1)

View File

@ -251,7 +251,7 @@ class TestCommModeFeatures(DTensorTestBase):
comm_mode.comm_module_counts,
{"Global": {"forward": {}, "backward": {}}},
)
output_tp = model(inp)
model(inp)
model_args = ModelArgs(dropout_p=0.0)
model2 = Transformer(model_args).to(device=self.device_type)
@ -264,7 +264,7 @@ class TestCommModeFeatures(DTensorTestBase):
comm_mode = CommDebugMode()
with comm_mode:
output = model2(inp)
model2(inp)
# checks to see if all collectives were correctly traced at the module-level
self.assertEqual(

View File

@ -155,14 +155,12 @@ class DTensorTest(DTensorTestBase):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
shard0_spec = [Shard(0)]
local_tensor = torch.randn(4, 8)
global_shape = torch.Size([self.world_size * 4, 8])
dist_tensor = DTensor.from_local(local_tensor, device_mesh, shard0_spec)
# won't affect stride
self.assertEqual(dist_tensor.stride(), (8, 1))
shard1_spec = [Shard(1)]
local_tensor = torch.randn(8, 4)
global_shape = torch.Size([8, self.world_size * 4])
dist_tensor = DTensor.from_local(local_tensor, device_mesh, shard1_spec)
# will affect stride after DT initialized
self.assertEqual(dist_tensor.stride(), (4 * self.world_size, 1))
@ -170,7 +168,6 @@ class DTensorTest(DTensorTestBase):
# if initialized from a transposed mat
local_tensor = torch.randn(8, 4, 8)
local_tensor_t = local_tensor.permute(1, 2, 0)
global_shape = torch.Size([4, self.world_size * 8, 8])
self.assertEqual(local_tensor_t.stride(), (8, 1, 32))
dist_tensor = DTensor.from_local(local_tensor_t, device_mesh, shard1_spec)
global_stride = (8 * self.world_size, 1, 32 * self.world_size)
@ -257,7 +254,7 @@ class DTensorTest(DTensorTestBase):
with self.assertRaisesRegex(
RuntimeError, "Please pass both shape and stride at the same time."
):
dtensor = DTensor.from_local(
DTensor.from_local(
tensor_list[self.rank],
device_mesh,
(Shard(0),),
@ -267,7 +264,7 @@ class DTensorTest(DTensorTestBase):
with self.assertRaisesRegex(
RuntimeError, "Please pass both shape and stride at the same time."
):
dtensor = DTensor.from_local(
DTensor.from_local(
tensor_list[self.rank],
device_mesh,
(Shard(0),),
@ -1043,7 +1040,7 @@ class DTensorLogTest(LoggingTestCase):
env["MASTER_PORT"] = "12345"
env["MASTER_ADDR"] = "localhost"
stdout, stderr = self.run_process_no_exception(
_, stderr = self.run_process_no_exception(
"""\
import logging
import torch

View File

@ -234,8 +234,8 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
requires_grad=x.requires_grad,
)
out = fn(x)
out2 = torch.compile(fn, backend="eager")(x)
fn(x)
torch.compile(fn, backend="eager")(x)
def test_dtensor_constructor_w_dynamo_disable(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
@ -599,7 +599,7 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
@torch.compile(backend=cnt)
def fn(x):
dt = DTensor.from_local(x, mesh, [placement], run_check=False)
DTensor.from_local(x, mesh, [placement], run_check=False)
x = torch.ones(4, 4, requires_grad=True)
@ -659,7 +659,7 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
x2 = x_dt.redistribute(mesh, [Replicate()], async_op=True)
x2 = x2.to_local()
self.assertTrue(isinstance(x2, AsyncCollectiveTensor))
out = opt_fn(x2)
opt_fn(x2)
# The important part: we get a wait_tensor() in the graph.
# At runtime, the input to the graph is an AsyncCollectiveTensor,
# and inside the graph we need to issue a wait() to synchronize.
@ -880,8 +880,6 @@ class TestDTensorCompileE2E(DTensorTestBase):
mesh_dim_names=["dp", "tp"],
)
fsdp_pg = twod_mesh.get_group(mesh_dim=0)
inp = torch.rand(20, 10, device=self.device_type)
parallelize_plan = {
"mlp_0.net1": ColwiseParallel(),

View File

@ -249,7 +249,7 @@ class DistTensorRandomOpTest(DTensorTestBase):
device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
# seed synchronization happens after the first `distribute_tensor` call
dtensor = distribute_tensor(
distribute_tensor(
torch.empty([self.world_size], device="cuda"), device_mesh, [Shard(0)]
)
self.assertEqual(seed_from_rank_0, random._rng_tracker.get_seed("parallel-rng"))

View File

@ -309,7 +309,7 @@ class RedistributeTest(DTensorTestBase):
shard_tensor = distribute_tensor(local_tensor, device_mesh, shard_spec)
self.assertEqual(shard_tensor.placements[0].dim, 1)
reshard_tensor = shard_tensor.redistribute(device_mesh, shard_minus_spec)
self.assertEqual(shard_tensor.placements[0].dim, 1)
self.assertEqual(reshard_tensor.placements[0].dim, 1)
@with_comms
def test_redistribute_uneven_sharding(self):

View File

@ -622,7 +622,7 @@ class DistTensorOpsTest(DTensorTestBase):
self.assertEqual(misses, 2)
# convert to fp32 again and see if there's cache hit
fp32_sharded_dtensor1 = bf16_sharded_dtensor1.float()
bf16_sharded_dtensor1.float()
hits, misses, _, _ = _get_sharding_prop_cache_info()
# by now we should have cache hit
self.assertEqual(hits, 1)

View File

@ -133,7 +133,6 @@ class UtilTest(DTensorTestBase):
global_tensor_shape, global_mesh, placements
)
assert global_mesh.get_coordinate is not None
dp_replic_rank = global_mesh.get_local_rank("dp_replic")
dp_shard_rank = global_mesh.get_local_rank("dp_shard")
tp_rank = global_mesh.get_local_rank("tp")
shard_idx_on_dim_0 = tp_rank * dp_shard_size + dp_shard_rank

View File

@ -150,7 +150,7 @@ class DTensorXLAIntegrationTest(TestCase):
shard_spec = [Shard(0)]
# annoate fc1 and fc2
if isinstance(mod, nn.Linear):
for name, param in mod.named_parameters():
for _, param in mod.named_parameters():
# annotate the parameter tensors directly
distribute_tensor(param, mesh, shard_spec)

View File

@ -1,4 +1,5 @@
# Owner(s): ["oncall: distributed"]
# ruff: noqa: F841
import os
import sys

View File

@ -277,7 +277,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin):
self.assertEqual(loss, dist_loss)
dist_msd, dist_osd = get_state_dict(dist_model, optimizers=dist_optim)
model_sd, optim_sd = get_state_dict(model, optimizers=optim)
model_sd, _ = get_state_dict(model, optimizers=optim)
self._verify_msd(model_sd, dist_msd)
self._verify_osd_by_load(model, optim, self._optim(model), dist_osd)

View File

@ -96,7 +96,7 @@ class TestFineTuning(DTensorTestBase):
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
# Training
for i in range(3):
for _ in range(3):
batch = torch.rand(32, DIM, device="cuda")
loss = model(batch).sum()
loss.backward()
@ -161,7 +161,7 @@ class TestFineTuning(DTensorTestBase):
self.assertEqual(i, 0)
# Training
for j in range(3):
for _ in range(3):
batch = torch.rand(32, DIM, device="cuda")
loss = model(batch).sum()
loss.backward()

View File

@ -85,11 +85,9 @@ class TestDistributedCheckpointing(ShardedTensorTestBase):
)
st = sharded_tensor.zeros(spec, 4, 4, dtype=torch.float64)
mapping = {}
md = _create_default_local_metadata({"st": st})
st_md = md.state_dict_metadata["st"]
self.assertEqual(1, len(st_md.chunks))
@with_comms(init_rpc=False)

View File

@ -86,7 +86,6 @@ class TestFsdpTpCheckpointConversion(DTensorTestBase):
tp_model.load_state_dict(tp_state_dict)
# Check parameters are equal after loading.
tp_state_dict_after_load = tp_model.state_dict()
for fsdp_item, tp_item in zip(fsdp_state_dict.items(), tp_state_dict.items()):
fsdp_k, fsdp_v = fsdp_item
tp_k, tp_v = tp_item

View File

@ -120,7 +120,6 @@ class TestHSDPCheckpoint(DTensorTestBase):
)
model.load_state_dict(state_dict_to_save["model"])
state_dict_after_load = model.state_dict()
# After loading, the current model state dict should be the same as state_dict_to_save.
for (k1, v1), (k2, v2) in zip(
state_dict_to_save["model"].items(), model.state_dict().items()

View File

@ -43,7 +43,7 @@ class TestFlattening(TestCase):
"k3": ["x", 99, [{"k3": "y"}]],
}
flatten_dict, mapping = flatten_state_dict(state_dict)
_, mapping = flatten_state_dict(state_dict)
"""
flatten_dict:
{'k0': [1], 'k2.0': tensor([1]), 'k2.1': 99, 'k2.2.0.k3': tensor(1), 'k3': ['x', 99, [{'k3': 'y'}]]}

View File

@ -40,21 +40,19 @@ class TestSaveAndLoadAPI(DTensorTestBase):
device_mesh = init_device_mesh(self.device_type, (self.world_size,))
model = FSDP(model, device_mesh=device_mesh)
dcp.save(model.state_dict(), checkpoint_id=os.path.join(self.temp_dir, "first"))
sd = dcp.load(
model.state_dict(), checkpoint_id=os.path.join(self.temp_dir, "first")
)
dcp.load(model.state_dict(), checkpoint_id=os.path.join(self.temp_dir, "first"))
with patch.object(
dcp.FileSystemReader, "validate_checkpoint_id", return_value=False
) as m1:
):
with patch.object(
dcp.FileSystemWriter, "validate_checkpoint_id", return_value=False
) as m2:
):
dcp.save(
model.state_dict(),
checkpoint_id=os.path.join(self.temp_dir, "second"),
)
sd = dcp.load(
dcp.load(
model.state_dict(),
checkpoint_id=os.path.join(self.temp_dir, "second"),
)
@ -62,7 +60,7 @@ class TestSaveAndLoadAPI(DTensorTestBase):
with self.assertRaisesRegex(RuntimeError, "Cannot detect"):
dcp.save(model.state_dict(), checkpoint_id="abc://abc.abc")
with self.assertRaisesRegex(RuntimeError, "Cannot detect"):
sd = dcp.load(model.state_dict(), checkpoint_id="abc://abc.abc")
dcp.load(model.state_dict(), checkpoint_id="abc://abc.abc")
if __name__ == "__main__":

View File

@ -81,7 +81,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
# Train 10 steps.
_dist_optim = [dist_optim] if not isinstance(dist_optim, list) else dist_optim
for i in range(10):
for _ in range(10):
optim.zero_grad()
for d_optim in _dist_optim:
d_optim.zero_grad()

View File

@ -104,7 +104,7 @@ class TestStateDictUtils(DTensorTestBase):
return tensor, dist_tensor
ltensor, ldtensor = [], []
for i in range(10):
for _ in range(10):
tensor, dtensor = create_dtensor()
ltensor.append(tensor)
ltensor.append(torch.ones(10, device=torch.device("cuda")))

View File

@ -259,7 +259,7 @@ class _StartProcessesTest(TestCase):
) -> None:
mp_queue = mp.get_context("spawn").Queue()
child_nproc = 2
ctx = mp.spawn(
mp.spawn(
start_processes_zombie_test,
nprocs=1,
args=(entrypoint, mp_queue, self.log_dir(), child_nproc),

View File

@ -165,7 +165,7 @@ class CreateBackendTest(TestCase):
def test_create_backend_returns_backend_if_is_host_is_not_specified_and_store_already_exists(
self,
) -> None:
store = TCPStore( # type: ignore[call-arg] # noqa: F841
TCPStore( # type: ignore[call-arg] # noqa: F841
self._expected_endpoint_host, self._expected_endpoint_port, is_master=True
)

View File

@ -99,7 +99,7 @@ class RendezvousTimeoutTest(TestCase):
ValueError,
rf"^The join timeout \({join_timeout}\) must be positive.$",
):
timeout = RendezvousTimeout(join_timeout)
RendezvousTimeout(join_timeout)
class NodeDescTest(TestCase):
@ -1637,7 +1637,7 @@ class CreateHandlerTest(TestCase):
def _ignore_exception(exception_type: Exception, fn: Callable):
try:
fn()
except exception_type as e:
except exception_type:
pass

View File

@ -70,7 +70,7 @@ class RendezvousBackendTestMixin(ABC):
self.assertTrue(has_set)
def test_set_state_sets_backend_state_if_token_is_current(self) -> None:
state1, token1, has_set1 = self._set_state(b"x")
_, token1, has_set1 = self._set_state(b"x")
state2, token2, has_set2 = self._set_state(b"y", token1)
@ -80,7 +80,7 @@ class RendezvousBackendTestMixin(ABC):
self.assertTrue(has_set2)
def test_set_state_returns_current_backend_state_if_token_is_old(self) -> None:
state1, token1, _ = self._set_state(b"x")
_, token1, _ = self._set_state(b"x")
state2, token2, _ = self._set_state(b"y", token1)

View File

@ -113,7 +113,7 @@ if not (IS_WINDOWS or IS_MACOS):
num_clients = 10
num_requests_per_client = 10
processes = []
for i in range(num_clients):
for _ in range(num_clients):
p = mp.Process(
target=func, args=(num_requests_per_client, self.file_path)
)
@ -190,7 +190,7 @@ if not (IS_WINDOWS or IS_MACOS):
"""
client = timer.FileTimerClient(file_path)
sem.release()
for i in range(0, n):
for _ in range(0, n):
client.acquire("test_scope", 0)
time.sleep(interval)

View File

@ -159,7 +159,7 @@ class CheckpointWrapperTest(TestCase):
if use_reentrant
else CheckpointImpl.NO_REENTRANT,
)
for i in range(self.n):
for _ in range(self.n):
l = nn.Sequential(
nn.Linear(256, 256), nn.Linear(256, 256), nn.Linear(256, 256)
)

View File

@ -303,13 +303,13 @@ class TestFSDPWithDeviceMeshAndDTensor(DTensorTestBase):
RuntimeError, "DeviceMesh is not compatible with LOCAL_STATE_DICT."
):
with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
state_dict = model.state_dict()
model.state_dict()
with self.assertRaisesRegex(
RuntimeError, "DeviceMesh is not compatible with LOCAL_STATE_DICT."
):
with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
optim_state_dict = FSDP.optim_state_dict(model, optim)
FSDP.optim_state_dict(model, optim)
instantiate_parametrized_tests(TestFSDPWithDeviceMeshAndDTensor)

View File

@ -364,9 +364,8 @@ class TestFSDPFineTune(FSDPTest):
)
torch.manual_seed(self.rank + 1)
losses = []
for idx in range(6):
for _ in range(6):
frozen_input = torch.randn((4, 4), device="cuda", requires_grad=False)
learnable_input = torch.randn((4, 4), device="cuda", requires_grad=True)
for _model, _optim in ((model, model_optim), (ref_model, ref_model_optim)):
loss = _model(frozen_input, frozen_input).sum()
losses.append(loss)

View File

@ -182,7 +182,7 @@ class TestFreezingWeights(FSDPTest):
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
for iteration in range(3):
for _ in range(3):
out = model(batch)
fake_loss = criterion(out, target)
optimizer.zero_grad()

View File

@ -108,8 +108,6 @@ class TestFSDPMemory(FSDPTest):
def _dist_train(self, with_checkpoint, expected, model_hidden_dim, iterations):
gpu_id = self.rank
world_size = self.world_size
batch = torch.randn(size=(2, 3, 224, 224)).cuda()
model = create_model(

View File

@ -278,9 +278,9 @@ class TestFSDPMiscMultiProcess(FSDPTest):
)
x = torch.randn(10, 10, device="cuda")
y = torch.randn(10, 10, device="cuda")
for i in range(4):
for _ in range(4):
if use_second_layer:
a, b = fsdp(x, y)
a, _ = fsdp(x, y)
else:
a = fsdp(x, y)
loss = a.sum()
@ -509,7 +509,7 @@ class TestFSDPMiscMultiProcess(FSDPTest):
def test_fsdp_cpu_training(self):
"""Tests FSDP training on CPU."""
gloo_pg = dist.new_group(backend="gloo")
for ss in [
for ss in [ # noqa: F841
ShardingStrategy.NO_SHARD,
ShardingStrategy.FULL_SHARD,
ShardingStrategy.SHARD_GRAD_OP,
@ -857,13 +857,13 @@ class TestFSDPMiscMultiThread(FSDPTestMultiThread):
torch.cuda.set_device(self.rank)
# Test CPU
no_params = nn.ReLU()
module = FSDP(no_params)
FSDP(no_params)
# Test CUDA
no_params = nn.ReLU().cuda()
module = FSDP(no_params)
FSDP(no_params)
# Test CPU + device_id
no_params = nn.ReLU()
module = FSDP(no_params, device_id=torch.cuda.current_device())
FSDP(no_params, device_id=torch.cuda.current_device())
# For modules with no params, wrong device_id will raise error about
# inconsistency between compute_device and device_id, since compute_device
# is computed as torch.cuda.current_device when there are no params.

View File

@ -1139,7 +1139,6 @@ class TestFSDPDifferentSubmodulePrecision(FSDPTest):
model = SaveForwardInputsModel(
forward_inputs=forward_inputs, cast_forward_inputs=False
).cuda()
c1, c2 = model.c1, model.c2
x = torch.zeros(2, 100, device="cuda")
# float16 on one submodule and float32 on everything else

View File

@ -45,7 +45,7 @@ class TestMultipleWrapping(FSDPTest):
model = FSDP(inner_model).cuda()
optim = SGD(model.parameters(), lr=0.1)
for i in range(3):
for _ in range(3):
input = torch.rand((1, 5), dtype=torch.float).cuda()
input.requires_grad = True
output = model(input)

View File

@ -1510,7 +1510,7 @@ class TestFSDPOptimState(FSDPTest):
) = self._init_nested_model(wrap=False, use_multiple_param_groups=False)
if should_check_method_fn("rekey_optim_state_dict"):
with context_fn():
rekeyed_osd = FSDP.rekey_optim_state_dict(
FSDP.rekey_optim_state_dict(
fsdp_osd, # from `full_optim_state_dict()`
OptimStateKeyType.PARAM_ID,
nonwrapped_model,
@ -1650,7 +1650,7 @@ class TestFSDPOptimState(FSDPTest):
)
# Make optim1 has a different state.
for i in range(5):
for _ in range(5):
batch = torch.rand(5, 8).cuda()
loss = models[1](batch).sum()
loss.backward()
@ -1765,7 +1765,7 @@ class TestFSDPOptimState(FSDPTest):
initializer = self._model_class[model_class]
# First, run a wrapped model with full world size for a few iterations
model1, optim1, optim_input1 = initializer(
model1, optim1, _ = initializer(
wrap=True,
use_multiple_param_groups=use_multiple_param_groups,
)
@ -1788,7 +1788,7 @@ class TestFSDPOptimState(FSDPTest):
new_group = dist.distributed_c10d._get_default_group()
# Second, run a wrapped model with (possibly) halved world size and
# (possibly) differing `optim_input` across ranks
model2, optim2, optim_input2 = initializer(
model2, optim2, _ = initializer(
wrap=True,
group=new_group,
use_multiple_param_groups=use_multiple_param_groups,
@ -1861,7 +1861,8 @@ class TestFSDPOptimState(FSDPTest):
FSDP.optim_state_dict(model, optim), osd, check_same_param_keys=True
)
step()
osd_to_load = FSDP.optim_state_dict_to_load(
osd_to_load = FSDP.optim_state_dict_to_load( # noqa: F841
model, optim, osd, load_directly=True
)
self._check_same_state(
@ -1994,7 +1995,7 @@ class TestFSDPOptimState(FSDPTest):
loss.backward()
fsdp_optim.step()
orig_state_dict = deepcopy(fsdp_optim.state_dict())
optim_state_dict = FSDP.optim_state_dict(fsdp_model, fsdp_optim)
FSDP.optim_state_dict(fsdp_model, fsdp_optim)
FSDP.optim_state_dict_to_load(
fsdp_model,
fsdp_optim,

View File

@ -966,7 +966,7 @@ class TestFSDPStateDict(FSDPTest):
setattr(module, LINEAR_SKIP, linear_skip)
return fsdp, linear_skip_tensor_names
fsdp, linear_skip_tensor_names = _create_module()
fsdp, _ = _create_module()
# Run a forward pass
inp = torch.randn((1, 10), device=torch.cuda.current_device())
loss = fsdp(inp)

View File

@ -634,7 +634,7 @@ class TestUnshardParams(TestUnshardParamsBase):
model = FSDP(model, auto_wrap_policy=ModuleWrapPolicy((nn.Sequential,)))
with FSDP.summon_full_params(model[0]):
# Check that the summoned module does not have its flat parameter
for param_name, param in model[0].named_parameters():
for param_name, _ in model[0].named_parameters():
self.assertFalse(FLAT_PARAM in param_name)
self.assertGreater(len(list(model[0].parameters())), 1)

View File

@ -260,7 +260,7 @@ class TestFSDPUseOrigParamsMultipleParamGroups(FSDPTest):
model = FSDP(copy.deepcopy(base_model), self.process_group, **fsdp_kwargs)
model = torch.compile(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
for i in range(10):
for _ in range(10):
losses = []
inp = ref_model.get_input(torch.device("cuda"))
for _model, _optim in ((ref_model, ref_optim), (model, optim)):

View File

@ -118,7 +118,7 @@ class TestUtils(TestCase):
x.fill_(0)
x = nn.utils.rnn.pack_padded_sequence(x, seq_length)
x, h = rnn(x)
x, _ = rnn(x)
x = _apply_to_tensors(fill_fn, x)
x, _ = nn.utils.rnn.pad_packed_sequence(x)
self.assertEqual(torch.sum(x), 0)

View File

@ -41,7 +41,6 @@ class LaunchTest(unittest.TestCase):
def test_launch_without_env(self):
nnodes = 1
nproc_per_node = 4
world_size = nnodes * nproc_per_node
sock = get_socket_with_port()
with closing(sock):
master_port = sock.getsockname()[1]

View File

@ -114,7 +114,7 @@ class CustomLinearDx(Function):
@staticmethod
def backward(ctx, grad_output):
input_val, weight, bias = ctx.saved_tensors
input_val, weight, _ = ctx.saved_tensors
grad_input = grad_output.mm(weight)
ctx.module.cached_context[ctx.layer_idx].append(grad_output.clone())
ctx.module.cached_context[str(ctx.layer_idx) + "_input"].append(
@ -131,7 +131,7 @@ class CustomLinearDxDw(Function):
@staticmethod
def backward(ctx, grad_output):
input_val, weight, bias = ctx.saved_tensors
input_val, weight, _ = ctx.saved_tensors
grad_input = grad_output.mm(weight)
grad_weight = grad_output.t().mm(input_val)
grad_bias = grad_output.sum(0)

View File

@ -74,7 +74,7 @@ class StageBackwardTests(TestCase):
# Forward, then backward of loss with respect to inputs
out = mod(x)
loss = loss_fn(out, target)
dinputs, param_groups = stage_backward_input(
dinputs, _param_groups = stage_backward_input(
stage_outputs_or_loss=(loss,),
output_grads=None,
input_values=[x],
@ -88,7 +88,7 @@ class StageBackwardTests(TestCase):
torch.testing.assert_close(x.grad, ref_x.grad)
torch.testing.assert_close(dinputs[0], ref_x.grad)
for name, p in mod.named_parameters():
for _, p in mod.named_parameters():
# Check that the weight gradients were not updated
self.assertEqual(p.grad, None)
@ -109,7 +109,7 @@ class StageBackwardTests(TestCase):
# Forward, then backward of loss with respect to inputs
out = mod(x)
loss = loss_fn(out, target)
dinputs, param_groups = stage_backward_input(
_dinputs, param_groups = stage_backward_input(
stage_outputs_or_loss=(loss,),
output_grads=None,
input_values=[x],
@ -157,7 +157,7 @@ class StageBackwardTests(TestCase):
for x in inputs:
out = mod(x)
loss = loss_fn(out, target)
dinputs, param_groups = stage_backward_input(
_dinputs, param_groups = stage_backward_input(
stage_outputs_or_loss=(loss,),
output_grads=None,
input_values=[x],

View File

@ -264,7 +264,7 @@ class TestSchedulePlan(TestCase):
]
schedule = ScheduleClass(stages, num_microbatches)
formatted_pipeline_order = _format_pipeline_order(
_formatted_pipeline_order = _format_pipeline_order(
schedule.pipeline_order
)
@ -305,10 +305,7 @@ class TestSchedulePlan(TestCase):
for i in range(num_local_stages)
]
schedule = ScheduleClass(stages, num_microbatches)
formatted_pipeline_order = _format_pipeline_order(
schedule.pipeline_order
)
# print(formatted_pipeline_order)
_format_pipeline_order(schedule.pipeline_order)
def stage_to_rank(stage):
return stage % group_size

View File

@ -151,7 +151,7 @@ class ScheduleTest(MultiProcContinousTest):
schedule.step(x)
elif self.rank == self.world_size - 1:
losses = []
out = schedule.step(target=target, losses=losses)
schedule.step(target=target, losses=losses)
else:
schedule.step()
@ -412,7 +412,6 @@ class ScheduleTest(MultiProcContinousTest):
if hasattr(ScheduleClass, "num_microbatches")
else 8
)
input_args = x.chunk(num_microbatches)[0]
stages = [
PipelineStage(
stage_module,
@ -548,7 +547,6 @@ class ScheduleTest(MultiProcContinousTest):
loss_fn = torch.nn.MSELoss(reduction="sum")
# Create a pipeline stage to wrap that submodule
input_args = x.chunk(num_microbatches)[0]
stage_indices = rank_stages[self.rank]
print(f"Rank {self.rank} stages: {stage_indices}")
submod_names = [f"layers.{i}" for i in stage_indices]
@ -582,7 +580,7 @@ class ScheduleTest(MultiProcContinousTest):
schedule.step(x)
elif self.rank == self.world_size - 1:
losses = []
out = schedule.step(target=target, losses=losses)
schedule.step(target=target, losses=losses)
else:
schedule.step()
self.assertEqual(
@ -887,7 +885,6 @@ class ScheduleTest(MultiProcContinousTest):
# Create a pipeline stage to wrap that submodule
chunks = 2
input_args = x.chunk(chunks)[0]
stages = [
PipelineStage(
stage_module,

View File

@ -310,9 +310,6 @@ class StageTest(MultiProcContinousTest):
full_mod.to(self.device)
stage_mod = full_mod.get_submodule(f"layers.{self.rank}")
x = torch.randn(batch_size, d_hid, device=self.device)
target = torch.randn(batch_size, d_hid, device=self.device)
stage_with_dw_builder = PipelineStage(
stage_mod,
self.rank,

View File

@ -58,7 +58,7 @@ class UnflattenTests(TestCase):
# Check qualnames
for stage_idx in range(pipe.num_stages):
stage_mod = pipe.get_stage_module(stage_idx)
for param_name, param in stage_mod.named_parameters():
for param_name, _ in stage_mod.named_parameters():
assert (
param_name in orig_state_dict
), f"{param_name} not in original state dict"

View File

@ -87,7 +87,9 @@ class MicroPipelineTPTest(TestCase):
a = all_gather_tensor(inp, gather_dim=0, group=group.group_name)
b = all_gather_tensor(inp, gather_dim=1, group=group.group_name)
c = _fp8_all_gather(inp, gather_dim=0, group_name=group.group_name)
d = _fp8_all_gather(inp, gather_dim=1, group_name=group.group_name)
d = _fp8_all_gather( # noqa: F841
inp, gather_dim=1, group_name=group.group_name
)
return a, b, c
inp = torch.rand(64, 32, device="cuda")

View File

@ -311,7 +311,7 @@ class DistTensorParallelExampleTest(DTensorTestBase):
torch.manual_seed(0)
steps = 10 if type(model) is torch.float64 else 1
for iter in range(steps):
for _ in range(steps):
inp = torch.randint(
model_args.vocab_size, inp_size, device=self.device_type
)

View File

@ -223,7 +223,7 @@ class TensorParallelStyleTest(DTensorTestBase):
AssertionError,
"input_layouts and desired_input_layouts should have same length!",
):
prepare_inps_dimension_mismatch = PrepareModuleInput(
PrepareModuleInput(
input_layouts=Shard(0), desired_input_layouts=(Replicate(), None)
)
# Raise assertion error if module inputs and input_layouts do not have same length.

View File

@ -182,7 +182,7 @@ class TimeoutTest(TestCase):
threads.append(t)
t.start()
for i, thread in enumerate(threads):
for _, thread in enumerate(threads):
thread.join()
# we expect the world_size-1 threads to have failed
@ -583,14 +583,14 @@ class CommonDistributedDataParallelTest:
)
)
with err_ctx:
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.CheckpointOnceModule(use_reentrant=use_reentrant),
process_group=process_group,
use_bucket_view=use_bucket_view,
find_unused_parameters=True,
)
# test passes when static_graph is true
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.CheckpointOnceModule(use_reentrant=use_reentrant),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -615,7 +615,7 @@ class CommonDistributedDataParallelTest:
)
)
with err_ctx:
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.CheckpointTwiceModule(use_reentrant=use_reentrant),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -623,7 +623,7 @@ class CommonDistributedDataParallelTest:
)
with err_ctx:
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.CheckpointTwiceModule(use_reentrant=use_reentrant),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -641,7 +641,7 @@ class CommonDistributedDataParallelTest:
process_group = self._get_process_group()
for use_bucket_view in (True, False):
# Test passes when static_graph=True.
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.CheckpointTwiceModule(use_reentrant=use_reentrant),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -656,7 +656,7 @@ class CommonDistributedDataParallelTest:
"""
process_group = self._get_process_group()
for use_bucket_view in (True, False):
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.DynamicCheckpointTwiceModule(use_reentrant=False),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -675,7 +675,7 @@ class CommonDistributedDataParallelTest:
"""
process_group = self._get_process_group()
for use_bucket_view in (True, False):
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.DynamicCheckpointTwiceModuleWeightSharing(use_reentrant=False),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -719,7 +719,7 @@ class CommonDistributedDataParallelTest:
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
for use_bucket_view in (True, False):
model = self._test_ddp_checkpointing(
self._test_ddp_checkpointing(
self.CheckpointTwiceModuleWeightSharing(),
process_group=process_group,
use_bucket_view=use_bucket_view,
@ -737,7 +737,7 @@ class CommonDistributedDataParallelTest:
"Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, "
"because PowerSGD can only be applied after the first two iterations in DDP.",
):
state = powerSGD.PowerSGDState(
powerSGD.PowerSGDState(
process_group=None,
matrix_approximation_rank=1,
start_powerSGD_iter=start_powerSGD_iter,

View File

@ -429,7 +429,7 @@ class TestWithNCCL(MultiProcessTestCase):
input = torch.full((10, 10), float(self.rank), device=self.device)
self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0)
output = torch.ops._c10d_functional.all_reduce(
torch.ops._c10d_functional.all_reduce(
input,
"avg",
"default",
@ -550,7 +550,7 @@ class CompileTest(TestCase):
assert "= torch.ops._c10d_functional.wait_tensor.default" not in code
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (arg,))
AOTIRunnerUtil.run("cuda", func, (arg,))
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@ -596,7 +596,7 @@ class CompileTest(TestCase):
assert "= torch.ops._c10d_functional.wait_tensor.default" not in code
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (args,))
out = AOTIRunnerUtil.run("cuda", func, (args,)) # noqa: F841
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@ -708,7 +708,7 @@ class CompileTest(TestCase):
assert "= torch.ops._c10d_functional.wait_tensor.default" not in code
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (arg,))
AOTIRunnerUtil.run("cuda", func, (arg,))
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@ -742,7 +742,7 @@ class CompileTest(TestCase):
)
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (args,))
out = AOTIRunnerUtil.run("cuda", func, (args,)) # noqa: F841
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "This is a GPU test!")
@ -764,7 +764,7 @@ class CompileTest(TestCase):
)
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (arg,))
AOTIRunnerUtil.run("cuda", func, (arg,))
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@ -790,7 +790,7 @@ class CompileTest(TestCase):
)
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (arg,))
AOTIRunnerUtil.run("cuda", func, (arg,))
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@ -910,7 +910,7 @@ class CompileTest(TestCase):
)
# Test aoti
out = AOTIRunnerUtil.run("cuda", func, (arg,))
AOTIRunnerUtil.run("cuda", func, (arg,))
torch.cuda.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")

View File

@ -1920,7 +1920,7 @@ class DistributedDataParallelTest(
ddp_state_dict = torch.load(checkpoint_path, map_location=map_location)
for model in [ddp_withload, model_withload]:
for p in ddp_withload.parameters():
for p in model.parameters():
with torch.no_grad():
p.zero_()
ddp_withload.load_state_dict(ddp_state_dict)
@ -1973,7 +1973,8 @@ class DistributedDataParallelTest(
This unit test verifies whether the Future object is passed properly.
The callback function creates a Future object and sets a value to it.
"""
store = c10d.FileStore(self.file_name, self.world_size)
store = c10d.FileStore(self.file_name, self.world_size) # noqa: F841
process_group = self._get_process_group()
# Test on CPU

View File

@ -366,7 +366,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
thread.start()
# We would get stuck here due to d2h if we didn't abort.
t_cpu = t.cpu()
t.cpu()
thread.join()
@ -741,7 +741,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
# First allreduce to initialize default PG's communicator.
pg.allreduce(t).wait()
# PG1 is an PG without comms initialized, since we don't call collective on it
new_pg1 = c10d.new_group([0, 1])
new_pg1 = c10d.new_group([0, 1]) # noqa: F841
new_pg2 = c10d.new_group([0, 1])
t2 = torch.rand(10, 10, device=device)
@ -807,7 +807,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
# 'timeout' kwarg (or its kwdefault) taking precedence
opts = dist.ProcessGroupNCCL.Options()
opts._timeout = timedelta(seconds=123)
with warnings.catch_warnings(record=True) as w:
with warnings.catch_warnings(record=True):
dist.init_process_group(**base_opts, pg_options=opts)
# TODO(whc) i verified that we are indeed emitting this warning, and i can't figure out why i can't catch it.
# self.assertEqual(len(w), 1)
@ -1266,30 +1266,26 @@ class DistributedDataParallelTest(
"DistributedDataParallel device_ids and output_device arguments only work with "
"single-device/multiple-device GPU modules or CPU modules",
):
ddp_model = DistributedDataParallel(
DistributedDataParallel(
model, output_device=gpus[1], process_group=process_group
)
with self.assertRaisesRegex(
ValueError, "device_ids can only be None or contain a single element."
):
ddp_model = DistributedDataParallel(
model, device_ids=gpus, process_group=process_group
)
DistributedDataParallel(model, device_ids=gpus, process_group=process_group)
with self.assertRaisesRegex(
ValueError, "input module must be on the same type of devices"
):
model.fc1 = model.fc1.cpu()
ddp_model = DistributedDataParallel(model, process_group=process_group)
DistributedDataParallel(model, process_group=process_group)
model = model.cpu()
with self.assertRaisesRegex(
ValueError, "device_ids can only be None or contain a single element."
):
ddp_model = DistributedDataParallel(
model, device_ids=gpus, process_group=process_group
)
DistributedDataParallel(model, device_ids=gpus, process_group=process_group)
def _test_fp16(self, gradient_as_bucket_view=False):
process_group = self._get_process_group()
@ -1940,11 +1936,9 @@ class DistributedDataParallelTest(
),
named_msg,
)
for j, ((param_name, p), p_ddp) in enumerate(
zip(
for (param_name, p), p_ddp in zip(
m_child.named_parameters(),
m_ddp_child.parameters(),
)
):
named_msg = (
layer_name + "." + param_name + " " + iter_msg
@ -2010,15 +2004,13 @@ class DistributedDataParallelTest(
m = ConvNet(layer_devs, layer_formats, layer_dtypes)
if self.rank == 0:
m_ddp = DistributedDataParallel(
m, device_ids=[dev0], process_group=process_group
)
DistributedDataParallel(m, device_ids=[dev0], process_group=process_group)
else:
with self.assertRaisesRegex(
RuntimeError,
".* appears not to match strides of the same param in process 0",
):
m_ddp = DistributedDataParallel(
DistributedDataParallel(
m, device_ids=[dev0], process_group=process_group
)
@ -2356,7 +2348,7 @@ class DistributedDataParallelTest(
process_group=process_group,
)
for i in range(3):
for _ in range(3):
m.zero_grad(set_to_none=try_set_to_none)
m(1).sum().backward()
@ -2701,7 +2693,7 @@ class WorkHookTest(MultiProcessTestCase):
pg._register_on_completion_hook(hook)
tensor = torch.ones([2, 3]).cuda(self.rank) * self.rank
work_count = 3
for i in range(work_count):
for _ in range(work_count):
work += 1
pg.broadcast([tensor]).wait()
@ -2806,7 +2798,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
# Run some GPU operations to make sure CUDA has not gotten stuck.
# It was observed that CUDA could get stuck if NCCL communicators were
# not properly aborted before throwing RuntimeError.
a = torch.rand(10).cuda(self.rank)
torch.rand(10).cuda(self.rank)
elif self.rank == 1:
# Clean up structures (e.g. files for FileStore) before going down
del process_group
@ -2947,7 +2939,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = val
store = c10d.FileStore(self.file_name, self.world_size)
with self.assertRaises(RuntimeError):
process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@requires_nccl()
@skip_if_lt_x_gpu(3)
@ -4223,7 +4215,7 @@ class NCCLTraceTestBase(MultiProcessTestCase):
def _join_processes(self, fn):
# We need to patch sys.exit() as skip_if will use sys.exit() and
# the exit code from this process will not be caught.
with mock.patch("sys.exit") as exit_mock:
with mock.patch("sys.exit"):
fn()
super()._join_processes(fn)
@ -4231,7 +4223,7 @@ class NCCLTraceTestBase(MultiProcessTestCase):
proc = torch.multiprocessing.get_context("spawn").Process
self.children_pipes = []
parent_pipes = []
for i in range(self.world_size):
for _ in range(self.world_size):
parent_conn, child_conn = torch.multiprocessing.Pipe()
self.children_pipes.append(child_conn)
parent_pipes.append(parent_conn)
@ -4346,7 +4338,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
pg._enable_collectives_timing()
device = self.local_device
a = torch.full((3, 4), float(self.rank), device=device)
for i in range(2):
for _ in range(2):
f = pg.allreduce(a)
f.wait()
torch.cuda.synchronize(device=device)
@ -4372,7 +4364,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
pg._enable_collectives_timing()
device = self.local_device
a = torch.full((3, 4), float(self.rank), device=device)
for i in range(2):
for _ in range(2):
f = pg.allreduce(a)
f.wait()
torch.cuda.synchronize(device=device)
@ -4420,7 +4412,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
pg = self._create_process_group_nccl()
device = self.local_device
a = torch.full((3, 4), float(self.rank), device=device)
for i in range(2):
for _ in range(2):
f = pg.allreduce(a)
f.wait()
torch.cuda.synchronize(device=device)
@ -4436,7 +4428,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
pg = self._create_process_group_nccl()
device = self.local_device
a = torch.full((3, 4), float(self.rank), device=device)
for i in range(2):
for _ in range(2):
# test some other primitives to make sure
# their strings are valid
xs = [torch.ones(3, 4, device=device)]
@ -4496,7 +4488,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
pg = self._create_process_group_nccl()
device = self.local_device
# send more work objects than the buffer size to overwrite the previous entries
for i in range(12):
for _ in range(12):
a = [torch.ones(3, 4, device=device)]
pg.broadcast(a).wait()
torch.cuda.synchronize(device=device)
@ -4611,7 +4603,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
th.start()
# fill the CUDA buffer; at around 1024 events
# this will stall
for i in range(2000):
for _ in range(2000):
a = a + a
th.join()
else:
@ -4646,7 +4638,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
num_coalesced_ops = 20
ops_per_coalesce = len(op_sizes_per_coalesce)
for i in range(num_coalesced_ops):
for _ in range(num_coalesced_ops):
ops = []
for input_sizes in op_sizes_per_coalesce:
tensor = torch.zeros(input_sizes).to(self.local_device)
@ -4745,7 +4737,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
pg._enable_collectives_timing()
num_repeats = 10
ops_per_repeat = len(op_sizes)
for i in range(num_repeats):
for _ in range(num_repeats):
for input_sizes in op_sizes:
tensor = torch.zeros(input_sizes).to(self.local_device)
if self.rank == 0:
@ -5047,7 +5039,7 @@ class NcclErrorDumpTest(NCCLTraceTestBase):
# Block the current stream on the NCCL stream
work.wait()
# Run some GPU operations
a = torch.rand(10).cuda(self.rank)
torch.rand(10).cuda(self.rank)
elif self.rank == 1:
# Clean up structures (e.g. files for FileStore) before going down
del process_group
@ -5108,7 +5100,6 @@ class ProcessGroupNCCLLargerScaleTest(MultiProcessTestCase):
tensor = torch.full((1,), self.rank).cuda(device)
ng1 = c10d.split_group(pg, [[0, 1], [2, 3, 4, 5, 6, 7]])
backend1 = ng1._get_backend(torch.device(device))
# comm split happens eagerly since device_id is passed to init_process_group.
self.assertEqual(backend.comm_split_count(), 1)

View File

@ -162,7 +162,6 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest):
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
def test_allreduce_ops(self):
device_count = torch.cuda.device_count()
pg = self.pg
local_device_id = self.rank_to_GPU[self.rank][0]
@ -303,9 +302,8 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest):
pg = self.pg
rank = self.rank_to_GPU[self.rank][0]
with torch.cuda.device(rank):
for i in range(10):
for _ in range(10):
xs = [torch.FloatTensor([1]).cuda(rank)]
ys = [torch.FloatTensor([4]).cuda(rank)]
for _ in range(30):
pg.allreduce(xs[0]).wait()
@ -410,7 +408,7 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest):
output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu])
result = allgather(output_tensors, tensors)
allgather(output_tensors, tensors)
# Verification
self.assertEqual(output_tensors, expected_output)
@ -558,7 +556,7 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest):
# init output
output_ts = []
for rank in range(self.world_size):
for _ in range(self.world_size):
output_ts.append(torch.tensor([-1]).cuda(device_id))
with self.assertRaisesRegex(ValueError, "invalid root rank"):
@ -914,7 +912,6 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest):
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
def test_send_recv(self):
pg = self.pg
device = self.rank_to_GPU[self.rank][0]
# Generate the same random tensor
@ -930,7 +927,6 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest):
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
def test_send_recv_complex(self):
pg = self.pg
device = self.rank_to_GPU[self.rank][0]
# Generate the same random tensor

View File

@ -755,7 +755,7 @@ class DistributedDataParallelTest(
ddp_state_dict = torch.load(checkpoint_path, map_location=map_location)
for model in [ddp_withload, model_withload]:
for p in ddp_withload.parameters():
for p in model.parameters():
with torch.no_grad():
p.zero_()
ddp_withload.load_state_dict(ddp_state_dict)

View File

@ -57,7 +57,7 @@ class TestCollectiveUtils(MultiProcessTestCase):
Ensure broadcast has no dependency on torch.distributed when run in a single process.
"""
func = mock.MagicMock()
res = broadcast(data_or_fn=func, rank=0)
broadcast(data_or_fn=func, rank=0)
func.assert_called_once()
def test_broadcast_result_raises_exceptions_from_func(
@ -98,7 +98,7 @@ class TestCollectiveUtils(MultiProcessTestCase):
Ensure all_gather has no dependency on torch.distributed when run in a single process.
"""
func = mock.MagicMock()
res = all_gather(data_or_fn=func)
all_gather(data_or_fn=func)
func.assert_called_once()
def test_all_gather_result_raises_exceptions_from_func(

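The docstrings above spell out the single-process fallback of these helpers: with no process group passed, broadcast and all_gather simply evaluate the callable locally, which is all the two tests assert. A rough, self-contained sketch of that behavior (values are illustrative; only the call count mirrors what the tests check):

from torch.distributed.collective_utils import all_gather, broadcast

calls = []

def make_value():
    calls.append(1)
    return 42

# No process group is initialized; broadcast evaluates the callable for the
# designated rank (0 here) without touching torch.distributed state.
broadcast(data_or_fn=make_value, rank=0)

# all_gather likewise evaluates the callable locally in the single-process case.
all_gather(data_or_fn=make_value)

assert len(calls) == 2  # each helper invoked the callable exactly once
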
View File

@ -791,8 +791,8 @@ class TestDataParallel(TestCase):
),
named_msg,
)
for j, ((param_name, p), p_dp) in enumerate(
zip(m_child.named_parameters(), m_dp_child.parameters())
for (param_name, p), p_dp in zip(
m_child.named_parameters(), m_dp_child.parameters()
):
named_msg = (
layer_name + "." + param_name + " " + iter_msg

View File

@ -88,7 +88,7 @@ class DeviceMeshTest(DTensorTestBase):
def test_assert_invalid_mesh_tensor(self):
mesh = torch.arange(self.world_size).to(self.rank)
with self.assertRaises(ValueError):
device_mesh = DeviceMesh(self.device_type, mesh)
DeviceMesh(self.device_type, mesh)
@with_comms()
def test_2d_mesh_non_eager_init_subgroup(self):
@ -144,7 +144,7 @@ class DeviceMeshTest(DTensorTestBase):
RuntimeError,
"Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.",
):
local_rank = mesh_2d.get_local_rank()
mesh_2d.get_local_rank()
@with_comms
def test_get_local_rank(self):
@ -258,7 +258,7 @@ class DeviceMeshTest(DTensorTestBase):
):
# test init_device_mesh with an invalid device type that contains a GPU index
mesh_shape = (2, self.world_size // 2)
mesh_2d = init_device_mesh(
init_device_mesh(
"cuda:0", mesh_shape=mesh_shape, mesh_dim_names=("dp", "tp")
)
@ -453,7 +453,7 @@ class InitDeviceMeshTest(DTensorTestBase):
RuntimeError,
"Each mesh_dim_name must be unique.",
):
mesh = init_device_mesh(
init_device_mesh(
self.device_type,
(2, 4),
mesh_dim_names=["dp", "dp"],
@ -465,7 +465,7 @@ class InitDeviceMeshTest(DTensorTestBase):
RuntimeError,
"mesh_shape and mesh_dim_names should have same length!",
):
mesh = init_device_mesh(
init_device_mesh(
self.device_type,
(8,),
mesh_dim_names=["dp", "tp"],
@ -483,7 +483,7 @@ class TestDeviceMeshGetItem(DTensorTestBase):
RuntimeError, "Cannot slice a DeviceMesh without mesh_dim_names!"
):
mesh = init_device_mesh(self.device_type, (2, 4))
child_mesh = mesh["DP"]
mesh["DP"]
@with_comms
def test_raises_invalid_mesh_dim_name(self):
@ -493,7 +493,7 @@ class TestDeviceMeshGetItem(DTensorTestBase):
mesh = init_device_mesh(
self.device_type, (2, 4), mesh_dim_names=mesh_dim_names
)
child_mesh = mesh[child_mesh_dim_name]
mesh[child_mesh_dim_name]
@with_comms
def test_get_item_2d(self):
@ -514,7 +514,6 @@ class TestDeviceMeshGetItem(DTensorTestBase):
tp_group_idx = self.rank // 4
self.assertEqual(tp_mesh.mesh, pg_ranks_by_dim_name["TP"][tp_group_idx])
dp_mesh = mesh_2d["DP"]
dp_group_idx = self.rank % 4
self.assertEqual(mesh_2d["DP"].mesh, pg_ranks_by_dim_name["DP"][dp_group_idx])
@ -564,17 +563,15 @@ class TestDeviceMeshGetItem(DTensorTestBase):
def test_cache_and_reuse_submesh_slice_result(self):
mesh = init_device_mesh(self.device_type, (2, 4), mesh_dim_names=("dp", "tp"))
dp_mesh = mesh["dp"]
ref_pg_count = _world.group_count
# When we call the "dp" slice a second time, it should not create any new pg.
# Since we just reuse the cached result, the pg count should stay the same.
dp_mesh_2 = mesh["dp"]
self.assertEqual(ref_pg_count, _world.group_count)
# When we call the "tp" slice, it should not create a new pg, as the "tp" slice would
# just reuse the parent mesh pg.
tp_mesh = mesh["tp"]
mesh["tp"]
self.assertEqual(_world.group_count, ref_pg_count)
@with_comms
@ -603,7 +600,7 @@ class TestDeviceMeshGetItem(DTensorTestBase):
KeyError,
"Invalid mesh_dim_names",
):
cp_dp_mesh = mesh_3d["cp", "dp"]
mesh_3d["cp", "dp"]
@with_comms
def test_flatten_mesh_3d(self):
@ -767,9 +764,9 @@ class TestMeshEnv(DTensorTestBase):
)
with FakeTensorMode():
dp_mesh = mesh_2d["DP"]
tp_mesh = mesh_2d["TP"]
dp_tp_mesh = mesh_2d["DP", "TP"]
mesh_2d["DP"]
mesh_2d["TP"]
mesh_2d["DP", "TP"]
class DeviceMeshCollectiveTest(DTensorTestBase):

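The comments in test_cache_and_reuse_submesh_slice_result above state the key property: repeated mesh slices reuse cached sub-meshes rather than creating new process groups. A rough sketch of that behavior, assuming a distributed job with 8 ranks is already running (e.g. launched via torchrun) so init_device_mesh can form the 2x4 mesh; the shapes below are illustrative:

from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.distributed_c10d import _world

# 2 "dp" groups of 4 "tp" ranks each, over 8 ranks total.
mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))

mesh["dp"]                       # first slice; any groups it needs exist now
ref_pg_count = _world.group_count
mesh["dp"]                       # cached result: no new process group
mesh["tp"]                       # reuses already-created groups as well
assert _world.group_count == ref_pg_count
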
View File

@ -421,7 +421,7 @@ class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
self.weight2 = nn.Parameter(torch.randn(512, 512))
def forward(self, x, y):
u0, u1 = y.tolist()
u0, _ = y.tolist()
x = torch.cat([x, x])
y = x @ self.weight1
z = (x + y @ self.weight2) * u0
@ -442,7 +442,7 @@ class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
self.weight2 = nn.Parameter(torch.randn(512, 512))
def forward(self, x, y):
u0, u1 = y.tolist()
u0, _ = y.tolist()
a = torch.ones(u0)
x = torch.cat([x, x])
y = x @ self.weight1
@ -466,7 +466,7 @@ class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
def forward(self, x, y):
# partition one (contains the u0 def)
u0, u1 = y.tolist()
u0, _ = y.tolist()
x = torch.cat([x, x])
y1 = x @ self.weight1
# partition two (contains the variable)
@ -511,7 +511,7 @@ class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
):
super().__init__()
layers = []
for l in range(2):
for _ in range(2):
layer = nn.ModuleList(
[
nn.LayerNorm(96),
@ -529,7 +529,7 @@ class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
for m in self.layers:
x = x.reshape(B * F, T, H)
x = m[0](x)
x, attn = m[1].forward(x, x, x)
x, _ = m[1].forward(x, x, x)
x = x.reshape(B, F, T, H)
return x
@ -937,8 +937,8 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
@torch.compile()
def f(x, y):
zx = x.shape
zy = y.shape
zx = x.shape # noqa: F841
zy = y.shape # noqa: F841
return x.sum() + y.sum()
if self.rank == 0:
@ -967,10 +967,10 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
@torch.compile()
def f(x, y):
z = y
z = y # noqa: F841
print("woof")
zx = x.shape
zy = y.shape
zx = x.shape # noqa: F841
zy = y.shape # noqa: F841
return x.sum() + y.sum()
if self.rank == 0:
@ -999,8 +999,8 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
@torch.compile()
def f(x, y):
zx = x.shape
zy = y.shape
zx = x.shape # noqa: F841
zy = y.shape # noqa: F841
return x.sum() + y.sum()
if self.rank == 0:
@ -1405,7 +1405,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
model = DDP(model, device_ids=self.device_ids)
hidden_states = torch.randn(B, S, H * D).to(device)
attention_scores = model(hidden_states)
model(hidden_states)
torch.cuda.synchronize()
@patch.object(config, "optimize_ddp", True)
@ -1461,7 +1461,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
model = DDP(model, device_ids=self.device_ids)
hidden_states = torch.randn(B, S, H * D).to(device)
attention_scores = model(hidden_states)
model(hidden_states)
torch.cuda.synchronize()
@patch.object(config, "optimize_ddp", True)
@ -1723,7 +1723,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
def test_fsdp_orig_params_assert(self):
# Test with basic FSDP wrapping (outer wrap around whole model)
m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
m, inputs, _ = get_model(f"cuda:{self.rank}")
fsdp_m = FSDP(m, use_orig_params=False)
fsdp_m = torch.compile(fsdp_m)
self.assertRaisesRegex(

View File

@ -130,7 +130,7 @@ class TestExpand(MultiThreadedTestCase):
tag, rankset, group_size = ft_c._expand_group(dist.group.WORLD, "bla")
self.assertEqual("bla", tag)
my_pg, others = new_subgroups(group_size=2)
my_pg, _ = new_subgroups(group_size=2)
tag, rankset, group_size = ft_c._expand_group(my_pg)
self.assertEqual(c10d._get_group_tag(my_pg), tag)
self.assertEqual(dist.get_process_group_ranks(my_pg), rankset)
@ -588,7 +588,7 @@ class TestCollectivesWithDistributedBackend(DistributedTestBase):
def allreduce(t, pg):
return ft_c.all_reduce(t, "sum", pg)
compiled_allreduce = torch.compile(allreduce, fullgraph=True)
compiled_allreduce = torch.compile(allreduce, fullgraph=True) # noqa: F841
dist.init_process_group(
backend="fake",
rank=0,
@ -615,9 +615,7 @@ class TestCollectivesWithDistributedBackend(DistributedTestBase):
return batch * 5
compiled_func = torch.compile(func)
ret = compiled_func(
torch.ones((100,), device=device), self.process_group, self.rank
)
compiled_func(torch.ones((100,), device=device), self.process_group, self.rank)
dist.barrier()
@ -715,7 +713,7 @@ class TestFunctionalAutograd(MultiThreadedTestCase):
out = compiled(t, self.world_size)
out.backward()
res, codes = run_and_get_code(run_with_backward)
_, codes = run_and_get_code(run_with_backward)
for code in codes:
FileCheck().check_count(
"_c10d_functional.all_to_all_single.default", 1, exactly=True

View File

@ -411,7 +411,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
y = self.emb(x)
last_dim = y.dim() - 1
y = y.transpose_(0, last_dim).contiguous()
res = _functional_collectives.all_gather_tensor(y, 0, ranks, tag)
_functional_collectives.all_gather_tensor(y, 0, ranks, tag)
out = y.transpose_(0, last_dim).contiguous()
return out

View File

@ -35,7 +35,6 @@ class TestDistributedLaunch(TestCase):
def test_launch_user_script(self):
nnodes = 1
nproc_per_node = 4
world_size = nnodes * nproc_per_node
sock = get_socket_with_port()
with closing(sock):
master_port = sock.getsockname()[1]

View File

@ -553,7 +553,7 @@ class LibUvTCPStoreTest(TCPStoreTest):
)
with self.assertRaisesRegex(NotImplementedError, err_msg_reg):
store = dist.TCPStore(
dist.TCPStore(
addr,
port,
1,
@ -748,7 +748,7 @@ class RendezvousTCPTest(TestCase):
url = self.create_tcp_url()
test_store_timeout = timedelta(seconds=0.1)
gen0 = dist.rendezvous(url + "&rank=0", timeout=timedelta(seconds=10))
store0, rank0, size0 = next(gen0)
store0, _, _ = next(gen0)
store0.set_timeout(test_store_timeout)
# this should time out in 0.1s. If the timeout passed into rendezvous was
# not respected, it will take much longer to time out.
@ -766,7 +766,7 @@ class RendezvousTCPTest(TestCase):
url = self.create_tcp_url()
test_store_timeout = timedelta(seconds=0.1)
gen0 = dist.rendezvous(url + "&rank=0", timeout=timedelta(seconds=10))
store0, rank0, size0 = next(gen0)
store0, _, _ = next(gen0)
store0.set_timeout(test_store_timeout)
# this should time out in 10s. If the timeout passed into rendezvous was
# not respected, it will take much longer to time out.
@ -787,7 +787,7 @@ class RendezvousTCPTest(TestCase):
def test_tcp_store_url_with_libuv(self):
url = self.create_tcp_url()
gen0 = dist.rendezvous(url + "&rank=0&use_libuv=1")
store0, rank0, size0 = next(gen0)
store0, _, _ = next(gen0)
self.assertTrue(store0.libuvBackend)
@ -1078,7 +1078,7 @@ class TestClientProtocol(TestCase):
thread = threading.Thread(target=listen)
thread.start()
store = dist.TCPStore(
dist.TCPStore(
host_name="localhost",
port=port,
world_size=2,

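The rendezvous tests above depend on one behavior: a store produced by dist.rendezvous honors a timeout set afterwards via set_timeout, independently of the rendezvous timeout itself. A rough single-rank sketch of that interaction; the address, port, and key name are illustrative:

from datetime import timedelta

import torch.distributed as dist

url = "tcp://127.0.0.1:29501?world_size=1"  # illustrative address
gen = dist.rendezvous(url + "&rank=0", timeout=timedelta(seconds=10))
store, _, _ = next(gen)

# Tighten the store timeout after rendezvous; waiting on a key that is never
# set should now fail after ~0.1s rather than the 10s rendezvous timeout.
store.set_timeout(timedelta(seconds=0.1))
try:
    store.get("never_set_key")
except RuntimeError:
    pass  # expected: the 0.1s store timeout elapsed
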
View File

@ -332,7 +332,6 @@ class SymmetricMemoryTest(MultiProcessTestCase):
K = 32
group = dist.group.WORLD
rank = self.rank
world_size = self.world_size
torch.manual_seed(42 + rank)
A_shard = torch.rand(BATCH, M // self.world_size, K, device="cuda")
@ -428,7 +427,6 @@ class SymmetricMemoryTest(MultiProcessTestCase):
K = 32
group = dist.group.WORLD
rank = self.rank
world_size = self.world_size
if gather_dim == 0:
leading_dims = (BATCH // self.world_size, M)
@ -513,7 +511,6 @@ class SymmetricMemoryTest(MultiProcessTestCase):
K = 32
group = dist.group.WORLD
rank = self.rank
world_size = self.world_size
torch.manual_seed(42 + rank)
A = torch.rand(BATCH, M, K, device="cuda")
@ -546,7 +543,6 @@ class SymmetricMemoryTest(MultiProcessTestCase):
K = 32
group = dist.group.WORLD
rank = self.rank
world_size = self.world_size
torch.manual_seed(42 + rank)
A = torch.rand(BATCH, M, K, device="cuda").to(torch.float8_e4m3fn)

View File

@ -1314,7 +1314,7 @@ class TestDistributions(DistributionsTestCase):
if not msk.all():
counts = np.concatenate([counts[msk], np.sum(counts[~msk], keepdims=True)])
pmf = np.concatenate([pmf[msk], np.sum(pmf[~msk], keepdims=True)])
chisq, p = scipy.stats.chisquare(counts, pmf * num_samples)
_, p = scipy.stats.chisquare(counts, pmf * num_samples)
self.assertGreater(p, failure_rate, message)
def _check_enumerate_support(self, dist, examples):
@ -1912,9 +1912,7 @@ class TestDistributions(DistributionsTestCase):
@set_default_dtype(torch.double)
def test_one_hot_categorical_2d(self):
probabilities = [[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]]
probabilities_1 = [[1.0, 0.0], [0.0, 1.0]]
p = torch.tensor(probabilities, requires_grad=True)
s = torch.tensor(probabilities_1, requires_grad=True)
self.assertEqual(OneHotCategorical(p).sample().size(), (2, 3))
self.assertEqual(
OneHotCategorical(p).sample(sample_shape=(3, 4)).size(), (3, 4, 2, 3)
@ -2074,13 +2072,11 @@ class TestDistributions(DistributionsTestCase):
@set_default_dtype(torch.double)
def test_relaxed_one_hot_categorical_2d(self):
probabilities = [[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]]
probabilities_1 = [[1.0, 0.0], [0.0, 1.0]]
temp = torch.tensor([3.0], requires_grad=True)
# The lower the temperature, the more unstable the log_prob gradcheck is
# w.r.t. the sample. Values below 0.25 empirically fail the default tol.
temp_2 = torch.tensor([0.25], requires_grad=True)
p = torch.tensor(probabilities, requires_grad=True)
s = torch.tensor(probabilities_1, requires_grad=True)
self.assertEqual(RelaxedOneHotCategorical(temp, p).sample().size(), (2, 3))
self.assertEqual(
RelaxedOneHotCategorical(temp, p).sample(sample_shape=(3, 4)).size(),
@ -3939,7 +3935,7 @@ class TestDistributions(DistributionsTestCase):
for dim in range(2, 5):
log_probs = []
lkj = LKJCholesky(dim, concentration=1.0, validate_args=True)
for i in range(2):
for _ in range(2):
sample = lkj.sample()
sample_tril = tril_matrix_to_vec(sample, diag=-1)
log_prob = lkj.log_prob(sample)
@ -6241,7 +6237,7 @@ class TestLazyLogitsInitialization(DistributionsTestCase):
except NotImplementedError:
pass
self.assertNotIn("probs", dist.__dict__, msg=message)
batch_shape, event_shape = dist.batch_shape, dist.event_shape
dist.batch_shape, dist.event_shape
self.assertNotIn("probs", dist.__dict__, msg=message)
def test_lazy_probs_initialization(self):
@ -6258,7 +6254,7 @@ class TestLazyLogitsInitialization(DistributionsTestCase):
except NotImplementedError:
pass
self.assertNotIn("logits", dist.__dict__, msg=message)
batch_shape, event_shape = dist.batch_shape, dist.event_shape
dist.batch_shape, dist.event_shape
self.assertNotIn("logits", dist.__dict__, msg=message)
@ -6565,6 +6561,7 @@ class TestFunctors(DistributionsTestCase):
expected_jac = sum(
[t1.log_abs_det_jacobian(x1, y1), t2.log_abs_det_jacobian(x2, y2)]
)
self.assertEqual(actual_jac, expected_jac)
def test_stack_transform(self):
x1 = -1 * torch.arange(1, 101, dtype=torch.float)
@ -6628,18 +6625,18 @@ class TestValidation(DistributionsTestCase):
for v in torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]):
# samples with an incorrect shape must raise only a ValueError
try:
log_prob = d_val.log_prob(v)
d_val.log_prob(v)
except ValueError:
pass
# get sample of correct shape
val = torch.full(d_val.batch_shape + d_val.event_shape, v)
# check samples with incorrect support
try:
log_prob = d_val.log_prob(val)
d_val.log_prob(val)
except ValueError as e:
if e.args and "must be within the support" in e.args[0]:
try:
log_prob = d_nonval.log_prob(val)
d_nonval.log_prob(val)
except RuntimeError:
pass

View File

@ -1260,7 +1260,7 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
super().__init__()
def forward(self, x, ys):
a = torch.sin(x)
a = torch.sin(x) # noqa: F841
b = torch.cos(ys[0])
c = torch.cos(ys[1])
return (x, [b, c])

View File

@ -453,7 +453,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
a = torch.randn(3, 3, requires_grad=True)
b = torch.randn(3, 3, requires_grad=True)
a1, a2 = a.clone(), a.clone()
b1, b2 = b.clone(), b.clone()
_, b2 = b.clone(), b.clone()
failure_reason = None
@ -481,7 +481,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
c = torch.randn(3, 3, requires_grad=True)
d = torch.randn(3, 3, requires_grad=True)
c3, c4 = c.clone(), c.clone()
d3, d4 = d.clone(), d.clone()
_, d4 = d.clone(), d.clone()
f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
f(c3, c3, 3, 3)
@ -507,7 +507,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
b = torch.randn(3, 3, requires_grad=True)
z = a
a1, a2 = a.clone(), a.clone()
b1, b2 = b.clone(), b.clone()
_, b2 = b.clone(), b.clone()
failure_reason = None
@ -543,7 +543,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
a = torch.randn(3, 3, requires_grad=True)
b = torch.randn(3, 3, requires_grad=True)
a1, a2 = a.clone(), a.clone()
b1, b2 = b.clone(), b.clone()
_, b2 = b.clone(), b.clone()
failure_reason = None
@ -571,7 +571,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
c = torch.randn(3, 3, requires_grad=True)
d = torch.randn(3, 3, requires_grad=True)
c3, c4 = c.clone(), c.clone()
d3, d4 = d.clone(), d.clone()
_, d4 = d.clone(), d.clone()
f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
f([3, 2, 1], [4, 5, 6], c3, c3)
@ -593,7 +593,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
a = torch.randn(3, 3, requires_grad=True)
b = torch.randn(3, 3, requires_grad=True)
a1, a2 = a.clone(), a.clone()
b1, b2 = b.clone(), b.clone()
_, b2 = b.clone(), b.clone()
failure_reason = None
@ -621,7 +621,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
c = torch.randn(3, 3, requires_grad=True)
d = torch.randn(3, 3, requires_grad=True)
c3, c4 = c.clone(), c.clone()
d3, d4 = d.clone(), d.clone()
_, d4 = d.clone(), d.clone()
f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
f(c3, c3)
@ -642,7 +642,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
a = torch.randn(3, 3, requires_grad=True)
b = torch.randn(3, 3, requires_grad=True)
a1, a2, a3, a4 = a.clone(), a.clone(), a.clone(), a.clone()
b1, b2, b3, b4 = b.clone(), b.clone(), b.clone(), b.clone()
_, b2, b3, b4 = b.clone(), b.clone(), b.clone(), b.clone()
failure_reason = None
@ -670,7 +670,7 @@ class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
c = torch.randn(3, 3, requires_grad=True)
d = torch.randn(3, 3, requires_grad=True)
c3, c4 = c.clone(), c.clone()
d3, d4 = d.clone(), d.clone()
_, d4 = d.clone(), d.clone()
f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
f(a3, b3, c3, c3)
@ -1017,7 +1017,7 @@ SeqNr|OrigAten|SrcFn|FwdSrcFn
activities=[torch.profiler.ProfilerActivity.CPU],
record_shapes=True,
) as kineto_prof:
res = model_instance(*args)
model_instance(*args)
bwd_set = set()
prof_str = "SeqNr|Thread|FwdThread|Name\n"
for event in kineto_prof.events():
@ -1191,7 +1191,7 @@ SeqNr|OrigAten|SrcFn|FwdSrcFn
x = torch.randn(3, requires_grad=True)
with self.assertRaisesRegex(RuntimeError, "Cannot access data pointer"):
y = torch.compile(f, backend="aot_eager", fullgraph=True)(x)
torch.compile(f, backend="aot_eager", fullgraph=True)(x)
self.assertTrue(backward_called)
# We don't know how to catch multiple mutations to the same memory location

View File

@ -157,7 +157,7 @@ class AOTAutogradCacheTests(InductorTestCase):
with torch.autograd._force_original_view_tracking(True):
compiled_fn = torch.compile(fn)
out = compiled_fn(torch.rand(2, 3))
compiled_fn(torch.rand(2, 3))
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], 1)
@ -654,7 +654,7 @@ class AOTAutogradCachePicklerTests(torch._dynamo.test_case.TestCase):
def fn(x):
return x.sin().cos()
def fn2(x):
def fn2(x): # noqa: F841
y = x.sin()
z = y.cos()
return z

View File

@ -760,7 +760,7 @@ class GraphModule(torch.nn.Module):
def backward(ctx, gO):
return torch.tensor(float("nan")).expand(10, 10)
def run_fn(a):
def run_fn(a): # noqa: F841
out = MyFunc2.apply(a)
return out.sum()
@ -837,11 +837,11 @@ class GraphModule(torch.nn.Module):
x = torch.randn(5, 5, requires_grad=True)
y = torch.randn(5, 5, requires_grad=True)
q, p = Identity.apply(x, y)
Identity.apply(x, y)
a = torch.rand(1, 2)
b = torch.rand(1, requires_grad=True)
view_a = MyFn.apply(a)
MyFn.apply(a)
a = torch.ones(2, requires_grad=True)
b = torch.ones(2, requires_grad=True)
@ -860,7 +860,7 @@ class GraphModule(torch.nn.Module):
MyFn2.apply(c, d)
base = torch.rand(10, requires_grad=True)
foo = MyFn3.apply(base, False)
MyFn3.apply(base, False)
test()
opt_test = torch.compile(test, backend="eager")

View File

@ -267,9 +267,8 @@ class TestCustomBackendAPI(torch._dynamo.test_case.TestCase):
self.assertTrue(backend_run)
def test_lookup_backend(self):
from torch._dynamo import list_backends, lookup_backend
from torch._dynamo import lookup_backend
backends = list_backends()
backend_run = False
def my_compiler(gm, example_inputs):

View File

@ -247,8 +247,6 @@ class GraphModule(torch.nn.Module):
with compiled_autograd._enable(compiler_fn):
out.backward(grad_out)
graph = None
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

View File

@ -518,7 +518,7 @@ def fn():
insts = bytecode_transformation.bytecode_from_template(fn, noprefix=False)
self.assertEqual(insts[-1].opname, "NOP")
insts_i = 0
for i, inst in enumerate(dis_insts):
for inst in dis_insts:
if inst.opname == "RETURN_CONST":
self.assertEqual(insts[insts_i].opname, "LOAD_CONST")
insts_i += 1
@ -538,7 +538,7 @@ def fn():
x = x + 1
except NotImplementedError:
x = x + 1
except Exception as e:
except Exception:
x = x + 1
return x

View File

@ -43,7 +43,7 @@ class TestCompilerBisector(TestCase):
return lib
def test_bad_decomp(self):
mod = import_module("torch._inductor.compile_fx")
import_module("torch._inductor.compile_fx")
def bad_exp_decomp(self, rate=1, generator=None):
assert generator is None
@ -86,7 +86,7 @@ class TestCompilerBisector(TestCase):
vq_compiled = torch.compile(vq)
x = torch.randn(4, 400, 256).cuda()
with torch._dynamo.utils.preserve_rng_state():
out = vq(x)
vq(x)
out_compiled = vq_compiled(x)
return not out_compiled.isnan().any()
@ -150,7 +150,6 @@ class TestCompilerBisector(TestCase):
self.assertTrue("inductor_fallback_random" in out.debug_info)
def test_crossref(self):
test_ns = "bisect_ops"
with _scoped_library(self.test_ns, "FRAGMENT") as lib:
lib.define("foo(Tensor x) -> Tensor")
op = self.get_op("foo")

View File

@ -117,7 +117,7 @@ def forward(self, L_x_ : torch.Tensor):
return y + 3
def munge_disas(s):
def munge_disas(s): # noqa: F841
re.sub(
r"^(?: +\d+)?(?: +(-->)) \+\d+ ([A-Za-z0-9_]+)",
"\1 \3",
@ -271,7 +271,7 @@ y = FakeTensor(..., size=(2,))
y = g(y)
return y + 3
def munge_filenames(s):
def munge_filenames(s): # noqa: F841
return re.sub(r'File "[^"]+", line \d+', 'File "X", line X', s)
f(torch.randn(2))
@ -389,7 +389,7 @@ y = FakeTensor(..., size=(2,))
@torch.compile(backend=cnt)
def f(x):
y = x * 2
lit = 2
lit = 2 # noqa: F841
@comptime
def _(ctx):

View File

@ -268,15 +268,13 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
cur_stream.wait_stream(new_stream)
x = torch.add(x, 4)
is_idle = cur_stream.query()
cur_stream.query()
cur_stream.synchronize()
with torch.cuda.stream(new_stream):
x = torch.add(x, 5)
new_stream.synchronize()
is_equal = cur_stream == new_stream
x = torch.relu(x)
x = torch.cos(x)
return x
@ -439,7 +437,7 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
x = torch.add(x, 3)
event = cur_stream.record_event()
is_idle = event.query()
event.query()
new_stream.wait_event(event)
with torch.cuda.stream(new_stream):
@ -481,7 +479,7 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
x = torch.add(x, 3)
event = cur_stream.record_event()
is_idle = event.query()
event.query()
new_stream.wait_event(event)
with torch.cuda.stream(new_stream):
@ -567,7 +565,7 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
real_device = real.device
real_dtype = real.dtype
graph, guards = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
graph, _ = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
exported = graph(torch.tensor([0.5]))
self.assertEqual(exported.device, real_device)
self.assertEqual(exported.dtype, real_dtype)
@ -676,7 +674,7 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
real_device = real.device
real_dtype = real.dtype
graph, guards = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
graph, _ = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
exported = graph(torch.tensor([0.5]))
self.assertEqual(exported.device, real_device)
self.assertEqual(exported.dtype, real_dtype)
@ -850,7 +848,7 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
real_device = real.device
real_dtype = real.dtype
graph, guards = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
graph, _ = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
exported = graph(torch.tensor([0.5]))
self.assertEqual(exported.device, real_device)
self.assertEqual(exported.dtype, real_dtype)
@ -876,7 +874,7 @@ class CtxManagerTests(torch._dynamo.test_case.TestCase):
real_device = real.device
real_dtype = real.dtype
graph, guards = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
graph, _ = torch._dynamo.export(module)(torch.tensor([[0.0, 0], [0, 0]]))
exported = graph(torch.tensor([0.5]))
self.assertEqual(exported.device, real_device)
self.assertEqual(exported.dtype, real_dtype)
@ -1297,7 +1295,7 @@ class GraphModule(torch.nn.Module):
eager = EagerAndRecordGraphs()
torch.compile(fn, backend=eager, fullgraph=False)(torch.randn(()))
def check_graph(actual, expected):
def check_graph(actual, expected): # noqa: F841
self.assertExpectedInline(actual, expected)
graph = eager.graphs[0]
@ -1342,7 +1340,7 @@ class GraphModule(torch.nn.Module):
for i in range(2):
torch._dynamo.reset()
ctx_wrapper, mode = ctx_wrappers[i]
ctx_wrapper, _ = ctx_wrappers[i]
ctx_wrapper_inverse, mode_inverse = ctx_wrappers[(i + 1) % 2]
def fn(x):
@ -1373,7 +1371,7 @@ class GraphModule(torch.nn.Module):
for i in range(2):
torch._dynamo.reset()
ctx_wrapper, mode = ctx_wrappers[i]
ctx_wrapper, _ = ctx_wrappers[i]
ctx_wrapper_inverse, mode_inverse = ctx_wrappers[(i + 1) % 2]
def fn(x):

View File

@ -63,7 +63,7 @@ class TestAotCudagraphs(torch._dynamo.test_case.TestCase):
@torch.compile(backend="cudagraphs")
def fn(x, y):
for i in range(N_ITERS):
for _ in range(N_ITERS):
loss = model(x, y).sum()
loss.backward()
@ -80,7 +80,7 @@ class TestAotCudagraphs(torch._dynamo.test_case.TestCase):
@torch.compile(backend="cudagraphs")
def fn(x, y):
for i in range(N_ITERS):
for _ in range(N_ITERS):
loss = model(x, y).sum()
loss.backward()
@ -96,7 +96,7 @@ class TestAotCudagraphs(torch._dynamo.test_case.TestCase):
@torch.compile(backend="cudagraphs")
def fn(x, y):
for i in range(N_ITERS):
for _ in range(N_ITERS):
loss = model(x, y).sum()
loss.backward()

View File

@ -45,7 +45,7 @@ def forward(self, x_1):
""", # NOQA: B950
)
fp64_model, fp64_examples = debug_utils.cast_to_fp64(fx, (x,))
_, fp64_examples = debug_utils.cast_to_fp64(fx, (x,))
self.assertEqual(fp64_examples, (x.to(torch.float64),))
self.assertExpectedInline(
@ -79,7 +79,7 @@ def forward(self, x_1):
_tensor_constant0
)
_tensor_constant0 = None
index: "f32[6144, 4190]" = torch.ops.aten.index.Tensor(
index: "f32[6144, 4190]" = torch.ops.aten.index.Tensor( # noqa: F841
primals_48, [None, lift_fresh_copy]
)
lift_fresh_copy = None

View File

@ -83,7 +83,7 @@ class DecoratorTests(torch._dynamo.test_case.TestCase):
# This behavior is not ideal, but supporting it would add overhead
# to callsites of eval_frame.innermost_fn. A warning would also be very noisy.
w = torch._dynamo.disable(fn=wrapper, recursive=True)
torch._dynamo.disable(fn=wrapper, recursive=True)
def test_disable_nn_modules_forward_hook(self):
class SimpleLinear(torch.nn.Module):
@ -543,7 +543,7 @@ class DecoratorTests(torch._dynamo.test_case.TestCase):
return v1, v2, v3, v4, v5, v6, v7, v8, v9
a, b, c = A(), B(), C()
v1, v2, v3, v4, v5, v6, v7, v8, v9 = fn(a, b, c)
v1, v2, v3, v4, v5, _, v7, v8, v9 = fn(a, b, c)
self.assertEqual(v1, (A, 1))
self.assertEqual(v2, (A, 2))

View File

@ -92,7 +92,7 @@ from user code:
raise NotImplementedError
# Ensure graph break is not possible
for i in range(3):
for _ in range(3):
comptime(f)
torch.compile(fn001, backend="eager")(torch.randn(1))

Some files were not shown because too many files have changed in this diff.