[BE][2/6] fix typos in test/ (test/test_*.py) (#157636)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157636
Approved by: https://github.com/yewentao256, https://github.com/mlazos
ghstack dependencies: #156311, #156609
parent ffe11b2bf2
commit fc0376e8b1
@@ -1167,7 +1167,6 @@ exclude_patterns = [
 'aten/src/ATen/native/[a-pA-P]*/**',
 'aten/src/ATen/[a-mA-M]*/**',
 'test/**',
-'test/test_*',
 'test/[a-hA-h]*/**',
 'test/distributed/**',
 'torch/_*/**',
@@ -4129,7 +4129,7 @@ class TestAutograd(TestCase):
 self.assertIsNone(y.grad_fn)

 def test_backward_copy(self):
-# This tests checks backward engine for a very subtle bug that appreared
+# This tests checks backward engine for a very subtle bug that appeared
 # in one of the initial versions of autograd. Gradients tensors were
 # simply stored in lists while the function waited for all its gradients
 # to be computed. However, sometimes an output was used multiple times,
@@ -4312,7 +4312,7 @@ class TestAutograd(TestCase):
 ctx.output_var.sum().backward()
 return ctx.x.grad * grad_output

-# Reentrant starts on CPU thread, finishs on GPU thread
+# Reentrant starts on CPU thread, finishes on GPU thread
 x = torch.randn(2, 2, requires_grad=True)
 out = Reenter.apply(x)
 out.sum().backward()
@@ -10728,7 +10728,7 @@ class TestAutogradForwardMode(TestCase):
 dual = fwAD.make_dual(foo, tangent)
 self.assertFalse(tangent_ref.expired())

-# Make sure that the tangent we provided has been re-used as is
+# Make sure that the tangent we provided has been reused as is
 self.assertTrue(fwAD.unpack_dual(dual)[1] is tangent)

 # Make sure that dual is keeping the tangent alive
@@ -11087,7 +11087,7 @@ class TestAutogradForwardMode(TestCase):
 self.assertEqual(
 dual_tangent.storage().data_ptr(), bar.storage().data_ptr()
 )
-# And the tangent is actually re-used as-is so it is still the same Tensor
+# And the tangent is actually reused as-is so it is still the same Tensor
 self.assertIs(dual_tangent, bar)

 # Ensure we properly share the version counter
@@ -11969,19 +11969,19 @@ class TestAutogradDeviceType(TestCase):
 (new_param**2).sum().backward()
 return grad_output

-# Reentrant starts on GPU thread, finishs on GPU thread
+# Reentrant starts on GPU thread, finishes on GPU thread
 x = torch.randn(2, 2, device=device, requires_grad=True)
 out = ReentrantFunc.apply(x)
 out.sum().backward()

-# Reentrant starts on CPU thread, finishs on GPU thread
+# Reentrant starts on CPU thread, finishes on GPU thread
 x = torch.randn(2, 2, requires_grad=True)
 # set ReentrantFunc node to GPU to emit tasks to GPU queue
 ReentrantFunc._cpu_mode = False
 out = ReentrantFunc.apply(x)
 out.sum().backward()

-# Reentrant starts on GPU thread, finishs on CPU thread
+# Reentrant starts on GPU thread, finishes on CPU thread
 x = torch.randn(2, 2, device=device, requires_grad=True)
 # set ReentrantFunc node to CPU to emit tasks to CPU queue
 ReentrantFunc._cpu_mode = True
@@ -13665,7 +13665,7 @@ class TestMultithreadAutograd(TestCase):
 y = x * x
 if torch.cuda.device_count() >= 2:
 # DataParallel is calling the forward in different threads
-# without progating TLS, so hooks should not be called here
+# without propagating TLS, so hooks should not be called here
 _self.assertEqual(len(w), 0)
 else:
 # DataParallel only uses one thread
@@ -79,7 +79,7 @@ if TEST_SCIPY:
 class TestBinaryUfuncs(TestCase):
 # Generic tests for elementwise binary (AKA binary universal (u) functions (funcs))
 # TODO: below contiguous tensor results are compared with a variety of noncontiguous results.
-# It would be interesting to have the lhs and rhs have different discontiguities.
+# It would be interesting to have the lhs and rhs have different discontinuities.

 # Helper for comparing torch tensors and NumPy arrays
 # TODO: should this or assertEqual also validate that strides are equal?
@@ -2521,7 +2521,7 @@ class TestBinaryUfuncs(TestCase):
 # Verify Value
 self.assertEqual(torch_result, expected)
 # Verify Sign
-# Use double copysign to verify the correctnes of 0.0 and -0.0, since
+# Use double copysign to verify the correctness of 0.0 and -0.0, since
 # it always True for self.assertEqual(0.0 == -0.0). So, we use 1 as the
 # magnitude to verify the sign between torch and numpy results, elementwise.
 # Special case: NaN conversions between FP32 and FP16 is not bitwise
@@ -1031,7 +1031,7 @@ class TestCppExtensionJIT(common.TestCase):
 t = torch.rand(2).double()
 cpp_tensor_name = r"CPUDoubleType"

-# Without error handling, the warnings cannot be catched
+# Without error handling, the warnings cannot be caught
 warn_mod = torch.utils.cpp_extension.load_inline(
 name="warn_mod",
 cpp_sources=[source],
@@ -1065,23 +1065,23 @@ class TestCppExtensionJIT(common.TestCase):
 )

 with warnings.catch_warnings(record=True) as w:
-# Catched with no error should be detected
+# Caught with no error should be detected
 warn_mod.foo(t, 0)
 self.assertEqual(len(w), 1)

-# Catched with cpp error should also be detected
+# Caught with cpp error should also be detected
 with self.assertRaisesRegex(TypeError, t.type()):
 warn_mod.foo(t, 1)
 self.assertEqual(len(w), 2)

-# Catched with python error should also be detected
+# Caught with python error should also be detected
 with self.assertRaisesRegex(
 SystemError, "bad argument to internal function"
 ):
 warn_mod.foo(t, 2)
 self.assertEqual(len(w), 3)

-# Catched with pybind error should also be detected
+# Caught with pybind error should also be detected
 # Note that there is no type name translation for pybind errors
 with self.assertRaisesRegex(KeyError, cpp_tensor_name):
 warn_mod.foo(t, 3)
@@ -795,7 +795,7 @@ print(t.is_pinned())
 os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"]
 )
 # this is really just checking that the environment variable is respected during testing
-# and not overwritten by another function that doesn't revert it to the intitial value
+# and not overwritten by another function that doesn't revert it to the initial value
 if not skip_tf32_cublas:
 self.assertFalse(torch.backends.cuda.matmul.allow_tf32)
 self.assertEqual(torch.get_float32_matmul_precision(), "highest")
@@ -1143,7 +1143,7 @@ print(t.is_pinned())
 tmp2 = torch.cuda.FloatTensor(t.size())
 tmp2.zero_()
 self.assertNotEqual(
-tmp2.data_ptr(), ptr[0], msg="allocation re-used to soon"
+tmp2.data_ptr(), ptr[0], msg="allocation reused to soon"
 )

 self.assertEqual(result.tolist(), [1, 2, 3, 4])
@@ -1154,7 +1154,7 @@ print(t.is_pinned())
 torch.cuda.current_stream().synchronize()
 with torch.cuda.stream(stream):
 tmp3 = torch.cuda.FloatTensor(t.size())
-self.assertEqual(tmp3.data_ptr(), ptr[0], msg="allocation not re-used")
+self.assertEqual(tmp3.data_ptr(), ptr[0], msg="allocation not reused")

 def test_record_stream_on_shifted_view(self):
 # See issue #27366
@@ -1235,20 +1235,20 @@ print(t.is_pinned())
 def test_caching_pinned_memory(self):
 cycles_per_ms = get_cycles_per_ms()

-# check that allocations are re-used after deletion
+# check that allocations are reused after deletion
 t = torch.FloatTensor([1]).pin_memory()
 ptr = t.data_ptr()
 del t
 t = torch.FloatTensor([1]).pin_memory()
 self.assertEqual(t.data_ptr(), ptr, msg="allocation not reused")

-# check that the allocation is not re-used if it's in-use by a copy
+# check that the allocation is not reused if it's in-use by a copy
 gpu_tensor = torch.cuda.FloatTensor([0])
 torch.cuda._sleep(int(1000 * cycles_per_ms)) # delay the copy by 1s
 gpu_tensor.copy_(t, non_blocking=True)
 del t
 t = torch.FloatTensor([1]).pin_memory()
-self.assertNotEqual(t.data_ptr(), ptr, msg="allocation re-used too soon")
+self.assertNotEqual(t.data_ptr(), ptr, msg="allocation reused too soon")
 self.assertEqual(list(gpu_tensor), [1])

 def test_caching_allocator_record_stream_oom(self):
@@ -1263,7 +1263,7 @@ print(t.is_pinned())
 x = torch.empty(40 * 1024 * 1024, device="cuda")
 with torch.cuda.stream(stream):
 y += x
-# delays re-use of `x` until after all operations in `stream`
+# delays reuse of `x` until after all operations in `stream`
 x.record_stream(stream)
 del x

@@ -2970,7 +2970,7 @@ exit(2)
 current = postcapture_stats[stat] - precapture_stats[stat]

 # There will only ever be one expandable segment in each of the small and large pools. The way the
-# bookeeping is done in the allocator means that we never increment the number of segments.
+# bookkeeping is done in the allocator means that we never increment the number of segments.
 if self.expandable_segments and "segment" in stat:
 expected = 0
 # These two cases hit an edge case where the PyTorch allocator won't immediately unmap part of an
@@ -3011,7 +3011,7 @@ exit(2)
 current = postdel_stats[stat] - precapture_stats[stat]

 # There will only ever be one expandable segment in each of the small and large pools. The way the
-# bookeeping is done in the allocator means that we never increment the number of segments.
+# bookkeeping is done in the allocator means that we never increment the number of segments.
 if self.expandable_segments and "segment" in stat:
 expected = 0
 # These two cases hit an edge case where the PyTorch allocator won't immediately unmap part of an
@@ -3648,7 +3648,7 @@ exit(2)
 graph.replay()
 self.assertTrue(torch.all(x == 3.0))

-# Check that graph capture can succeed after reseting.
+# Check that graph capture can succeed after resetting.
 graph.reset()

 # Don't do x[:] = 0.0 because we want to capture a new address
@@ -5382,7 +5382,7 @@ class TestMemPool(TestCase):
 out_2 = torch.randn(nelem_1mb, device="cuda")

 # pool now should have 2 segments since the CUDACachingAllocator had
-# to make a new 2 MB buffer to accomodate out_2
+# to make a new 2 MB buffer to accommodate out_2
 self.assertEqual(len(pool.snapshot()), 2)

 self.assertEqual(len(pool.snapshot()), 2)
@@ -967,7 +967,7 @@ class TestCudaMultiGPU(TestCase):

 @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
 def test_caching_pinned_memory_multi_gpu(self):
-# checks that the events preventing pinned memory from being re-used
+# checks that the events preventing pinned memory from being reused
 # too early are recorded on the correct GPU
 cycles_per_ms = get_cycles_per_ms()

@@ -982,7 +982,7 @@ class TestCudaMultiGPU(TestCase):

 del t
 t = torch.FloatTensor([2]).pin_memory()
-self.assertNotEqual(t.data_ptr(), ptr, msg="allocation re-used too soon")
+self.assertNotEqual(t.data_ptr(), ptr, msg="allocation reused too soon")

 with torch.cuda.device(0):
 gpu_tensor0.copy_(t, non_blocking=True)
@@ -138,7 +138,7 @@ class TestVisibleDeviceParses(TestCase):
 _transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids),
 [1],
 )
-# First ambigous UUID aborts parsing
+# First ambiguous UUID aborts parsing
 self.assertEqual(
 _transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1]
 )
@@ -42,7 +42,7 @@ class TestCudaPrimaryCtx(TestCase):
 self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
 torch.cuda.set_device(0)
 if _get_torch_cuda_version() >= (12, 0):
-# Now after the device was set, the contex should present in CUDA 12.
+# Now after the device was set, the context should present in CUDA 12.
 self.assertTrue(torch._C._cuda_hasPrimaryContext(0))
 else:
 # In CUDA 11 the context should not be created.
@@ -630,7 +630,7 @@ def _(x):
 g(x)

 def test_invalid_schemas(self):
-# function schmea validation goes through torchgen, so this is just a
+# function schema validation goes through torchgen, so this is just a
 # basic test.
 with self.assertRaisesRegex(AssertionError, "Invalid function schema: foo"):
 custom_ops.custom_op(f"{TestCustomOp.test_ns}::foo", "(")
@@ -2712,7 +2712,7 @@ class TestCustomOpAPI(TestCase):
 self.assertEqual(ctx.needs_input_grad, expected)
 return list(grad.unbind(0))

-# call two applys, do a backward on the first
+# call two applies, do a backward on the first
 def t():
 return torch.randn([], requires_grad=True)

@@ -734,12 +734,12 @@ class SleepDataset(Dataset):
 def __init__(self, size, sleep_sec):
 self.size = size
 self.sleep_sec = sleep_sec
-self.sleeped = False
+self.slept = False

 def __getitem__(self, idx):
-if not self.sleeped:
+if not self.slept:
 time.sleep(self.sleep_sec)
-self.sleeped = True
+self.slept = True
 return idx

 def __len__(self):
@@ -573,7 +573,7 @@ class TestCaptureDataFrame(TestCase):

 class TestDataFramesPipes(TestCase):
 """
-Most of test will fail if pandas instaled, but no dill available.
+Most of test will fail if pandas installed, but no dill available.
 Need to rework them to avoid multiple skips.
 """

@@ -1887,7 +1887,7 @@ class TestFunctionalIterDataPipe(TestCase):
 with self.assertRaises(ValueError):
 list(filter_dp)

-# Funtional Test: Specify input_col
+# Functional Test: Specify input_col
 tuple_input_ds = dp.iter.IterableWrapper([(d - 1, d, d + 1) for d in range(10)])

 # Single input_col
@@ -3356,7 +3356,7 @@ class TestSharding(TestCase):
 with self.assertRaises(Exception):
 dp.apply_sharding(2, 1, sharding_group=SHARDING_PRIORITIES.DEFAULT)

-# Test tud.datapipes.iter.grouping.SHARDING_PRIORITIES for backward compatbility
+# Test tud.datapipes.iter.grouping.SHARDING_PRIORITIES for backward compatibility
 # TODO: Remove this test once tud.datapipes.iter.grouping.SHARDING_PRIORITIES is deprecated
 def test_sharding_groups_in_legacy_grouping_package(self):
 with self.assertWarnsRegex(
@@ -854,7 +854,7 @@ def forward(self, scores_1, mask_1, value_1):
 # de-functionalise the graph, as that would break AoTAutograd
 # We run the real function *after* the decomposition to make sure that the
 # decomposition does not modify any of the inputs in-place. If it does
-# real_out should be differen than decom_out so we should catch this
+# real_out should be different than decom_out so we should catch this
 real_out_unflat = func(*args, **kwargs)
 real_out = pytree.tree_leaves(real_out_unflat)

@@ -3286,7 +3286,7 @@ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "i64[u1][1]
 def test_unbacked_reshape2(self):
 cnt = CompileCounterWithBackend("inductor")

-# This reshape requires a clone when the input is not contiguous and we cant compute strides.
+# This reshape requires a clone when the input is not contiguous and we can't compute strides.
 # reshape (u2, u3) -> (u0, u1)
 def func(x, y):
 u0, u1 = y.tolist()
@@ -3421,7 +3421,7 @@ def forward(self, arg0_1: "i64[2][1]cpu", arg1_1: "Sym(u2)", arg2_1: "Sym(u3)",
 def test_invalid_view_unbacked_view(self):
 cnt = CompileCounterWithBackend("inductor")

-# This view (u2, u3) -> (u0, u1) cant happen in general unless we know that input is contigous or we have
+# This view (u2, u3) -> (u0, u1) can't happen in general unless we know that input is contiguous or we have
 # hints to to compute strides.
 def func(x, y):
 u0, u1 = y.tolist()
@@ -3452,7 +3452,7 @@ def forward(self, arg0_1: "i64[2][1]cpu", arg1_1: "Sym(u2)", arg2_1: "Sym(u3)",

 func(torch.ones(5, 6, 7, 8))
 self.assertEqual(cnt.frame_count, 1)
-# it can be dynamic in all dimentions except dim=2
+# it can be dynamic in all dimensions except dim=2
 func(torch.ones(4, 9, 7, 10))
 self.assertEqual(cnt.frame_count, 1)

@@ -97,7 +97,7 @@ class FakeTensorTest(TestCase):

 @unittest.skipIf(not RUN_CUDA, "requires cuda")
 def test_cuda_initialized(self):
-# doesnt error
+# doesn't error
 with FakeTensorMode():
 p = torch.randn(4, 2, requires_grad=True, device="cuda")
 x = torch.randn(8, 4, device="cuda")
@@ -1471,7 +1471,7 @@ class FakeTensorOperatorInvariants(TestCase):
 with torch._subclasses.CrossRefFakeMode():
 Repro()(*args)
 except MetadataMismatchError as e:
-# We expect the cross ref to succed for the first output to fail
+# We expect the cross ref to succeed for the first output to fail
 # for the rng state, see Note [Seed and Offset]
 self.assertTrue("output[0]" not in str(e))
 if self.__class__.__name__.startswith("PropagateRealTensors"):
@@ -2327,7 +2327,7 @@ class FakeTensorDispatchCache(TestCase):
 self.assertEqual(len(backend.fw_graphs), 1)
 mod = backend.fw_graphs[0]

-# Ensure that we see hits everytime
+# Ensure that we see hits every time
 with FakeTensorMode():
 x = torch.randn(6, 4)
 y = torch.randn(6, 4)
@@ -199,7 +199,7 @@ class TestFunctionalization(TestCase):
 y.set_(x.storage())
 return y

-# We should probaby get the crossref test to work,
+# We should probably get the crossref test to work,
 # but fixing it for Storage() objects is annoying.
 r = _functionalize(f, reapply_views=True, crossref=False)(torch.ones(2))
 self.assertEqual(str(r.device), "cpu")
@@ -2318,7 +2318,7 @@ def forward(self, arg0_1):
 ]
 )
 @unittest.skipIf(
-TEST_WITH_TORCHDYNAMO, "dynamo-ing code with proxy + fake doesnt work well"
+TEST_WITH_TORCHDYNAMO, "dynamo-ing code with proxy + fake doesn't work well"
 )
 class TestCrossRefFunctionalization(TestFunctionalization):
 crossref = True
@@ -302,7 +302,7 @@ class TestFunctionalizationRngOps(TestCase):
 fwd_compiler = functools.partial(count_philox_rand, freq=1)
 bwd_compiler = functools.partial(count_philox_rand, freq=0)
 aot_fn = aot_function(fn, fwd_compiler, bwd_compiler)
-# We cant check accuracy here because rand_like generated different rand numbers than dropout
+# We can't check accuracy here because rand_like generated different rand numbers than dropout
 res = aot_fn(x, y)
 res.sum().backward()

@@ -316,7 +316,7 @@ class TestFunctionalizationRngOps(TestCase):

 # Ensure the decomp is happening
 aot_fn = aot_function(fn, functools.partial(count_philox_rand, freq=1))
-# We cant check accuracy here because rand_like generated different rand numbers than dropout
+# We can't check accuracy here because rand_like generated different rand numbers than dropout
 aot_fn(x)

@@ -908,7 +908,7 @@ class TestFX(JitTestCase):
 wrapper = WrapperModule(interpreter)

 # Create a graph that: 1) Takes function arguments 2) Invokes the interpreter
-# 3) Returns the speficied return value
+# 3) Returns the specified return value

 # FIXME: The following code could be greatly simplified by symbolic_trace'ing
 # the wrapper with a Tracer that considers the Wrapper instance a root
@@ -2225,8 +2225,8 @@ class TestFX(JitTestCase):
 foo_scripted = torch.jit.script(Foo())
 foo_scripted(Pair(torch.rand(5), torch.rand(5)), torch.rand(5), 3)

-fxed = symbolic_trace(Foo())
-fxed_scripted = torch.jit.script(fxed)
+fixed = symbolic_trace(Foo())
+fxed_scripted = torch.jit.script(fixed)
 fxed_scripted(Pair(torch.rand(5), torch.rand(5)), torch.rand(5), 3)

 def test_fn_type_annotation_empty(self):
@@ -110,7 +110,7 @@ class TestPartitionFunctions:

 @staticmethod
 def forward6(a, b, c):
-# add should have its own partition, as neither branchs are supported
+# add should have its own partition, as neither branches are supported
 add = a + 1
 # left branch
 relu = add.relu()
@@ -283,7 +283,7 @@ class TestFXGraphPasses(JitTestCase):
 (TestPartitionFunctions.forward15, [['add_1', 'add', 'permute_1', 'view', 'permute_2', 'permute_3', 'permute']], False),
 (TestPartitionFunctions.forward16, [["permute_1", "add_1", "add"]], True),
 (TestPartitionFunctions.forward16, [['add_1', 'add', 'permute_1', 'view', 'permute_2', 'permute_3', 'permute']], False),
-# should be empty partition, not a partiton with empty nodes
+# should be empty partition, not a partition with empty nodes
 (TestPartitionFunctions.forward18, [], False),
 ])
 def test_partitioner(self, fn, expected_partition, bookend_non_compute_pass):
@@ -344,9 +344,9 @@ class TestFXGraphPasses(JitTestCase):
 [['add', 'add_1', 'add_2']], # vertical fusion
 [['add_2', 'add_3']], # horizontal fusion
 [['add_3', 'add_4']],
-[['add_6', 'add_5']], # arbitray node order
-[['add_4', 'add_1', 'add_3', 'add_2']], # arbitray node order
-[['add_5', 'add_6'], ['add_1', 'add_2', 'add_3', 'add_4']], # arbitray partition order
+[['add_6', 'add_5']], # arbitrary node order
+[['add_4', 'add_1', 'add_3', 'add_2']], # arbitrary node order
+[['add_5', 'add_6'], ['add_1', 'add_2', 'add_3', 'add_4']], # arbitrary partition order
 [['add_5', 'linear2']], # includes call_function + call_module node
 [['add_6', 'relu']], # includes call_function + call_module node
 [['param', 'add_2']], # includes get_attr + call_module nodes
@@ -43,7 +43,7 @@ def forward(self, x_1):
 def f(x):
 a = x.clone()
 a_view = a.view(-1)
-# We shouldn't re-inplace the first add(), because an alias of a is re-used later in the program
+# We shouldn't re-inplace the first add(), because an alias of a is reused later in the program
 b = a.add(1) # noqa: F841

 # Second add() is fine to re-inplace
@@ -231,7 +231,7 @@ class TestIndexing(TestCase):
 x[ri([0, 2, 4]),], torch.tensor([5, 4, 3], dtype=dtype, device=device)
 )

-# Only validates indexing and setting for halfs
+# Only validates indexing and setting for Halfs
 if dtype == torch.half:
 reference = consec((10,))
 validate_indexing(reference)
@@ -4764,7 +4764,7 @@ a")
 self.assertIsNot(fun_compiled, fun_compiled_2)
 self.assertEqual(fun_compiled_2(), 7)

-# caching doesnt increase refcounts to function (holds weak reference)
+# caching doesn't increase refcounts to function (holds weak reference)
 self.assertTrue(sys.getrefcount(fun), num_ref_counts)

 def test_string_ops(self):
@@ -7374,7 +7374,7 @@ a")
 # tensor from empty list is type float in python and annotated type in torchscript
 if "annotate" in li and "dtype" not in option:
 continue
-# Skip unsigned tensor initializaton for signed values on 3.10
+# Skip unsigned tensor initialization for signed values on 3.10
 if sys.version_info[:2] >= (3, 10) and "torch.uint8" in option and "-" in li:
 continue
 code = tensor_template.format(list_create=li, tensor_op=op, options=option)
@@ -7990,7 +7990,7 @@ dedent """
 m += k
 return m

-# use of k tests the pathway where we have to insert unitialized
+# use of k tests the pathway where we have to insert uninitialized
 self.checkScript(test_varexit, (3,))
 self.checkScript(test_varexit, (2,))

@@ -10066,7 +10066,7 @@ dedent """
 buffer = io.BytesIO()
 torch.jit.save(cm, buffer)
 buffer.seek(0)
-# when tensor is loaded as constant it isnt specialized
+# when tensor is loaded as constant it isn't specialized
 cm_load = torch.jit.load(buffer)
 FileCheck().check_not("Float(1, 3)").run(cm_load.forward.graph)

@@ -10300,7 +10300,7 @@ dedent """

 def test_type_inferred_from_empty_annotation(self):
 """
-Test that the type inferred from an empty or missing annotation is Torch.Tensor wtih `inferred=true`
+Test that the type inferred from an empty or missing annotation is Torch.Tensor with `inferred=true`
 """
 @torch.jit.script
 def fn(x):
@@ -15606,7 +15606,7 @@ dedent """
 a = hasattr(self, "fee")
 b = hasattr(self, "foo")
 c = hasattr(self, "hi")
-d = hasattr(self, "nonexistant")
+d = hasattr(self, "nonexistent")
 return (a, b, c, d)

 def foo(self):
@@ -16044,7 +16044,7 @@ EXCLUDE_TYPE_CHECK = {
 # chunk returns a list in scripting and we don't unpack the list,
 # Thus it won't be replaced by ConstantChunk and run AD.
 # It's explicitly checked in test_chunk_constant_script_ad
-# Similary for split, it's replaced by split_with_sizes in tracing,
+# Similarly for split, it's replaced by split_with_sizes in tracing,
 # but we don't have AD formula for aten::split(Tensor, int[], int),
 # an op registered in JIT so AD is not triggered in scripting.
 EXCLUDE_SCRIPT_AD_CHECK = {
@@ -319,7 +319,7 @@ class TestAutocast(JitTestCase):

 # TODO: fix and enable this test?
 # (we could technically fix this, but is it really worth it?)
-@unittest.skipIf(True, "unsuported autocast syntax")
+@unittest.skipIf(True, "unsupported autocast syntax")
 def test_reused_autocast_expr(self):
 @torch.jit.script
 def fn(a, b, c, d):
@@ -126,7 +126,7 @@ class TestTEFuser(JitTestCase):
 super().setUp()
 self.tensorexpr_options = TensorExprTestOptions()

-# note: `self.dynamic_shapes` instatiated in specialization of class
+# note: `self.dynamic_shapes` instantiated in specialization of class
 # defined below

 fusion_strategy = [("DYNAMIC", 20)] if self.dynamic_shapes else [("STATIC", 20)]
@@ -1679,7 +1679,7 @@ class TestVmapOperatorsLegacy(Namespace.TestVmapBaseLegacy):

 # Interesting case #2: Batch dim at end of tensor, success cases
 # view_as_complex requires that the dim with size 2 have stride 1
-# in order for the view to function propertly
+# in order for the view to function properly
 test(op, [get([B0, 2]).transpose(0, 1)], in_dims=1)
 test(vmap(op, in_dims=1), [get([B0, B1, 2]).movedim(1, 2)])
 test(vmap(op, in_dims=2), [get([B0, 3, B1, 2]).movedim(2, 3)])
@@ -45,7 +45,7 @@ class TestLicense(TestCase):
 'Found too many "torch-*dist-info" directories '
 f'in "{site_packages}, expected only one'
 )
-# setuptools renamed *dist-info/LICENSE to *dist-info/licenses/LICENSE sicne 77.0
+# setuptools renamed *dist-info/LICENSE to *dist-info/licenses/LICENSE since 77.0
 license_file = os.path.join(distinfo[0], "licenses", "LICENSE")
 if not os.path.exists(license_file):
 license_file = os.path.join(distinfo[0], "LICENSE")
@@ -135,7 +135,7 @@ class TestLinalg(TestCase):

 @contextlib.contextmanager
 def _tunableop_ctx(self):
-# Inialize and then tear down TunableOp
+# Initialize and then tear down TunableOp
 import glob
 import os
 self._set_tunableop_defaults()
@@ -4261,7 +4261,7 @@ class TestLinalg(TestCase):
 output = torch.einsum(equation, tensors)
 self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device))

-# Test equation variantions
+# Test equation variations
 check(' ', 1, expected_output=1)
 check(' -> ', 1, expected_output=1)
 check(' , ', 2, 2, expected_output=4)
@@ -4770,7 +4770,7 @@ class TestLinalg(TestCase):
 with self._tunableop_ctx():
 torch.cuda.tunable.set_rotating_buffer_size(0)
 # Numerical check adds significant overhead, unsure if this is needed
-# or if there was a transiet problem at the time.
+# or if there was a transient problem at the time.
 # if dtype is torch.half:
 # os.environ["PYTORCH_TUNABLEOP_NUMERICAL_CHECK"] = "1"
 ordinal = torch.cuda.current_device()
@@ -5009,7 +5009,7 @@ class TestLinalg(TestCase):
 torch.cuda.tunable.tune_gemm_in_file(untuned_filename)
 new_results = len(torch.cuda.tunable.get_results())

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = new_results - ref_results

 # Rowwise case will have an extra solution
@@ -5202,7 +5202,7 @@ class TestLinalg(TestCase):
 # Validator,ROCBLAS_VERSION,X.Y,Z
 # Validator,HIPBLASLT_VERSION,X,Y.Z
 # Validator,ROCM_Version,X,Y.Z
-# Validator,GCN_ARCH_NAME,<architecutre name>
+# Validator,GCN_ARCH_NAME,<architecture name>
 validator_num_lines = 5

 with self._tunableop_ctx():
@@ -5242,7 +5242,7 @@ class TestLinalg(TestCase):
 B = torch.randn(K, M, device=device, dtype=dtype)
 C = torch.matmul(A, B)

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = len(torch.cuda.tunable.get_results())

 # There must be a new tuning result
@@ -5270,7 +5270,7 @@ class TestLinalg(TestCase):
 B = torch.randn(K, M, device=device, dtype=dtype)
 C = torch.matmul(A, B)

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = len(torch.cuda.tunable.get_results())

 # Take the difference to calculate the number of results from
@@ -5303,7 +5303,7 @@ class TestLinalg(TestCase):
 B = torch.randn(K, M, device=device, dtype=dtype)
 C = torch.matmul(A, B)

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = len(torch.cuda.tunable.get_results())

 # Take the difference to calculate the number of results from
@@ -5326,7 +5326,7 @@ class TestLinalg(TestCase):

 # Take the difference to calculate the number of results from
 # this test. There should be no change in the number of results
-# since tuning is disabe.
+# since tuning is disable.
 self.assertEqual((total_num_results - ref_num_results), 0)

 @onlyCUDA
@@ -5335,7 +5335,7 @@ class TestLinalg(TestCase):
 # Test that the TunableOp results file is created
 # and is NOT empty.
 # To test this we create a subprocess and then
-# execut a matmul from within the subprocess
+# execute a matmul from within the subprocess
 import os
 import multiprocessing as mp

@@ -5384,7 +5384,7 @@ class TestLinalg(TestCase):

 torch.nn.functional.linear(X, matA, bias)

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = len(torch.cuda.tunable.get_results())

 # There must be a new tuning result
@@ -5438,7 +5438,7 @@ class TestLinalg(TestCase):
 torch.cuda.tunable.tune_gemm_in_file(untuned_filename)
 new_results = len(torch.cuda.tunable.get_results())

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = new_results - ref_results

 # There must be a new tuning results
@@ -5514,7 +5514,7 @@ class TestLinalg(TestCase):
 scaleB = torch.ones((1, matB.shape[1]), device=device)
 torch._scaled_mm(matA, matB, scale_a=scaleA, scale_b=scaleB, out_dtype=torch.bfloat16)

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = len(torch.cuda.tunable.get_results())

 # Rowwise case will have an extra solution
@@ -5638,7 +5638,7 @@ class TestLinalg(TestCase):
 torch.cuda.tunable.tune_gemm_in_file(untuned_filename)
 new_results = len(torch.cuda.tunable.get_results())

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = new_results - ref_results

 # There must be a new tuning results
@@ -5879,7 +5879,7 @@ class TestLinalg(TestCase):
 torch.cuda.tunable.tune_gemm_in_file(untuned_filename)
 new_results = len(torch.cuda.tunable.get_results())

-# This stores total number of cummulative results
+# This stores total number of cumulative results
 total_num_results = new_results - ref_results

 # There must be a new tuning results
@@ -6700,7 +6700,7 @@ class TestLinalg(TestCase):
 with self.assertRaisesRegex(RuntimeError, "torch.int32 dtype"):
 torch.lu_unpack(lu_data, lu_pivots.long())

-# check that onces flags are unset, Nones are returned
+# check that once flags are unset, Nones are returned
 p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_data=False)
 self.assertTrue(l.numel() == 0 and u.numel() == 0)
 p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_pivots=False)
@@ -6919,7 +6919,7 @@ class TestLinalg(TestCase):
 lambdas1.append(worker.E[:])

 tol = 1e-8
-# tol for scipy lobpcg will be choosed so that the number of
+# tol for scipy lobpcg will be chosen so that the number of
 # iterations will be equal or very close to pytorch lobpcg
 # (that is around 170-180)

@@ -6999,7 +6999,7 @@ scipy_lobpcg | {elapsed_scipy_ms:10.2f} | {elapsed_general_scipy_ms:10.2f} |
 -(input size: {m:4}, eigenpairs:{k:2}, units: ms per call)-
 ''')

-# Handling of very small tolerence
+# Handling of very small tolerance
 tol = 1e-100

 lambdas1 = []
@@ -8025,7 +8025,7 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
 if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
 # cuBLAS does not guarantee BFloat16 support on SM < 53.
 # So on PyTorch, we consider BFloat16 support on SM < 53 as
-# undefined bahavior
+# undefined behavior
 return

 batch_sizes = [1, 10]
@@ -8138,7 +8138,7 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
 if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
 # cuBLAS does not guarantee BFloat16 support on SM < 53.
 # So on PyTorch, we consider BFloat16 support on SM < 53 as
-# undefined bahavior
+# undefined behavior
 return

 num_batches = 2
@@ -8212,7 +8212,7 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
 if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
 # cuBLAS does not guarantee BFloat16 support on SM < 53.
 # So on PyTorch, we consider BFloat16 support on SM < 53 as
-# undefined bahavior
+# undefined behavior
 return

 num_batches = 10
@@ -57,7 +57,7 @@ def apply_masked_reduction_along_dim(op, input, *args, **kwargs):
 [[op([1, 2], *args0, **kwargs, dim=None, keepdim=False)]
 [op([3, 4, 5], *args0, **kwargs, dim=None, keepdim=False)]]

-where args0 is args where dim value is replased with None if
+where args0 is args where dim value is replaced with None if
 present.

 Using the same example data, if the op is called with dim=(0, 1)
@@ -595,7 +595,7 @@ class TestMatmulCuda(TestCase):
 -2, -1
 )[:, :n, :]
 else:
-raise AssertionError(f"Invaild op: {op}")
+raise AssertionError(f"Invalid op: {op}")

 C_ref = f_ref(A, B.transpose(-2, -1), offs=offs)
 C = f(A, B.transpose(-2, -1), offs=offs)
@ -1284,7 +1284,7 @@ class TestFP8Matmul(TestCase):
out_dtype=torch.bfloat16,
)

# Note re.compile is used, not re.escape. This is to accomodate fn vs fnuz type message.
# Note re.compile is used, not re.escape. This is to accommodate fn vs fnuz type message.
with self.assertRaisesRegex(
RuntimeError,
r"Expected b\.dtype\(\) == at::kFloat8_e4m3fnu?z? to be true, but got false\.",

@ -1754,7 +1754,7 @@ class TestFP8Matmul(TestCase):

# Testing only _scaled_grouped_mm() with multiple shapes, as
# _scaled_mm() already has more combinations of parameters than
# _scaled_grouped_mm(), for supporing more than one inputs layout
# _scaled_grouped_mm(), for supporting more than one inputs layout
# combinations.

@unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support CUTLASS")

@ -1502,7 +1502,7 @@ class TestMeta(TestCase):
def test_fill__alias_relationship(self):
inps = torch.rand(2**52, device='meta')
r = torch.ops.aten.fill_(inps, 1.0)
# aten.fill_ returns an aliase
# aten.fill_ returns an alias
self.assertEqual(id(inps), id(r))

# aten.fill returns a new tensor

@ -492,7 +492,7 @@ class TestMkldnn(TestCase):
C = torch.randint(1, 3, (1,)).item() * groups
x_shape = (N, C) + input_shapes[dim]
data = torch.randn(x_shape, dtype=torch.float32)
# conv: mkldnn tranpose conv fp32
# conv: mkldnn transpose conv fp32
# conv_ref: thnn transpose conv fp32
conv = conv_module[dim](in_channels=C,
out_channels=M,

@ -640,7 +640,7 @@ class MPSLeakyReluTest(TestCaseMPS):
mps_x = cpu_x.detach().clone().to('mps')

if not contiguous and not (0 in shape or len(shape) < 2):
# Tranposing will make the tensor non-contiguous
# Transposing will make the tensor non-contiguous
cpu_x = cpu_x.transpose(0, 1)
mps_x = mps_x.transpose(0, 1)
assert not mps_x.is_contiguous()

@ -940,7 +940,7 @@ class TestMPS(TestCaseMPS):
x.requires_grad = True
d = torch.cdist(x, y)
d.backward(dist_grad)
# Check that the backward passs does not contain invalid
# Check that the backward pass does not contain invalid
# values such as nan or inf
assert torch.isfinite(x.grad).all()

@ -1195,7 +1195,7 @@ class TestMPS(TestCaseMPS):
torch.nn.functional.linear(torch.rand(size, device='mps'),
torch.randint(-10, 10, size, dtype=torch.int8, device='mps'))

# Weigths on wrong device
# Weights on wrong device
with self.assertRaisesRegex(RuntimeError, "argument weight is on cpu but expected on mps"):
torch.nn.functional.linear(torch.rand(size, device='mps'),
torch.rand(size, device='cpu'))

@ -6285,7 +6285,7 @@ class TestMPS(TestCaseMPS):
x = cpu_x.detach().clone().to('mps')

if not contiguous and (0 not in shape and len(shape) >= 2):
# Tranposing will make the tensor non-contiguous
# Transposing will make the tensor non-contiguous
cpu_x = cpu_x.transpose(0, 1)
x = x.transpose(0, 1)
assert not x.is_contiguous()

@ -6441,7 +6441,7 @@ class TestMPS(TestCaseMPS):
x = cpu_x.detach().clone().to('mps')

if not contiguous and (0 not in shape and len(shape) >= 2):
# Tranposing will make the tensor non-contiguous
# Transposing will make the tensor non-contiguous
cpu_x = cpu_x.transpose(0, 1)
x = x.transpose(0, 1)
assert not x.is_contiguous()

@ -6481,7 +6481,7 @@ class TestMPS(TestCaseMPS):
x = cpu_x.detach().clone().to('mps')

if not contiguous and (0 not in shape and len(shape) >= 2):
# Tranposing will make the tensor non-contiguous
# Transposing will make the tensor non-contiguous
cpu_x = cpu_x.transpose(0, 1)
x = x.transpose(0, 1)
assert not x.is_contiguous()

@ -7706,13 +7706,13 @@ class TestMPS(TestCaseMPS):
# Test exponential
@unittest.skip("This does not test anything")
def test_exponential(self):
def helper(shape, lamda, dtype=torch.float32):
def helper(shape, lambda_, dtype=torch.float32):

mps_out = torch.zeros(shape, device='mps', dtype=dtype)
mps_out.exponential_(lamda)
mps_out.exponential_(lambda_)

print(mps_out.to('cpu').float().mean(), 1 / lamda)
print(mps_out.to('cpu').float().mean(), 1 / lambda_)
print(mps_out.to('cpu').float().std() ** 2, 1 / (lamda**2))
print(mps_out.to('cpu').float().std() ** 2, 1 / (lambda_**2))

for dtype in [torch.float32, torch.float16]:
helper([100, 100], 2, dtype)

@ -8179,7 +8179,7 @@ class TestLogical(TestCaseMPS):
self.assertEqual(torch.isin(x, 2.0), torch.tensor([False, False, True, False], device="mps"))
self.assertEqual(torch.isin(x, 1.0, invert=True), torch.tensor([True, False, True, True], device="mps"))
self.assertEqual(torch.isin(x, 8.0), torch.tensor([False, False, False, False], device="mps"))
# Scalar.Tensor varaiant(alaises to Scalar.Scalar), not covered by OpInfo
# Scalar.Tensor variant(alaises to Scalar.Scalar), not covered by OpInfo
self.assertEqual(torch.isin(2.0, x), torch.tensor(True, device="mps"))

def test_isin_asserts(self):

@ -10437,7 +10437,7 @@ class TestConvolutionMPS(TestCaseMPS):
grad_in_cl = torch.empty(1, f, oc, device="mps").transpose(1, 2)
grad_in_cl[:] = grad_in

# It does not matter whether grad_in contigous, or channels last, results should equal to each other
# It does not matter whether grad_in contiguous, or channels last, results should equal to each other
grad_rc = torch.autograd.grad((out,), (inp, conv.weight, conv.bias), (grad_in,), retain_graph=True)
grad_rc_cl = torch.autograd.grad((out,), (inp, conv.weight, conv.bias), (grad_in_cl,), retain_graph=True)

@ -7190,7 +7190,7 @@ torch.cuda.synchronize()

query = torch.rand(bs, d1, d3, device=device)
value = torch.rand(30, d2, requires_grad=True, device=device)
# total_length must > than max_length otherwise flash_attn backwark will fail
# total_length must > than max_length otherwise flash_attn backward will fail
offsets = torch.tensor([0, 2, 3, 30], device=device)

m = mha(use_legacy_api)

@ -2013,7 +2013,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
eval_out0 = wrapped_m(input)
# assert eval gives same result as last training iteration
self.assertEqual(eval_out0, last_train_out)
# assert doing more iteartion in eval don't change things
# assert doing more iteration in eval don't change things
self.assertEqual(eval_out0, wrapped_m(input))
self.assertEqual(last_train_u, m.weight_u)
self.assertEqual(last_train_v, m.weight_v)

@ -8911,7 +8911,7 @@ class TestNNDeviceType(NNTestCase):
# Should raise error when negative padding results in negative output shape
self.assertRaises(RuntimeError, lambda: F.pad(inputs, (-3, -2), mode='circular'))

# assert that relfection padding errors when pad >= input size
# assert that reflection padding errors when pad >= input size
expected_err_msg = r"Padding size should be less than the corresponding input dimension"
inputs = torch.randn(1, 1, 2, 3, device=device, dtype=dtype)
self.assertRaisesRegex(RuntimeError, expected_err_msg,

@ -11018,7 +11018,7 @@ class TestNNDeviceType(NNTestCase):
@onlyCUDA
@dtypes(torch.double)
def test_lstmcell_backward_only_one_output_grad(self, device, dtype):
# checks that undefined gradients doen't hamper the backward
# checks that undefined gradients doesn't hamper the backward
# see #11872
l = torch.nn.LSTMCell(2, 3).to(device).to(dtype=dtype)
s = torch.randn(1, 2, device=device, dtype=dtype, requires_grad=True)

@ -11967,7 +11967,7 @@ class TestNNDeviceType(NNTestCase):
def test_softmax_bfloat16(self, device):
for dim in [0, 1, 2, 3]:
_test_bfloat16_ops(self, torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=1e-2)
# test softmax with large input value which casues exp() to overflow
# test softmax with large input value which causes exp() to overflow
_test_bfloat16_ops(self, torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=0.05, scale_factor=1000.0)

def test_nll_loss_mismatched_batch(self, device):

@ -12298,7 +12298,7 @@ if __name__ == '__main__':
input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
target = torch.empty(N, *other_dims, dtype=torch.long, device=device).random_(0, C)

# construct target probablity that should have the same result as label_smoothing
# construct target probability that should have the same result as label_smoothing
target_proba = F.one_hot(target, num_classes=C)
# Need to put the C dim at index 1.
target_proba = target_proba.permute(0, -1, *range(1, target_proba.dim() - 1))

@ -205,7 +205,7 @@ class TestPrivateUse1(TestCase):


class TestOpenReg(TestCase):
"""Tests of mimick accelerator named OpenReg based on PrivateUse1"""
"""Tests of mimic accelerator named OpenReg based on PrivateUse1"""

# Stream & Event
def test_stream_synchronize(self):

@ -475,7 +475,7 @@ class TestOpenReg(TestCase):
with torch.serialization.skip_data():
torch.save(sd, f)

# Opeartors
# Operators
def test_factory(self):
x = torch.empty(3, device="openreg")
self.assertEqual(x.device.type, "openreg")

@ -87,7 +87,7 @@ _variant_ops = partial(
# Get names of all the operators which have ref in their entry in OpInfo (testing infra)
# except for elementwise unary operators (separately implemented in test/test_unary_ufuncs.py),
# elementwise binary operators (separately implemented in test_binary_ufuncs.py),
# reduction operations (separately impelemented in test_reductions.py),
# reduction operations (separately implemented in test_reductions.py),
# and Spectral Functions (separately implemented for only 1D as of now, in test/test_spectral_ops.py)
_ref_test_ops = tuple(
filter(

@ -373,7 +373,7 @@ class TestCommon(TestCase):

# output_process_fn_grad has a very unfortunate name
# We use this function in linalg extensively to postprocess the inputs of functions
# that are not completely well-defined. Think svd and muliplying the singular vectors by -1.
# that are not completely well-defined. Think svd and multiplying the singular vectors by -1.
# CPU and CUDA implementations of the SVD can return valid SVDs that are different.
# We use this function to compare them.
cuda_results = sample.output_process_fn_grad(cuda_results)

@ -580,7 +580,7 @@ class TestCommon(TestCase):

# Tests that experimental Python References perform the same computation
# as the operators they reference, when operator calls in the torch
# namesapce are remapped to the refs namespace (torch.foo becomes refs.foo).
# namespace are remapped to the refs namespace (torch.foo becomes refs.foo).
@onlyNativeDeviceTypesAnd(["hpu"])
@ops(python_ref_db)
@skipIfTorchInductor("Takes too long for inductor")

@ -759,7 +759,7 @@ class TestCommon(TestCase):
else tuple(n_inp) + n_args
)

# Filter the elemnts that are tensors that require grad
# Filter the elements that are tensors that require grad
t_input_tensors = [
t for t in t_inputs if isinstance(t, torch.Tensor) and t.requires_grad
]

@ -188,7 +188,7 @@ class TestJit(JitCommonTestCase):
# Note: only runs in float32 because schema isn't affected by dtype,
# so running it on all dtypes is would be excessive
if dtype == torch.float32:
# TODO: no reason why we cant run this with tracing graph
# TODO: no reason why we can't run this with tracing graph
if support_script and op.name != "rsub":
check_alias_annotation(
name,

@ -77,7 +77,7 @@ def quux(a):
# dictionary are function names in the torch API and the values are
# function implementations. Implementations are added to
# HANDLED_FUNCTION_DIAGONAL by decorating a python function with
# implements_diagonal. See the overrides immediately below the defintion
# implements_diagonal. See the overrides immediately below the definition
# of DiagonalTensor for usage examples.
HANDLED_FUNCTIONS_DIAGONAL = {}

@ -133,7 +133,7 @@ class DiagonalTensor:
https://numpy.org/devdocs/user/basics.dispatch.html
"""
# This is defined as a class attribute so that SubDiagonalTensor
# below which subclasses DiagonalTensor can re-use DiagonalTensor's
# below which subclasses DiagonalTensor can reuse DiagonalTensor's
# __torch_function__ implementation.
handled_functions = HANDLED_FUNCTIONS_DIAGONAL

@ -7,7 +7,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class TestPerOverloadAPI(TestCase):
def test_basics_opoverloadpacket(self):
# add is ony used as an example here. It is ok to update the test
# add is only used as an example here. It is ok to update the test
# if the semantics of add are modified in the future.
add_packet = torch.ops.aten.add

@ -512,7 +512,7 @@ class TestPublicBindings(TestCase):
"does not have `__all__` defined"
)
fix_is_public = (
f"remove it from the modules's (`{modname}`) `__all__`"
f"remove it from the modules' (`{modname}`) `__all__`"
if is_all
else f"either define a `__all__` for `{modname}` or add a `_` at the beginning of the name"
)

@ -522,7 +522,7 @@ class TestPublicBindings(TestCase):
f"it is not inside the module's (`{modname}`) `__all__`"
)
fix_is_public = (
f"add it from the modules's (`{modname}`) `__all__`"
f"add it from the modules' (`{modname}`) `__all__`"
)
if looks_public:
why_looks_public = (

@ -156,7 +156,7 @@ class TestPythonRegistration(TestCase):
# New dispatcher call should hit the first callback again
self.assertFalse(first_called)
a, b = args
# Make a substraction here instead of add !
# Make a subtraction here instead of add !
c = a - b
self.assertTrue(first_called)
return c

@ -735,7 +735,7 @@ class TestReductions(TestCase):
res2 = x1.sum(axis=(0, 2), keepdims=True)
self.assertEqual(res1, res2)

# TODO: kill this ane replace with common creation ops
# TODO: kill this and replace with common creation ops
def _make_tensors(self, shape, val_range=(-100, 100), use_floating=True, use_integral=True,
use_complex=False) -> dict[str, list[torch.Tensor]]:
float_types = [torch.double,

@ -1629,7 +1629,7 @@ class TestReductions(TestCase):
RuntimeError, "only when boundaries tensor dimension is 1"):
torch.searchsorted(boundaries, 1)

# incompatiable output tensor's dtype
# incompatible output tensor's dtype
def test_output_dtype(dtype, is_int32):
output = values_1d.to(dtype)
with self.assertRaisesRegex(

@ -2018,7 +2018,7 @@ class TestReductions(TestCase):
with self.assertRaisesRegex(RuntimeError, error_msg):
op(x, dim=dim)

# TODO: update this test to comapre against NumPy
# TODO: update this test to compare against NumPy
@onlyCUDA
def test_var(self, device):
cpu_tensor = torch.randn(2, 3, 3)

@ -2513,7 +2513,7 @@ class TestReductions(TestCase):
k = int((t.numel() - 1) / 2)
self.assertEqual(res, t.view(-1).sort()[0][k])
if t.numel() % 2 == 1:
# We can only test agains numpy for odd reductions because numpy
# We can only test against numpy for odd reductions because numpy
# returns the mean of the two medians and torch returns the lower
self.assertEqual(res.cpu().numpy(), np.median(t_numpy))
for dim in range(t.ndim):

@ -2524,7 +2524,7 @@ class TestReductions(TestCase):
self.assertEqual(res[0], (t.sort(dim)[0]).select(dim, k).unsqueeze_(dim))
self.assertEqual(res[0], t.gather(dim, res[1]))
if size % 2 == 1:
# We can only test agains numpy for odd reductions because numpy
# We can only test against numpy for odd reductions because numpy
# returns the mean of the two medians and torch returns the lower
self.assertEqual(res[0].cpu().numpy(), np.median(t_numpy, dim, keepdims=True), exact_dtype=False)

@ -2548,7 +2548,7 @@ class TestReductions(TestCase):
k = int((t.numel() - num_nan - 1) / 2)
self.assertEqual(res, t.view(-1).sort()[0][k])
if (t.numel() - num_nan) % 2 == 1:
# We can only test agains numpy for odd reductions because numpy
# We can only test against numpy for odd reductions because numpy
# returns the mean of the two medians and torch returns the lower
self.assertEqual(res.item(), numpy_op(t.cpu().numpy()))
for dim in range(t.ndim):

@ -2561,7 +2561,7 @@ class TestReductions(TestCase):
k = ((size - num_nan - 1) / 2).type(torch.long)
self.assertEqual(res[0], (t.sort(dim)[0]).gather(dim, k))
self.assertEqual(res[0], t.gather(dim, res[1]))
# We can only test agains numpy for odd reductions because numpy
# We can only test against numpy for odd reductions because numpy
# returns the mean of the two medians and torch returns the lower
mask = (size - num_nan) % 2 == 1
res = res[0].masked_select(mask).cpu()

@ -3526,7 +3526,7 @@ as the input tensor excluding its innermost dimension'):
# raises an error if no `dim` parameter is specified. This exists separately from tests in
# test_tensot_compare_ops_empty because not specifying a `dim` parameter in the former tests does
# not throw errors. Also, checking the return type of argmax requires supplying a different dtype
# argument than that for the input tensor. There is also variantion in numpy testing.
# argument than that for the input tensor. There is also variation in numpy testing.
def test_tensor_compare_ops_argmax_argmix_kthvalue_dim_empty(self, device):
shape = (2, 0, 4)
master_input = torch.randn(shape, device=device)

@ -455,7 +455,7 @@ class TestScatterGather(TestCase):
helper([50, 8, 7], 100)
helper([50, 3, 4, 5], 100)

# Generic Device Test Framework instantation, see
# Generic Device Test Framework instantiation, see
# https://github.com/pytorch/pytorch/wiki/Running-and-writing-tests
# for details.
instantiate_device_type_tests(TestScatterGather, globals())

@ -558,7 +558,7 @@ class TestSegmentReductions(TestCase):
lengths = torch.tensor([0, 2, 3, 0], device=device, dtype=length_type)
data = torch.arange(6, dtype=torch.float, device=device)

# test for error on 1-D lenghts
# test for error on 1-D lengths
with self.assertRaisesRegex(RuntimeError, "Expected all rows of lengths along axis"):
torch._segment_reduce(data, 'sum', lengths=lengths, axis=0, unsafe=False)

@ -746,7 +746,7 @@ class SerializationMixin:
'readinto() stress test')

def test_serialization_filelike_uses_readinto(self):
# For maximum effiency, when reading a file-like object,
# For maximum efficiency, when reading a file-like object,
# ensure the C API calls readinto instead of read.
a = torch.randn(5, 4)

@ -458,7 +458,7 @@ class TestSparse(TestSparseBase):
torch.autograd.gradcheck(func, (t._indices(), t._values().requires_grad_(True), shape, True))

@dtypes(*floating_and_complex_types_and(torch.float16, torch.bfloat16))
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
@gradcheck_semantics()
def test_to_dense_with_gradcheck(self, device, dtype, gradcheck):

@ -594,7 +594,7 @@ class TestSparse(TestSparseBase):
self.assertEqual(torch.empty((3, 0), dtype=dtype, device=device), self.safeToDense(x))

@dtypes(torch.double, torch.cdouble)
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
@gradcheck_semantics()
def test_to_dense_hybrid(self, device, dtype, gradcheck):

@ -950,7 +950,7 @@ class TestSparse(TestSparseBase):

@coalescedonoff
@dtypes(torch.double, torch.cdouble)
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
@gradcheck_semantics()
def test_permute(self, device, dtype, coalesced, gradcheck):
# trivial checks

@ -1240,7 +1240,7 @@ class TestSparse(TestSparseBase):
# NOTE: indices are negative
idx_dim_d_range = list(range(-sizes[d], 0))
for idx_len in range(sizes[d], sizes[d] + 1):
# creates all possible valid indices into dim d of lenght idx_len
# creates all possible valid indices into dim d of length idx_len
for idx in itertools.product(*itertools.repeat(idx_dim_d_range, idx_len)):
t_idx = torch.tensor(idx, dtype=torch.long, device=device)

@ -1619,7 +1619,7 @@ class TestSparse(TestSparseBase):

@coalescedonoff
@dtypes(torch.double)
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
def test_sparse_mm(self, device, dtype, coalesced):
def test_shape(d1, d2, d3, nnz, transposed):
if transposed:

@ -1641,7 +1641,7 @@ class TestSparse(TestSparseBase):

@coalescedonoff
@dtypes(torch.double)
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
@gradcheck_semantics()
def test_sparse_mul(self, device, dtype, coalesced, gradcheck):
# https://github.com/pytorch/pytorch/issues/79914

@ -3600,13 +3600,13 @@ class TestSparse(TestSparseBase):


@dtypes(torch.double, torch.float)
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
def test_softmax_zero_nnz(self, device, dtype):
self._check_zero_nnz_softmax_op(torch.sparse.softmax, 1, device, dtype)
self._check_zero_nnz_softmax_op(torch.sparse.softmax, 10, device, dtype)

@dtypes(torch.double, torch.float)
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
@unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
def test_log_softmax_zero_nnz(self, device, dtype):
self._check_zero_nnz_softmax_op(torch.sparse.log_softmax, 1, device, dtype)
self._check_zero_nnz_softmax_op(torch.sparse.log_softmax, 10, device, dtype)

@ -3985,11 +3985,11 @@ class TestSparse(TestSparseBase):
# some normal cases
yield (make_diags((1, 5)), make_offsets([0]), (5, 5))
yield (make_diags((3, 3)), make_offsets([-1, 0, 1]), (4, 4))
# noncontigous diags
# non-contiguous diags
yield (make_diags((5, 4), noncontiguous=True), make_offsets([-1, 1, 0, 2, -2]), (5, 5))
# noncontigous offsets
# non-contiguous offsets
yield (make_diags((3, 4)), make_offsets([1, -1, 0, -2, 2])[::2], (5, 5))
# noncontigous diags + offsets
# non-contiguous diags + offsets
yield (make_diags((3, 4), noncontiguous=True), make_offsets([1, -1, 0, -2, 2])[::2], (5, 5))
# correct dimensionality, 2d, 2d , and shapes match, but the number of diagonals is zero
yield (make_diags((0, 3)), make_offsets([]), (3, 3))

@ -4624,7 +4624,7 @@ class TestSparseAny(TestCase):

# However, invariants check can be disabled via
# constructor's optional argument so that the invalid
# tensor is succesfully constructed:
# tensor is successfully constructed:
r = create_invalid_tensor(check_invariants=False)
self.assertEqual(r.layout, layout)

@ -4646,7 +4646,7 @@ class TestSparseAny(TestCase):
self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())

# Test an attempt to re-use an activate context manager instance
# Test an attempt to reuse an activate context manager instance
check_ctx2 = torch.sparse.check_sparse_tensor_invariants(True)
with check_ctx:
self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())

@ -2791,7 +2791,7 @@ class TestSparseCSR(TestCase):
raise ValueError("Expected at least one 2D tensor in samples.")

for sample in samples:
# We must skip samples of low dimensionality, we can't covert them to sparsed compressed layouts
# We must skip samples of low dimensionality, we can't convert them to sparsed compressed layouts
if sample.input.ndim < 2:
continue
sparse_input = sample.input.to_sparse_csr().requires_grad_(True)

@ -3255,7 +3255,7 @@ class TestSparseCSR(TestCase):
# helpers

def _check_against_scipy_matrix(pt_matrix, dense, blocksize, **kwargs):
# scipy has no bsc layout, so we check against the bsr layout of the tranposed dense
# scipy has no bsc layout, so we check against the bsr layout of the transposed dense
if layout == torch.sparse_bsc:
sp_matrix = self._construct_sp_matrix(dense.t(), layout=torch.sparse_bsr, blocksize=blocksize[::-1])
else:

@ -3272,7 +3272,7 @@ class TestSparseCSR(TestCase):
self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix))
self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix))
if layout == torch.sparse_bsc:
# we must tranpose the blocks before comparing
# we must transpose the blocks before comparing
self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values().transpose(-2, -1))
else:
self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values())

@ -3371,7 +3371,7 @@ class TestSparseCSR(TestCase):

# special cases for batched tensors
if batched:
# batched sparse tensors need only have the same number of non-zeros in each batch not nessesarily the
# batched sparse tensors need only have the same number of non-zeros in each batch not necessarily the
# same sparsity pattern in each batch
sparse_shape = sparse_sizes[0]
hybrid_shape = hybrid_sizes[0]

@ -3382,7 +3382,7 @@ class TestSparseCSR(TestCase):
# number of elements/blocks in each batch (total not nnz)
batch_mask_shape = sparse_shape
if layout in blocked_layouts:
# if we are blocked the mask is genereated for the block valued elemetns
# if we are blocked the mask is generated for the block valued elements
batch_mask_shape = sparse_shape[0] // blocksize[0], sparse_shape[1] // blocksize[1]

# random bool vector w/ length equal to max possible nnz for the sparse_shape

@ -3815,7 +3815,7 @@ class TestSparseCompressedTritonKernels(TestCase):
input_broadcasted_clone.col_indices(),
# For testing `out=` let's make values to have "weird" strides
# so that if the kernel modifies values to it's needs, the result
# is being compied into out.values.
# is being copied into out.values.
input_broadcasted_clone.values().transpose(-3, -2).contiguous().transpose(-3, -2),
layout=input_broadcasted_clone.layout,
size=input_broadcasted_clone.shape

@ -3930,7 +3930,7 @@ class TestSparseCompressedTritonKernels(TestCase):
try:
result = bsr_scatter_mm(bsr, dense, indices_data=indices_data)
except triton.compiler.OutOfResources:
# ensure that there was at least one succesful test:
# ensure that there was at least one successful test:
assert SPLIT_N < SPLIT_N_list[0]
break

@ -210,7 +210,7 @@ class TestStatelessFunctionalAPI(TestCase):
prev_buffer = module.buffer.clone()
res = functional_call(module, parameters, x, tie_weights=False)
self.assertEqual(x, res)
# check that the weights remain unmodified and were correctly accesed
# check that the weights remain unmodified and were correctly accessed
cur_weight = module.l1.weight
cur_buffer = module.buffer
self.assertEqual(cur_weight, prev_weight)

@ -753,7 +753,7 @@ class TestStatelessFunctionalAPI(TestCase):
res = torch.func.functional_call(mod, (), x)
self.assertEqual(res, mod(x))

# three dictonaries
# three dictionaries
a = ({'l1.weight': torch.ones(1, 1)}, {'l1.bias': torch.ones(1)}, {'buffer': torch.zeros(1)})
res = torch.func.functional_call(mod, a, x)
self.assertEqual(res, x + 1)

@ -423,7 +423,7 @@ class TestSympyInterp(TestCase):
sargs = [sympy.sympify(a) for a in args]
sympy_expr = getattr(ReferenceAnalysis, fn)(*symbols)
ref_r = getattr(ReferenceAnalysis, fn)(*sargs)
# Yes, I know this is a longwinded way of saying xreplace; the
# Yes, I know this is a long-winded way of saying xreplace; the
# point is to test sympy_interp
r = sympy_interp(
ReferenceAnalysis, dict(zip(symbols, sargs)), sympy_expr

@ -1531,7 +1531,7 @@ class TestTensorCreation(TestCase):
expected = torch.empty(0, 5, dtype=a.dtype, device=device)
self.assertEqual(c, expected)

# test empty imput
# test empty input
a = torch.empty(0, device=device)
c1 = torch.combinations(a)
c2 = torch.combinations(a, with_replacement=True)

@ -695,12 +695,12 @@ class TestTensorExprFuser(BaseTestClass):
_atol = 2e-3
_rtol = 1e-5
if data_type is torch.bfloat16:
# Compared to aten logic, NNC coudl save addtional BF16/Fp32 conversion.
# Compared to aten logic, NNC could save additional BF16/Fp32 conversion.
# Take d = a + b - c as an example, the aten logic is as follows at
# operator level:
# tmp = to_bf16(to_fp32(a) + to_fp32(b))
# d = to_bf16(to_fp32(tmp) + to_fp32(c))
# But NNC could fuse the compression and remove the redudant conversions.
# But NNC could fuse the compression and remove the redundant conversions.
# The final statement is as follows
# d = to_bf16(to_fp32(a) + to_fp32(b) + to_fp32(c))
# Hence, we simulate NNC computation by feeding fp32 tensors and converting

@ -1091,7 +1091,7 @@ class TestTorchDeviceType(TestCase):
small2_expanded = small2.expand(*dims_full)

if small.is_cuda and fn in ['map', 'map2']:
# map and map2 are not implementd on CUDA tensors
# map and map2 are not implemented on CUDA tensors
return

if hasattr(large_expanded, fn):

@ -2677,7 +2677,7 @@ else:
x.requires_grad = True
d = torch.cdist(x, y)
d.backward(dist_grad)
# Check that the backward passs does not contain invalid
# Check that the backward pass does not contain invalid
# values such as nan or inf
assert torch.isfinite(x.grad).all()

@ -2709,7 +2709,7 @@ else:
[0, 0, 0],
[1, 2, 3]]))

# Check that cummulative sum over a zero length dimension doesn't crash on backprop.
# Check that cumulative sum over a zero length dimension doesn't crash on backprop.
# Also check that cumsum over other dimensions in a tensor with a zero-length
# dimensiuon also works
# Also include a basic suite of similar tests for other bases cases.

@ -2761,7 +2761,7 @@ else:
[0, 0, 0],
[1, 1, 1]]))

# Check that cummulative prod over a zero length dimension doesn't crash on backprop.
# Check that cumulative prod over a zero length dimension doesn't crash on backprop.
# Also check that cumprod over other dimensions in a tensor with a zero-length
# dimensiuon also works
# Also include a basic suite of similar tests for other bases cases.

@ -3806,7 +3806,7 @@ else:
# Test for parallel adds with accumulate == True
low_precision = dtype == torch.half or dtype == torch.bfloat16
# Less numbers to avoid overflow with low_precision
# Grainsize is 3000 for the for_loop to be parallized on CPU
# Grainsize is 3000 for the for_loop to be parallelized on CPU
sizes = ((100,)) if low_precision else ((200,), (3002,))
# Bfloat16 has a particularly bad performance here
# This operation is nondeterministic on GPU, so we are generous with the rtol

@ -7063,7 +7063,7 @@ class TestTorch(TestCase):
dest.index_add(0, index, source)

def test_linspace_logspace(self):
# Ensure the output does not require grad regardless of inputs requiring gard or not.
# Ensure the output does not require grad regardless of inputs requiring guard or not.
# The output of factory functions should not be part of any computational graph.
start = 0.0
end = 3.0

@ -8700,7 +8700,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j],
self.assertEqual(2 * size, (1, 2, 3, 1, 2, 3))

def test_Size_concat_non_tuple_sequence(self):
# check that TypeError get's raised on adding non-tuple sequences.
# check that TypeError gets raised on adding non-tuple sequences.
from collections.abc import Sequence

class DummySequence(Sequence):

@ -11104,7 +11104,7 @@ def add_neg_dim_tests():
assert not hasattr(TestTorch, test_name), "Duplicated test name: " + test_name
setattr(TestTorch, test_name, make_neg_dim_test(name, tensor_arg, arg_constr, types, extra_dim))

# TODO: these empy classes are temporarily instantiated for XLA compatibility
# TODO: these empty classes are temporarily instantiated for XLA compatibility
# once XLA updates their test suite it should be removed
class TestViewOps(TestCase):
pass

@ -98,7 +98,7 @@ def _check_equal(
"""
Compare test tensor against golden and reference tensors.
Golden is the highest precision possible serving as the "ground truth"
Refernce is the same precision as test and should also serve as less precisie ground truth.
Reference is the same precision as test and should also serve as less precisie ground truth.
We calcculate the "reference error" by comparing the golden to reference and use this as the
measruing stick for the test tensor.

@ -1693,7 +1693,7 @@ class TestSDPAFailureModes(NNTestCase):
@onlyCUDA
@unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support fused SDPA or pre-SM80 hardware")
def test_unaligned_tensors(self, device):
# The alignment is depdent on arch so we specifiy SM80OrLater
# The alignment is dependent on arch so we specify SM80OrLater
dtype = torch.float16
size = SdpaShape(2, 2, 8, 5)
make_tensor = partial(torch.rand, size, device=device, dtype=dtype)

@@ -3042,7 +3042,7 @@ class TestSDPACudaOnly(NNTestCase):

# Cast up and compare
# Since we are doing the compute on fp16 we have to bump the tolerance
- # Bump down the tolearnce for blfoat16
+ # Bump down the tolerance for blfoat16
atol = 7e-4 if dtype == torch.float16 else 7e-3
rtol = 7e-4 if dtype == torch.float16 else 7e-3
if TEST_WITH_ROCM:

@@ -3525,7 +3525,7 @@ class TestSDPACudaOnly(NNTestCase):
query, key, value, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa)
else:
# Problem: We pad sizes in the composite region of the top level SDPA. But we need the
- # Debug mask when have dropout. So I am going to manualy pad up here when testing dropout
+ # Debug mask when have dropout. So I am going to manually pad up here when testing dropout
q_padded, q_og_size = pad_last_dim(query, 8)
k_padded, k_og_size = pad_last_dim(key, 8)
v_padded, v_og_size = pad_last_dim(value, 8)

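The pad_last_dim calls above pad the head dimension up to a multiple of 8 and keep the original size so the padding can be stripped afterwards. A sketch of that idea with a hypothetical helper (the real pad_last_dim lives in the test file and may differ):

    import torch
    import torch.nn.functional as F

    def pad_last_dim_to_multiple(t, alignment=8):
        # Hypothetical sketch: zero-pad the last dimension up to the next
        # multiple of `alignment` and remember the original size so the
        # extra columns can be sliced off after the SDPA call.
        og_size = t.size(-1)
        return F.pad(t, (0, (-og_size) % alignment)), og_size

    q = torch.randn(2, 2, 8, 5)
    q_padded, q_og = pad_last_dim_to_multiple(q, 8)
    assert q_padded.size(-1) == 8 and q_og == 5
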
@@ -1052,7 +1052,7 @@ class TestTypePromotion(TestCase):
torch.cat([x, y], out=out)
self.assertEqual(out, expected_out, exact_dtype=True)

- # Verfies that unary ops require matching out types
+ # Verifies that unary ops require matching out types
@onlyNativeDeviceTypes
@dtypes(*itertools.product((torch.int64,
torch.float32, torch.float64,

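The comment fixed above refers to unary ops rejecting a mismatched out dtype instead of silently downcasting. A small illustration, with made-up values:

    import torch

    x = torch.randn(3)                        # float32 input
    out = torch.empty(3, dtype=torch.int64)   # mismatched out dtype

    try:
        torch.neg(x, out=out)
    except RuntimeError:
        pass  # the float result is not silently cast down to long
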
@@ -35,7 +35,7 @@ CACHE_DIR = os.path.join(DATA_DIR, ".mypy_cache")


def _key_func(key: str) -> str:
- """Split at the first occurance of the ``:`` character.
+ """Split at the first occurrence of the ``:`` character.

Windows drive-letters (*e.g.* ``C:``) are ignored herein.
"""

@@ -135,7 +135,7 @@ def _parse_reveals(file: IO[str]) -> list[str]:
comments = "/n".join(comments_array)

# Only search for the `{*}` pattern within comments,
- # otherwise there is the risk of accidently grabbing dictionaries and sets
+ # otherwise there is the risk of accidentally grabbing dictionaries and sets
key_set = set(re.findall(r"\{(.*?)\}", comments))
kwargs = {
k: FORMAT_DICT.get(k, f"<UNRECOGNIZED FORMAT KEY {k!r}>") for k in key_set

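For reference, the findall call above extracts whatever sits between braces in the comment text; on simple inputs it behaves like this (the comment strings below are invented):

    import re

    comments = "\n".join([
        "# E: {str}",
        "# E: {float64}",
    ])
    key_set = set(re.findall(r"\{(.*?)\}", comments))
    assert key_set == {"str", "float64"}
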
@@ -1080,7 +1080,7 @@ class TestUnaryUfuncs(TestCase):
def test_silu_complex(self, device, dtype):
atol = 1e-6
rtol = 1e-6
- inouts = [
+ inp_outs = [
(0.2 + 0.3j, 0.08775215595960617065 + 0.18024823069572448730j),
(1e-19 + 1e-18j, 4.99999984132761269448e-20 + 5.00000022906852482872e-19j),
(-1.0 + 2.0j, -0.78546208143234252930 + -0.44626939296722412109j),

@@ -1088,7 +1088,7 @@ class TestUnaryUfuncs(TestCase):
(2.0j, -1.55740761756896972656 + 0.99999988079071044922j),
]

- for inp, out in inouts:
+ for inp, out in inp_outs:
res = torch.nn.functional.silu(
torch.tensor(inp, dtype=dtype, device=device)
)

@@ -1096,7 +1096,7 @@ class TestUnaryUfuncs(TestCase):
self.assertEqual(res.real, out.real, atol=atol, rtol=rtol)
self.assertEqual(res.imag, out.imag, atol=atol, rtol=rtol)

- for inp, out in inouts:
+ for inp, out in inp_outs:
res = torch.nn.functional.silu(
torch.tensor(inp, dtype=dtype, device=device), inplace=True
)

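For orientation, silu on a complex input is z * sigmoid(z) evaluated in complex arithmetic. A quick check against the first table entry above, assuming F.silu accepts complex tensors as the test implies:

    import torch

    def silu_ref(z):
        # silu(z) = z * sigmoid(z), here computed directly in complex arithmetic
        return z * (1.0 / (1.0 + torch.exp(-z)))

    z = torch.tensor(0.2 + 0.3j, dtype=torch.complex64)
    res = torch.nn.functional.silu(z)
    assert torch.allclose(res, silu_ref(z), atol=1e-6, rtol=1e-6)
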
@@ -1170,7 +1170,7 @@ class TestUnaryUfuncs(TestCase):
# Not using numpy's log1p here because by the time of writing this,
# np.log1p has precision problems for small complex input values, see here:
# https://github.com/numpy/numpy/issues/22609
- inouts = [
+ inp_outs = [
(0.2 + 0.3j, 0.21263386770217202 + 0.24497866312686414j),
(1e-19 + 1e-18j, 1e-19 + 1e-18j),
(1e-18 + 0.1j, 0.00497517 + 0.0996687j),

@@ -1184,7 +1184,7 @@ class TestUnaryUfuncs(TestCase):
]
# test the extreme values
if dtype == torch.complex128:
- inouts += [
+ inp_outs += [
(-1 + 1e250j, 575.6462732485114 + 1.5707963267948966j),
(1e250 + 1j, 575.6462732485114 + 1e-250j),
(1e250 + 1e250j, 575.9928468387914 + 0.7853981633974483j),

@@ -1193,7 +1193,7 @@ class TestUnaryUfuncs(TestCase):
(1e250 + 1e-250j, 575.6462732485114 + 0.0j),
]
elif dtype == torch.complex64:
- inouts += [
+ inp_outs += [
(-1 + 1e30j, 69.07755278982137 + 1.5707963267948966j),
(1e30 + 1j, 69.07755278982137 + 1e-30j),
(1e30 + 1e30j, 69.42412638010134 + 0.7853981633974483j),

@@ -1203,7 +1203,7 @@ class TestUnaryUfuncs(TestCase):
]

# test the log1p individually
- for inp, out in inouts:
+ for inp, out in inp_outs:
res = torch.log1p(torch.tensor(inp, dtype=dtype, device=device))
self.assertFalse(torch.any(torch.isnan(res)))
# setting up atol == 0.0 because some part has very small values

@@ -1211,7 +1211,7 @@ class TestUnaryUfuncs(TestCase):
self.assertEqual(res.imag, out.imag, atol=0.0, rtol=1e-6)

# test the log1p in tensor
- inp_lst, out_lst = (list(elmt) for elmt in zip(*inouts))
+ inp_lst, out_lst = (list(elmt) for elmt in zip(*inp_outs))
inp_tens = torch.tensor(inp_lst, dtype=dtype, device=device)
out_tens = torch.tensor(out_lst, dtype=dtype, device=device)
res_tens = torch.log1p(inp_tens)

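The point of log1p in the hunks above is precision for tiny inputs. A quick illustration in complex128, mirroring the (1e-19 + 1e-18j) table entry:

    import torch

    z = torch.tensor(1e-19 + 1e-18j, dtype=torch.complex128)

    # log1p keeps the tiny real part ...
    assert abs(torch.log1p(z).real.item() - 1e-19) < 1e-25

    # ... while the naive formulation loses it to rounding in (1 + z)
    assert torch.log(1 + z).real.item() < 1e-30
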
@@ -1292,7 +1292,7 @@ class TestUnaryUfuncs(TestCase):
zero_to_large = torch.tensor([0.0, 1.0, 1e3], **tkwargs)
small_to_inf = torch.tensor([1e-3, 1.0, float("inf")], **tkwargs)
nans = torch.zeros((3,), **tkwargs) + float("nan")
- inpouts = [
+ inp_outs = [
# (a , x), out
((zeros, small_to_inf), ones),
((small_to_inf, zeros), zeros),

@@ -1302,7 +1302,7 @@ class TestUnaryUfuncs(TestCase):
((infs, infs), nans),
((-small_to_inf, small_to_inf), nans),
]
- for inputs, output in inpouts:
+ for inputs, output in inp_outs:
input0, input1 = inputs
calc = torch.igamma(input0, input1)
if torch.all(torch.isnan(output)):

@@ -1321,7 +1321,7 @@ class TestUnaryUfuncs(TestCase):
zero_to_large = torch.tensor([0.0, 1.0, 1e3], **tkwargs)
small_to_inf = torch.tensor([1e-3, 1.0, float("inf")], **tkwargs)
nans = torch.zeros((3,), **tkwargs) + float("nan")
- inpouts = [
+ inp_outs = [
# (a , x), out
((zeros, small_to_inf), zeros),
((small_to_inf, zeros), ones),

@@ -1331,7 +1331,7 @@ class TestUnaryUfuncs(TestCase):
((infs, infs), nans),
((-small_to_inf, small_to_inf), nans),
]
- for inputs, output in inpouts:
+ for inputs, output in inp_outs:
input0, input1 = inputs
calc = torch.igammac(input0, input1)
if torch.all(torch.isnan(output)):

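The rows above tabulate boundary values of the regularized incomplete gamma functions. Restated directly on plain CPU float32 tensors (only the rows visible in the hunks are asserted):

    import torch

    zeros = torch.zeros(3)
    ones = torch.ones(3)
    small_to_inf = torch.tensor([1e-3, 1.0, float("inf")])

    assert torch.allclose(torch.igamma(zeros, small_to_inf), ones)    # P(0, x > 0) = 1
    assert torch.allclose(torch.igamma(small_to_inf, zeros), zeros)   # P(a > 0, 0) = 0
    assert torch.allclose(torch.igammac(zeros, small_to_inf), zeros)  # Q(0, x > 0) = 0
    assert torch.allclose(torch.igammac(small_to_inf, zeros), ones)   # Q(a > 0, 0) = 1
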
@@ -1955,7 +1955,7 @@ class TestOldViewOps(TestCase):
with self.assertRaises(numpy_err, msg=msg):
np.array_split(a.cpu().numpy(), sections_or_indices, dim)

- # addtional tests for tensor_split with tensor_indices_or_sections
+ # additional tests for tensor_split with tensor_indices_or_sections
with self.assertRaisesRegex(
RuntimeError,
r"tensor_split expected tensor_indices_or_sections to have dtype of long, but got Float",

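The error message above pins down the dtype requirement for tensor-valued split points. In short (example tensor is illustrative):

    import torch

    x = torch.arange(10)

    # A long index tensor selects split points ...
    parts = torch.tensor_split(x, torch.tensor([2, 5]))
    assert [p.tolist() for p in parts] == [[0, 1], [2, 3, 4], [5, 6, 7, 8, 9]]

    # ... while a float index tensor is rejected with the RuntimeError quoted above
    try:
        torch.tensor_split(x, torch.tensor([2.0, 5.0]))
    except RuntimeError:
        pass
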
@@ -159,7 +159,7 @@ class WeakTest(TestCase):
self.assertRaises(KeyError, d.__delitem__, o)
self.assertRaises(KeyError, d.__getitem__, o)

- # If a key isn't of a weakly referencable type, __getitem__ and
+ # If a key isn't of a weakly referenceable type, __getitem__ and
# __setitem__ raise TypeError. __delitem__ should too.
self.assertRaises(TypeError, d.__delitem__, 13)
self.assertRaises(TypeError, d.__getitem__, 13)

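The `d` in the hunk above appears to be the test's weak-key mapping; the same contract can be seen with the standard library's weakref.WeakKeyDictionary, used here purely as an illustration:

    import weakref

    class Obj:
        pass                     # plain instances are weakly referenceable

    d = weakref.WeakKeyDictionary()
    o = Obj()
    d[o] = "value"
    assert d[o] == "value"

    try:
        d[13] = "value"          # ints cannot be weakly referenced
    except TypeError:
        pass
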