[BE][1/5] fix typos in aten/ (#157550)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157550
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637
parent c8d43cbc6e
commit 4c8b408d16
@@ -1162,7 +1162,6 @@ exclude_patterns = [
 # These files are all grandfathered in, feel free to remove from this list
 # as necessary
 # NOTE: remove the patterns in the order they are listed
-'aten/**',
 'aten/src/ATen/native/**',
 'aten/src/ATen/native/q*/**',
 'aten/src/ATen/native/[a-pA-P]*/**',
@@ -458,7 +458,7 @@ if(LAPACK_FOUND)
 # would not need this at all), some of our libraries (magma in particular)
 # backend to CPU BLAS/LAPACK implementations, and so it is very important
 # we get the *right* implementation, because even if the symbols are the
-# same, LAPACK implementions may have different calling conventions.
+# same, LAPACK implementations may have different calling conventions.
 # This caused https://github.com/pytorch/pytorch/issues/7353
 #
 # We do NOT do this on Linux, since we just rely on torch_cpu to
@@ -4,7 +4,7 @@
 #include <ATen/Tensor.h>
 #include <ATen/dlpack.h>
 
-// this convertor will:
+// this converter will:
 // 1) take a Tensor object and wrap it in the DLPack tensor
 // 2) take a dlpack tensor and convert it to the ATen Tensor
 
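A minimal round trip through this converter might look like the sketch below, assuming the usual `at::toDLPack` / `at::fromDLPack` entry points declared in this header (illustrative only):

```cpp
// Illustrative DLPack round trip, assuming at::toDLPack/at::fromDLPack.
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

int main() {
  at::Tensor t = at::arange(6, at::kFloat).reshape({2, 3});
  // 1) wrap the ATen tensor in a DLPack managed tensor
  DLManagedTensor* dl = at::toDLPack(t);
  // 2) convert the DLPack tensor back to an ATen tensor; fromDLPack takes
  //    ownership of the managed tensor and the result shares its storage
  at::Tensor back = at::fromDLPack(dl);
  TORCH_CHECK(back.equal(t));
  return 0;
}
```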
@@ -233,8 +233,8 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor
 
 // NOLINTNEXTLINE(performance-unnecessary-value-param)
 Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) {
-// It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can.
-// For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i
+// It would be nice if this logic could be reused from autograd's split_backward(), but I don't think it can.
+// For functionalization, we have only have one of the tensors from the TensorList outputted by split(), and we want to layer i
 // on top of the base tensor.
 // For autograd, we have all of the tensors outputted by split() and we just want to stack them.
 dim = at::maybe_wrap_dim(dim, base.dim());
@@ -286,11 +286,11 @@ void FunctionalTensorWrapper::storage_resize_(const c10::SymInt& new_size) {
 // storage resizing is severely limited: we only support resizing either to zero, or from zero bytes.
 TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size);
 // The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want
-// resize_() calls to actualy emit any ops in the functional graph.
+// resize_() calls to actually emit any ops in the functional graph.
 // How does it work?
 // Resizing up (old size == 0):
 // We do nothing in this case.
-// The expection is that for the user code to be valid, the next op that should run against the current tensor "x"
+// The expectation is that for the user code to be valid, the next op that should run against the current tensor "x"
 // will be a x.copy_(y) (or similar), that will fully overwrite the data of x.
 // If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call
 // (otherwise the eager code would be invalid),
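As a concrete illustration of the pattern this comment assumes (a hedged sketch): resizing up from zero bytes emits nothing into the functional graph, and the next op is expected to overwrite the tensor completely.

```cpp
// Sketch of the expected user pattern around resize_() under functionalization.
#include <ATen/ATen.h>

void resize_then_overwrite() {
  at::Tensor x = at::empty({0});   // zero-byte storage
  x.resize_({4});                  // "resize up": treated as a no-op in the graph
  x.copy_(at::ones({4}));          // the next op fully overwrites x's data
}
```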
@@ -327,7 +327,7 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
 // We're also no longer re-generate "b" fully from "a" anymore, since "a" refers to a slice of "b"'s data.
 //
 // This is probably fixable in theory, but:
-// - the fix would likey complicated the functionalization logic quite a bit.
+// - the fix would likely complicated the functionalization logic quite a bit.
 // - the primary use case for resize_() today is resizing zero-sized tensors in out= variants of operators
 // - resize_() also can give you weird results today if you try to resize_() a weirdly strided tensor.
 //
@@ -344,7 +344,7 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
 set_sizes_and_strides(value_.sizes(), value_.strides());
 refresh_numel();
 // (Technically we should be guaranteed that the tensor was already contiguous,
-// since it's guaranteed not to have been a view. Doesnt hurt to run though)
+// since it's guaranteed not to have been a view. Doesn't hurt to run though)
 refresh_contiguous();
 // Swapping out the storage of a tensor (aka from a resize_() call) will update the sizes and strides of the tensor,
 // so we need to record the fact that metadata was mutated.
@@ -819,7 +819,7 @@ void setFunctionalizationReapplyViewsTLS(bool reapply_views) {
 // This function will "functionalize" it.
 // That is, it will call the operator, but removing any intermediate views/mutations
 // that are performed inside of it.
-// This is useful for LTC/XLA, which would like to re-use some of our composite kernels
+// This is useful for LTC/XLA, which would like to reuse some of our composite kernels
 // from pytorch core but not have to worry about the view ops that they might call.
 // e.g. at::block_diag
 void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
@@ -218,7 +218,7 @@ static Tensor safeStack(TensorList tensors) {
 // is possible for the backward function to return an undefined grad for some
 // grad_input for each example. In that case, we return an undefined grad.
 //
-// It is theoretically posssible for *some* of the examples to produce an
+// It is theoretically possible for *some* of the examples to produce an
 // undefined grad (a kernel could peek at the gradient values and return an
 // undefined tensor if it determines the gradient is full of zeros). We
 // could handle this by treating the undefined grad as a zero-filled tensor
@@ -140,7 +140,7 @@ struct TORCH_API VmapPhysicalView {
 // mapping a physical tensor to a new logical tensor (BatchedTensor)
 VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
 
-// Maps a logical shape to a physical shape by pre-pending the batch
+// Maps a logical shape to a physical shape by prepending the batch
 // sizes to the logical shape.
 VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
 
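The mapping itself is just a prepend; a conceptual sketch with plain containers (not the internal API):

```cpp
// Conceptual version of getPhysicalShape(): prepend the batch sizes.
#include <cstdint>
#include <vector>

std::vector<int64_t> physical_shape(const std::vector<int64_t>& batch_sizes,
                                    const std::vector<int64_t>& logical_shape) {
  std::vector<int64_t> out(batch_sizes);
  out.insert(out.end(), logical_shape.begin(), logical_shape.end());
  return out;  // e.g. batch {2, 7} + logical {3, 5} -> {2, 7, 3, 5}
}
```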
@@ -299,7 +299,7 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags,
 ::close(fd);
 TORCH_CHECK(false, "unable to stretch file <", filename_, "> to the right size: ", c10::utils::str_error(last_err), " (", last_err, ")");
 }
-/* on macOS write returns with errno 45 (Opperation not supported) when used
+/* on macOS write returns with errno 45 (Operation not supported) when used
 * with a file descriptor obtained via shm_open
 */
 #ifndef __APPLE__
@@ -211,7 +211,7 @@ NestedTensorImpl::NestedTensorImpl(
 }
 
 // assume contiguous, `nested_strides` and `offsets`
-// can be infered from `nested_sizes`
+// can be inferred from `nested_sizes`
 NestedTensorImpl::NestedTensorImpl(
 const at::Tensor& buffer,
 const at::Tensor& nested_sizes)
@@ -32,7 +32,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
 at::Tensor nested_strides,
 at::Tensor storage_offsets);
 // assume contiguous, `nested_strides` and `offsets`
-// can be infered from `nested_sizes`
+// can be inferred from `nested_sizes`
 explicit NestedTensorImpl(
 const at::Tensor& buffer,
 const at::Tensor& nested_sizes);
@@ -93,12 +93,12 @@ ident: identity for binary combination function sf. sf(ident, x) needs to return
 x.
 
 f: function for reduction over a chunk. f needs to be of signature scalar_t
-f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy)
+f(int64_t partial_begin, int64_t partial_end, scalar_t identify)
 
 sf: function to combine two partial results. sf needs to be of signature
 scalar_t sf(scalar_t x, scalar_t y)
 
-For example, you might have a tensor of 10000 entires and want to sum together
+For example, you might have a tensor of 10000 entries and want to sum together
 all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
 an intermediate result tensor with 4 elements. Then it will execute the function
 "f" you provide and pass the beginning and end index of these chunks, so
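A usage sketch matching that description, assuming the `at::parallel_reduce(begin, end, grain_size, ident, f, sf)` overload from ATen/Parallel.h:

```cpp
// Sum the values 0..9999 in chunks of 2500: f reduces one chunk, sf combines partials.
#include <ATen/Parallel.h>
#include <cstdint>

int64_t parallel_sum() {
  return at::parallel_reduce(
      /*begin=*/int64_t(0), /*end=*/int64_t(10000), /*grain_size=*/int64_t(2500),
      /*ident=*/int64_t(0),
      // f: reduce one chunk [partial_begin, partial_end), starting from ident
      [](int64_t partial_begin, int64_t partial_end, int64_t ident) {
        int64_t acc = ident;
        for (int64_t i = partial_begin; i < partial_end; ++i) {
          acc += i;
        }
        return acc;
      },
      // sf: combine two partial results
      [](int64_t x, int64_t y) { return x + y; });
}
```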
@@ -252,7 +252,7 @@ inline Tensor applySelect(
 // Note: `size >= -index` is not equivalent to `size > -1 - index` if index
 // is INT64_MIN For std::numeric_limits<int64_t>::min() result of unary
 // minus is undefined by the standard but in practice is equal to self. On
-// the other hand, indexing wraping is valid for all negative int64_t
+// the other hand, indexing wrapping is valid for all negative int64_t
 // values, as x[INT64_MIN] is the same as x[INT64_MAX]
 TORCH_CHECK_INDEX(
 size.sym_gt(-1 - index)
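In plain arithmetic, the point of that note is that `-index` overflows for `INT64_MIN`, while `-1 - index` stays representable for every negative index:

```cpp
// size > -1 - index is safe for all negative index values, including INT64_MIN
// (where -1 - INT64_MIN == INT64_MAX); computing -index there would overflow.
#include <cstdint>

bool negative_index_in_bounds(int64_t size, int64_t index) {
  return size > -1 - index;
}
```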
@@ -208,7 +208,7 @@ bool TensorIteratorConfig::is_tensor_const(size_t idx) {
 // same strides are increasing. If dimensions are non-increasing, we move on to the next input to break the tie.
 //
 // Instead of applying rule 4 for tie breaking, we could move on to the next tensor directly. This would result in possibly
-// losing the correct permuation of the first tensor if there are permuted trivial dimensions, but could potentially
+// losing the correct permutation of the first tensor if there are permuted trivial dimensions, but could potentially
 // improve traversal order of the second tensor. We chose the former option to better propagate channels last layout
 // for example for a tensor with the sizes N1H1
 // These rules result in the intuitive behavior that in most cases recovers permutation of either the first argument (if all

@@ -244,7 +244,7 @@ void TensorIteratorBase::reorder_dimensions() {
 // initialize perm with n-1, n-2, ..., 1, 0
 std::iota(perm_.rbegin(), perm_.rend(), 0);
 
-// Reordering dimensions changes iteraton order
+// Reordering dimensions changes iteration order
 if (enforce_linear_iteration_) {
 permute_dimensions(perm_);
 return;
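For reference, the `std::iota` over reverse iterators above is what yields the reversed identity permutation; a standalone sketch:

```cpp
// Filling through rbegin()/rend() produces {n-1, n-2, ..., 1, 0}.
#include <numeric>
#include <vector>

std::vector<int> reversed_identity_perm(int n) {
  std::vector<int> perm(n);
  std::iota(perm.rbegin(), perm.rend(), 0);
  return perm;  // e.g. n = 4 -> {3, 2, 1, 0}
}
```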
@@ -388,7 +388,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
 
 /// Return scalar value from original_tensor_base if it is defined. When
 /// common_dtype is Half, casting scalar input to common_dtype might overflow.
-/// If the scalar is aleady given in the type of Half, then return scalar
+/// If the scalar is already given in the type of Half, then return scalar
 /// value from tensor_base.
 template <typename T>
 T original_scalar_value(int64_t arg) {
@@ -502,7 +502,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
 /// kernels
 bool can_use_32bit_indexing() const;
 
-/// An "iteratable" object that recursively splits this iterator into
+/// An "iterable" object that recursively splits this iterator into
 /// sub-iterators that can use 32-bit indexing.
 SplitUntil32Bit with_32bit_indexing() const;
 
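The usual consumption pattern is to recurse over the split until 32-bit indexing is safe; a hedged sketch assuming only these two member functions:

```cpp
// Recurse until every sub-iterator can use 32-bit indexing, then launch.
#include <ATen/TensorIterator.h>

void launch_with_32bit_indexing(at::TensorIteratorBase& iter) {
  if (!iter.can_use_32bit_indexing()) {
    for (auto& sub_iter : iter.with_32bit_indexing()) {
      launch_with_32bit_indexing(sub_iter);
    }
    return;
  }
  // ... launch the actual kernel on `iter`, which now fits 32-bit indexing ...
}
```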
@@ -878,7 +878,7 @@ class TORCH_API TensorIteratorConfig final {
 
 // Sets the enforce_linear_iteration_ flag, which is false by default.
 // If true, iteration goes in the same order as a C-contiguous tensor
-// is layed out in memory. i.e. last dimension iterates fastest.
+// is laid out in memory. i.e. last dimension iterates fastest.
 //
 // This iteration order can be less efficient and may even prevent
 // vectorization. So only use if the correctness of your kernel depends on it.
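A hedged configuration sketch (builder method names as commonly used on `TensorIteratorConfig`; verify against the header before relying on them):

```cpp
// Request C-contiguous (last-dimension-fastest) traversal order.
#include <ATen/TensorIterator.h>

at::TensorIterator make_linear_iter(const at::Tensor& out, const at::Tensor& in) {
  return at::TensorIteratorConfig()
      .add_output(out)
      .add_input(in)
      .enforce_linear_iteration()
      .build();
}
```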
@@ -78,7 +78,7 @@ inline bool areAnyOptionalTensorSubclassLike(
 // NOTE: This function expects a scalar tensor of boolean dtype.
 // Eg.
 // Non-Composite Compliant Pattern : (t == 0).all().item<bool>()
-// Composite Compliant Patter : is_salar_tensor_true((t == 0).all())
+// Composite Compliant Pattern : is_salar_tensor_true((t == 0).all())
 inline bool is_scalar_tensor_true(const Tensor& t) {
 TORCH_INTERNAL_ASSERT(t.dim() == 0)
 TORCH_INTERNAL_ASSERT(t.scalar_type() == kBool)
@@ -378,9 +378,9 @@ inline static std::optional<ResultVec> computeStride_impl(
 (TORCH_GUARD_OR_TRUE(sym_ne(oldshape[tensor_d - 1], 1)) &&
 TORCH_GUARD_OR_TRUE(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) {
 // We want to accumulate stuff in view_numel until view_numel == tensor_numel, if we do not
-// know if that is satisfied we keep accumalating. For example if view_numel = 1 and tensor_numel = u1,
+// know if that is satisfied we keep accumulating. For example if view_numel = 1 and tensor_numel = u1,
 // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop.
-// Thats why we use TORCH_GUARD_OR_TRUE below.
+// That's why we use TORCH_GUARD_OR_TRUE below.
 
 // we use TORCH_GUARD_OR_FALSE and not TORCH_GUARD_OR_TRUE when comparing newshape[view_d] ==1 because
 // if we know view_numel < tensor_numel is false, we want to stop. Unless we know for sure newshape[view_d]==1
@@ -27,7 +27,7 @@
 // ops (ops being called by other ops). After the intermediate op call
 // finishes it's set back to the original `TracingState` object.
 //
-// The `TracingState` obect in TLS can also be read/written via its Python
+// The `TracingState` object in TLS can also be read/written via its Python
 // binding in `python_tracer.cpp`, and `get/setTracingState()` C++ APIs,
 // which are also exposed as `TORCH_API`.
 //
@@ -95,7 +95,7 @@ namespace at {
 m.impl("clone", torch::CppFunction::makeFallthrough());
 m.impl("dot", torch::CppFunction::makeFallthrough());
 m.impl("vdot", torch::CppFunction::makeFallthrough());
-// The functions in the list below have a specific registeration in native_functions.yaml and
+// The functions in the list below have a specific registration in native_functions.yaml and
 // do not use the fallback.
 // m.impl("mul.Tensor", torch::CppFunction::makeFallthrough());
 // m.impl("add.Tensor", torch::CppFunction::makeFallthrough());
@@ -377,7 +377,7 @@ Keep it simple for now by assuming only one such flag is
 present in the argument list. If I ever need a function
 with more than flag I'll figure out something else.
 The policy is:
-If the user has explicity specified a dtype, respect it.
+If the user has explicitly specified a dtype, respect it.
 Otherwise, set it to the autocast type.
 ********************************************************/
 
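That policy reduces to a one-line fallback; a hedged sketch with a hypothetical helper (not the autocast implementation itself):

```cpp
// Respect an explicitly supplied dtype, otherwise fall back to the autocast dtype.
#include <c10/core/ScalarType.h>
#include <optional>

c10::ScalarType pick_dtype(std::optional<c10::ScalarType> user_dtype,
                           c10::ScalarType autocast_dtype) {
  return user_dtype.has_value() ? *user_dtype : autocast_dtype;
}
```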
@@ -199,7 +199,7 @@ typedef struct {
 * `byte_offset` field should be used to point to the beginning of the data.
 *
 * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
-* TVM, perhaps others) do not adhere to this 256 byte aligment requirement
+* TVM, perhaps others) do not adhere to this 256 byte alignment requirement
 * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
 * (after which this note will be updated); at the moment it is recommended
 * to not rely on the data pointer being correctly aligned.
@@ -26,7 +26,7 @@ static void load_platform_library() {
 (void)run_once;
 }
 
-// NnapiCompilation functon definitions:
+// NnapiCompilation function definitions:
 
 // Could possibly call load_platform_library in constructor, but error reporting
 // can be complicated if the constructor is called during model loading.
@@ -666,7 +666,7 @@ void record_function_with_scope_and_debug_handle(
 guard, fn, debug_handle, inputs, ##__VA_ARGS__); \
 }
 
-// Helper macros to record LITE INTERPETER scope events with debug handles
+// Helper macros to record LITE INTERPRETER scope events with debug handles
 #define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \
 fn, debug_handle, inputs) \
 RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \
@@ -5,7 +5,7 @@
 
 // NOTE: This condition is true for all PyTorch internal libraries, it
 // just excludes external projects such as torch_xla which
-// re-use some of the PyTorch codegen machinery.
+// reuse some of the PyTorch codegen machinery.
 #if defined(CAFFE2_BUILD_MAIN_LIB) || \
 defined(TORCH_CUDA_BUILD_MAIN_LIB) || \
 defined(TORCH_HIP_BUILD_MAIN_LIB) || \
@@ -491,7 +491,7 @@ class TORCH_API Tensor: public TensorBase {
 "attribute won't be populated during autograd.backward(). If you indeed want the .grad "
 "field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. "
 "If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor "
-"instead. See github.com/pytorch/pytorch/pull/30531 for more informations.");
+"instead. See github.com/pytorch/pytorch/pull/30531 for more information.");
 }
 return maybe_grad;
 }
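The advice in that warning, as a hedged C++ frontend sketch:

```cpp
// Call retain_grad() on a non-leaf tensor if its .grad() should be populated.
#include <torch/torch.h>

void retain_grad_example() {
  auto x = torch::ones({2, 2}, torch::requires_grad());
  auto y = x * 3;      // non-leaf
  y.retain_grad();     // without this, y.grad() stays undefined after backward()
  y.sum().backward();
  auto gy = y.grad();  // now defined
  TORCH_CHECK(gy.defined());
}
```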
@@ -199,7 +199,7 @@ int main(int argc, char* argv[]) {
 
 #ifdef C10_MOBILE
 // Need to disable mkldnn for this test since it allocated memory
-// via raw_allocate inteface which requires context pointer and raw
+// via raw_allocate interface which requires context pointer and raw
 // pointer to be the same. Tis is not true for mobile allocator.
 at::globalContext().setUserEnabledMkldnn(false);
 #endif
@@ -25,7 +25,7 @@ TEST(TestHalf, Arithmetic) {
 ASSERT_EQ(one + one, 2);
 }
 
-TEST(TestHalf, Comparisions) {
+TEST(TestHalf, Comparisons) {
 Half zero = 0;
 Half one = 1;
 ASSERT_LT(zero, one);
@@ -9,7 +9,7 @@ using namespace at;
 TEST(TestUndefined, UndefinedTest) {
 manual_seed(123);
 
-// mainly test ops on undefined tensors don't segfault and give a reasonable errror message.
+// mainly test ops on undefined tensors don't segfault and give a reasonable error message.
 Tensor und;
 Tensor ft = ones({1}, CPU(kFloat));
 
@@ -5,7 +5,7 @@ namespace {
 template <typename T>
 class Memory : public ::testing::Test {};
 template <typename T>
-class Arithmetics : public ::testing::Test {};
+class Arithmetic : public ::testing::Test {};
 template <typename T>
 class Comparison : public ::testing::Test {};
 template <typename T>

@@ -92,7 +92,7 @@ namespace {
 using ComplexTypes = ::testing::Types<vcomplex, vcomplexDbl>;
 using ReducedFloatTestedTypes = ::testing::Types<vBFloat16, vHalf>;
 TYPED_TEST_SUITE(Memory, ALLTestedTypes);
-TYPED_TEST_SUITE(Arithmetics, FloatIntTestedTypes);
+TYPED_TEST_SUITE(Arithmetic, FloatIntTestedTypes);
 TYPED_TEST_SUITE(Comparison, RealFloatIntReducedFloatTestedTypes);
 TYPED_TEST_SUITE(Bitwise, FloatIntTestedTypes);
 TYPED_TEST_SUITE(MinMax, RealFloatIntTestedTypes);

@@ -691,7 +691,7 @@ namespace {
 AssertVectorized<vec>(NAME_INFO(DeInterleave FirstHalf), std::get<0>(cc), vec::loadu(vals)).check(true);
 AssertVectorized<vec>(NAME_INFO(DeInterleave SecondHalf), std::get<1>(cc), vec::loadu(vals + vec::size())).check(true);
 }
-TYPED_TEST(Arithmetics, Plus) {
+TYPED_TEST(Arithmetic, Plus) {
 using vec = TypeParam;
 using VT = ValueType<TypeParam>;
 test_binary<vec>(

@@ -703,7 +703,7 @@ namespace {
 createDefaultBinaryTestCase<vec>(TestSeed()),
 RESOLVE_OVERLOAD(filter_add_overflow));
 }
-TYPED_TEST(Arithmetics, Minus) {
+TYPED_TEST(Arithmetic, Minus) {
 using vec = TypeParam;
 using VT = ValueType<TypeParam>;
 test_binary<vec>(

@@ -715,7 +715,7 @@ namespace {
 createDefaultBinaryTestCase<vec>(TestSeed()),
 RESOLVE_OVERLOAD(filter_sub_overflow));
 }
-TYPED_TEST(Arithmetics, Multiplication) {
+TYPED_TEST(Arithmetic, Multiplication) {
 using vec = TypeParam;
 test_binary<vec>(
 NAME_INFO(mult),

@@ -724,7 +724,7 @@ namespace {
 createDefaultBinaryTestCase<vec>(TestSeed(), false, true),
 RESOLVE_OVERLOAD(filter_mult_overflow));
 }
-TYPED_TEST(Arithmetics, Division) {
+TYPED_TEST(Arithmetic, Division) {
 using vec = TypeParam;
 TestSeed seed;
 test_binary<vec>(
@@ -531,7 +531,7 @@ template <typename T>
 std::enable_if_t<is_complex<T>::value, void>
 filter_div_ub(T& val1, T& val2) {
 //missing
-//at least consdier zero division
+//at least consider zero division
 auto ret = std::abs(val2);
 if (ret == 0) {
 val2 = T(1, 2);

@@ -1291,7 +1291,7 @@ std::enable_if_t<is_complex<Complex<T>>::value, Complex<T>> local_multiply(Compl
 T y_real = y.real();
 T y_imag = y.imag();
 #if defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)
-//check multiplication considerin swap and fma
+//check multiplication considering swap and fma
 T rr = x_real * y_real;
 T ii = x_imag * y_real;
 T neg_imag = -y_imag;

@@ -1362,7 +1362,7 @@ std::enable_if_t<is_complex<Complex<T>>::value, Complex<T>> local_division(Compl
 return Complex<T>(rr, ii);
 #else /* defined(CPU_CAPABILITY_ZVECTOR) */
 #if defined(CPU_CAPABILITY_VSX)
-//check multiplication considerin swap and fma
+//check multiplication considering swap and fma
 T rr = x_real * y_real;
 T ii = x_imag * y_real;
 T neg_imag = -y_imag;
@@ -1232,7 +1232,7 @@ void test_matmul(
 }
 
 TEST_F(VulkanAPITest, DISABLED_matmul_3d_weight_vulkan) {
-// This will call at::bmm. Will crash for unknow reason.
+// This will call at::bmm. Will crash for unknown reason.
 const auto m1_cpu =
 at::rand({13, 23, 45}, at::device(at::kCPU).dtype(at::kFloat));
 const auto m2_cpu =

@@ -1241,7 +1241,7 @@ TEST_F(VulkanAPITest, DISABLED_matmul_3d_weight_vulkan) {
 }
 
 TEST_F(VulkanAPITest, DISABLED_matmul_3d_weight_cpu) {
-// This will call at::bmm. Will crash for unknow reason.
+// This will call at::bmm. Will crash for unknown reason.
 const auto m1_cpu =
 at::rand({13, 23, 45}, at::device(at::kCPU).dtype(at::kFloat));
 const auto m2_cpu =

@@ -2004,7 +2004,7 @@ TEST_F(VulkanAPITest, conv2d_pw_prepack_bc_medium) {
 1); // groups
 }
 
-// The followin 2 tests failed on Meta's CI when all tests are executed. Output
+// The following 2 tests failed on Meta's CI when all tests are executed. Output
 // has lots of nan. Cause unknown.
 // When this test is run alone (with gtest_filter), it passes.
 // The test also passes with smaller planes, see "conv2d_pw_prepack_medium".

@@ -5664,7 +5664,7 @@ TEST_F(VulkanAPITest, var_2d_unbiased) {
 test_var({3, 5}, {1}, true, true);
 test_var({3, 5}, {1}, true, false);
 
-// inpu.dim() == dim_list.size(), only keepdim == true is supported
+// input.dim() == dim_list.size(), only keepdim == true is supported
 test_var({3, 5}, {0, 1}, true, true);
 }
 

@@ -5672,7 +5672,7 @@ TEST_F(VulkanAPITest, var_2d_biased) {
 test_var({3, 5}, {1}, false, true);
 test_var({3, 5}, {1}, false, false);
 
-// inpu.dim() == dim_list.size(), only keepdim == true is supported
+// input.dim() == dim_list.size(), only keepdim == true is supported
 test_var({3, 5}, {0, 1}, false, true);
 }
 

@@ -7142,12 +7142,12 @@ TEST_F(VulkanAPITest, clone_success) {
 }
 
 TEST_F(VulkanAPITest, clone_invalidinputs_exceptions) {
-// Act: Vulkan supports Preserve and Contiguous memory foramts
+// Act: Vulkan supports Preserve and Contiguous memory formats
 EXPECT_THROW({
 clone_test({2, 3, 5, 161}, c10::MemoryFormat::ChannelsLast);
 }, ::std::exception);
 
-// Act: Vulkan supports Preserve and Contiguous memory foramts
+// Act: Vulkan supports Preserve and Contiguous memory formats
 EXPECT_THROW({
 clone_test({2, 3, 5, 161}, c10::MemoryFormat::ChannelsLast3d);
 }, ::std::exception);
@@ -2116,7 +2116,7 @@ std::tuple<double, double, int, int> produce_inputs_for_binary_op(
 input2_cpu = produce_random_tensor(input2_shape);
 
 if (compute_quantization_params) {
-// compute appropiate scale and zero point for inputs
+// compute appropriate scale and zero point for inputs
 const auto in1_quant_params = compute_quant_params(input1_cpu);
 in1_scale = std::get<0>(in1_quant_params);
 in1_zero_point = std::get<1>(in1_quant_params);

@@ -2287,7 +2287,7 @@ void test_quantized_binary_op(
 apply_cpu_quantized_binary_op(op_name, input1_cpu_deq, input2_cpu_deq);
 
 if (compute_quantization_params || random_quantization_params) {
-// compute appropiate scale and zero point for output
+// compute appropriate scale and zero point for output
 const auto out_quant_params = compute_quant_params(output_cpu);
 out_scale = std::get<0>(out_quant_params);
 out_zero_point = std::get<1>(out_quant_params);

@@ -2540,7 +2540,7 @@ void test_quantized_conv2d(
 bias_cpu = produce_random_tensor(bias_shape, 1.26, 5.97, 0.59);
 
 if (compute_quantization_params) {
-// compute appropiate scale and zero point for input, weight and bias
+// compute appropriate scale and zero point for input, weight and bias
 const auto in_quant_params = compute_quant_params(input_cpu, in_dtype);
 in_scale = std::get<0>(in_quant_params);
 in_zero_point = std::get<1>(in_quant_params);

@@ -2624,7 +2624,7 @@ void test_quantized_conv2d(
 groups);
 
 if (compute_quantization_params || random_quantization_params) {
-// compute appropiate scale and zero point for output
+// compute appropriate scale and zero point for output
 const auto out_quant_params = compute_quant_params(output_cpu, out_dtype);
 out_scale = std::get<0>(out_quant_params);
 out_zero_point = std::get<1>(out_quant_params);

@@ -3524,7 +3524,7 @@ TEST_F(VulkanAPITest, linear_4d_large) {
 test_quantized_linear({9, 13, 11, 17}, {23, 17}, {23});
 }
 
-// The following code is not directly releated to quantization. We put it here
+// The following code is not directly related to quantization. We put it here
 // since we are not able to run this test on GH's CI: for some unknown reason,
 // we are not able to reference symbols in the vulkan directory, hence the build
 // on GH fails. Moving the test here so we are still able to run it on

@@ -3566,7 +3566,7 @@ TEST_F(VulkanAPITest, extract_texel_test) {
 // is the channel count.
 // We always start a new batch on a new z. Hence, when c cannot be divided by
 // 4, there are some undefined values in the padding area. We use -1 to
-// indicate that we are not performing comparsion on those values.
+// indicate that we are not performing comparison on those values.
 std::tuple<ivec3, ivec4> test_cases[]{
 {{0, 0, 0}, {0, hw, 2 * hw, 3 * hw}},
 {{1, 0, 0}, {1, hw + 1, 2 * hw + 1, 3 * hw + 1}},

@@ -3672,7 +3672,7 @@ TEST_F(VulkanAPITest, channel_to_width_packing_test) {
 at::Tensor output = at::native::vulkan::ops::convert(v_output);
 
 // This tensor will be width-packed. Meaning that each texel represent
-// consecutive elements along the width dimension. The differece between
+// consecutive elements along the width dimension. The difference between
 // consecutive texels is 1.
 std::tuple<ivec3, ivec4> test_cases[]{
 {{0, 0, 0}, {0, 1, 2, 3}},
@@ -12,7 +12,7 @@ namespace at::xpu {
 * must match the same device.
 *
 * Currently, XPUEvent does NOT support to export an inter-process event from
-* another process via inter-process comunication(IPC). So it means that
+* another process via inter-process communication(IPC). So it means that
 * inter-process communication for event handles between different processes is
 * not available. This could impact some applications that rely on cross-process
 * synchronization and communication.
@@ -8,7 +8,7 @@ multiple variants of the library, summarized here:
 * THC = TorcH Cuda
 * THCS = TorcH Cuda Sparse (now defunct)
 * THNN = TorcH Neural Network (now defunct)
-* THS = TorcH Sparse (now defunct)
+* THS = TorcH Sparse (now defunct) <!-- codespell:ignore -->
 
 (You'll also see these abbreviations show up in symbol names.)
 