Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
[BE][3/5] fix typos in aten/ (aten/src/ATen/native/) (#157552)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157552
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637, #157550, #157551
This commit is contained in: parent f57ef62ebc, commit d5af0eca8d

@@ -1162,7 +1162,6 @@ exclude_patterns = [
 # These files are all grandfathered in, feel free to remove from this list
 # as necessary
 # NOTE: remove the patterns in the order they are listed
-'aten/src/ATen/native/q*/**',
 'aten/src/ATen/native/[a-pA-P]*/**',
 'aten/src/ATen/[a-mA-M]*/**',
 'test/**',

@@ -81,7 +81,7 @@ DynamicQuantMatmul::DynamicQuantMatmul(
 auto src_q_tensor_info = arm_compute::TensorInfo(
 arm_compute::TensorShape(weight_dim_0, m),
 1,
-// ACL dyanamically quantized matmuls only support (signed) int8_t
+// ACL dynamically quantized matmuls only support (signed) int8_t
 arm_compute::DataType::QASYMM8_SIGNED,
 // TODO: setting the initial offset value to int8_t max instead of zero,
 // because ACL currently skips MatrixBReduction calculation if the

@@ -456,7 +456,7 @@ make_zero_points_and_scales_tensor(
 uint32_t groups = 1) {
 const int out_ch_idx = transpose ? 1 : 0;
 const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
-// Add 8 to account for bufferring needed by QNNPACK.
+// Add 8 to account for buffering needed by QNNPACK.
 const auto num_output_channels_padded = num_output_channels + kPaddingChannels;
 const auto qtype = weight_contig.qscheme();
 std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);

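As a side note on the hunk above: a minimal, hypothetical sketch of the padding it describes, assuming kPaddingChannels is the 8 extra entries QNNPACK reads past the last output channel (names are illustrative, not the actual PyTorch helper):

    #include <cstdint>
    #include <vector>

    // Illustrative only: size the zero-point buffer with 8 extra entries so
    // QNNPACK's over-reads past the last output channel stay in bounds.
    constexpr uint32_t kPaddingChannels = 8;  // assumed value, per the comment above

    std::vector<uint8_t> make_padded_zero_points(uint32_t num_output_channels,
                                                 uint8_t zero_point) {
      const uint32_t padded = num_output_channels + kPaddingChannels;
      // Real channels and padding all get the same zero point, so any read of
      // the padded tail is still well defined.
      return std::vector<uint8_t>(padded, zero_point);
    }
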
@@ -366,7 +366,7 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>(
 #endif // USE_FBGEMM

 namespace {
-// This is really terrible, but couldnt figure out a better way to constexpr convert int to
+// This is really terrible, but couldn't figure out a better way to constexpr convert int to
 // string and then perform string concatenation on/with it
 constexpr const char* _hack_int_to_class_name(int x) {
 switch(x) {

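For readers unfamiliar with the trick referenced in this hunk, here is a minimal sketch of a switch-based constexpr int-to-string mapping (hypothetical names, not the real _hack_int_to_class_name):

    #include <cstdio>

    // Map a small integer to a string literal at compile time via a switch.
    constexpr const char* spatial_dim_name(int x) {
      switch (x) {
        case 2: return "2d";
        case 3: return "3d";
        default: return "?";
      }
    }

    int main() {
      // Evaluated at compile time; usable wherever a constexpr const char* fits.
      constexpr const char* name = spatial_dim_name(3);
      std::printf("Conv%s\n", name);  // prints "Conv3d"
    }
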
@@ -1277,7 +1277,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
 float sum_scale = has_accum ? accum.value().q_scale() : 1.0;
 int32_t sum_zero_point = has_accum ? accum.value().q_zero_point() : 0;
 if (has_accum) {
-// Just tells we have these post op, the actual value such as scale and zero point will be setted later.
+// Just tells we have these post op, the actual value such as scale and zero point will be set later.
 op_attr = kReluFused ? ideep::attr_t::residual_with_sum_zero_point() : ideep::attr_t::fuse_sum();
 const ideep::scale_t accum_scale = ideep::scale_t(1, 1.0/sum_scale);
 const ideep::zero_point_t accum_zero_points = ideep::zero_point_t(1, sum_zero_point);

@@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
 CMAKE_ARGS+=("-DANDROID_STL=c++_static")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/android/arm64-v8a && cmake ../../.. \

@@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
 CMAKE_ARGS+=("-DANDROID_STL=c++_static")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/android/armeabi-v7a && cmake ../../.. \

@@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
 CMAKE_ARGS+=("-DANDROID_STL=c++_static")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/android/x86 && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=arm64")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/arm64 && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=arm64e")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/arm64e && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=armv7")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/armv7 && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=armv7s")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/armv7s && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=i386")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/i386 && cmake ../../.. \

@@ -45,7 +45,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=x86_64")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/x86_64 && cmake ../../.. \

@@ -27,7 +27,7 @@ CMAKE_ARGS+=("-DPYTORCH_QNNPACK_LIBRARY_TYPE=static")
 CMAKE_ARGS+=("-DPYTORCH_QNNPACK_BUILD_BENCHMARKS=ON")
 CMAKE_ARGS+=("-DPYTORCH_QNNPACK_BUILD_TESTS=ON")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/local && cmake ../.. \

@@ -368,7 +368,7 @@ static enum pytorch_qnnp_status pytorch_qnnp_create_convolution_ndhwc_q8(
 case pytorch_qnnp_ukernel_type_xzp_gemm: {
 // TODO: XZP kernels won't be supporting per channel quantization.
 // For now we dont use XZP kernels anywhere. Probably deprecate it for now
-// and ressurrect later if needed.
+// and resurrect later if needed.
 const uint32_t nr = pytorch_qnnp_params.q8conv_xzp.nr;
 const uint32_t kr = pytorch_qnnp_params.q8conv_xzp.kr;
 const uint32_t sr = pytorch_qnnp_params.q8conv_xzp.kc;

@@ -20,28 +20,28 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |a | 0
-# |w | 4
-# |c | 8
-# |c_stride | 12
-# |out ch indx| 16
-# |params | 20
-# |-----------|
+# |------------|
+# |a | 0
+# |w | 4
+# |c | 8
+# |c_stride | 12
+# |out ch index| 16
+# |params | 20
+# |------------|
 #

 # After loading w pointer in ip reg.
 # And after pushing r4-r8 and d8-d15 on stack
-# |-----------|
-# |d8 - d15 | 0
-# |r4 - r11 | 64
-# |a | 96
-# |w | 100
-# |c | 104
-# |c_stride | 108
-# |out ch indx| 112
-# |params | 116
-# |-----------|
+# |------------|
+# |d8 - d15 | 0
+# |r4 - r11 | 64
+# |a | 96
+# |w | 100
+# |c | 104
+# |c_stride | 108
+# |out ch index| 112
+# |params | 116
+# |------------|
 #

 # void pytorch_q8conv_ukernel_4x8__aarch32_neon(

@@ -23,10 +23,10 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |out ch indx| 0
-# |params | 8
-# |-----------|
+# |------------|
+# |out ch index| 0
+# |params | 8
+# |------------|

 # void pytorch_q8conv_ukernel_8x8__aarch64_neon(
 # size_t mr,

@@ -20,28 +20,28 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |a_stride | 0
-# |w | 4
-# |c | 8
-# |c_stride | 12
-# |out ch indx| 16
-# |params | 20
-# |-----------|
+# |------------|
+# |a_stride | 0
+# |w | 4
+# |c | 8
+# |c_stride | 12
+# |out ch index| 16
+# |params | 20
+# |------------|
 #

 # After loading w pointer in ip reg.
 # And after pushing r4-r9 and d8-d15 on stack
-# |-----------|
-# |d8 - d15 | 0
-# |r4 - r9 | 64
-# |a_stride | 88
-# |w | 92
-# |c | 96
-# |c_stride | 100
-# |out ch indx| 104
-# |params | 108
-# |-----------|
+# |------------|
+# |d8 - d15 | 0
+# |r4 - r9 | 64
+# |a_stride | 88
+# |w | 92
+# |c | 96
+# |c_stride | 100
+# |out ch index| 104
+# |params | 108
+# |------------|
 #

 #

@@ -33,29 +33,29 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |a_stride | 0
-# |w | 4
-# |c | 8
-# |c_stride | 12
-# |out ch indx| 16
-# |params | 20
-# |-----------|
+# |------------|
+# |a_stride | 0
+# |w | 4
+# |c | 8
+# |c_stride | 12
+# |out ch index| 16
+# |params | 20
+# |------------|
 #

 # After loading w pointer in ip reg.
 # And after pushing r4-r8 and d8-d15 on stack
-# |-----------|
-# |d8 - d15 | 0
-# |r4 - r7 | 64
-# |a_stride | 80
-# |w | 84
-# |b | 88
-# |c | 92
-# |c_stride | 96
-# |out ch indx| 100
-# |params | 104
-# |-----------|
+# |------------|
+# |d8 - d15 | 0
+# |r4 - r7 | 64
+# |a_stride | 80
+# |w | 84
+# |b | 88
+# |c | 92
+# |c_stride | 96
+# |out ch index| 100
+# |params | 104
+# |------------|
 #

 # void pytorch_q8gemm_ukernel_4x8__aarch32_neon(

@@ -22,10 +22,10 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |out ch indx| 0
-# |params | 8
-# |-----------|
+# |------------|
+# |out ch index| 0
+# |params | 8
+# |------------|

 # void pytorch_q8gemm_ukernel_8x8__aarch64_neon(
 # size_t mr,

@@ -14,11 +14,11 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |c_stride | 0
-# |out ch indx| 8
-# |params | 16
-# |-----------|
+# |------------|
+# |c_stride | 0
+# |out ch index| 8
+# |params | 16
+# |------------|

 # void pytorch_q8gemm_dq_ukernel_8x8__aarch64_neon(
 # size_t mr,

@@ -32,7 +32,7 @@
 #

 # Packed A format.
-# 4kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+# 4kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 # Original A
 # --------- K ----------- -- (K + 4 - 1) / 4 --
 # | | | |
@@ -53,7 +53,7 @@
 # This locality helps in loading 8kx4m blocks of activations
 # Note when M is not multiple of 4, the rest can contain arbitrary
 # data in packed A as we will not be writing those out.
-# This wil be taken care by just copying the appropriate valid data
+# This will be taken care by just copying the appropriate valid data

 # Also note that this packing is same as taking for 4x1 pattern.
 # This is because all the adjacent k's are laid next to each other
@@ -109,7 +109,7 @@ k_loop:
 VLD1.8 {d2}, [r6]!
 VLD1.8 {d3}, [r7]!

-# Now we have 4x8 block of values that we will tranpose
+# Now we have 4x8 block of values that we will transpose
 # A matrix
 # --------------------------------
 # | |
@@ -155,7 +155,7 @@ k_loop:
 VTRN.32 d2, d3
 VSWP d1, d2

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 VST1.8 {q0}, [r2]!
 VST1.8 {q1}, [r2]!
@@ -172,7 +172,7 @@ k_loop:
 VLD1.32 {d2[]}, [r6]
 VLD1.32 {d3[]}, [r7]

-# Now we have 4x8 block of values that we will tranpose
+# Now we have 4x8 block of values that we will transpose
 # _d{0-3} are arm neon vector registers
 # va0 = _d0 = a0 a1 a2 a3
 # va1 = _d1 = b0 b1 b2 b3
@@ -218,7 +218,7 @@ k_loop:
 VEXT.8 d0, d0, d1, #4
 VEXT.8 d1, d2, d3, #4

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 VST1.8 {q0}, [r2]
 .p2align 4

@@ -46,7 +46,7 @@
 # |b | 12
 # |c | 16
 # |c_stride | 20
-# |out ch indx | 24
+# |out ch index | 24
 # |params | 28
 # |----------------|
 #
@@ -61,7 +61,7 @@
 # |b | 108
 # |c | 112
 # |c_stride | 116
-# |out ch indx | 120
+# |out ch index | 120
 # |params | 124
 # |----------------|
 #
@@ -101,7 +101,7 @@
 /* Add output_channel_index to the b_zero_point pointer */ ;\
 ADD r4, r4, r5 ;\
 ;\
-/* We enter the loop if r1 is atleast 1. */ ;\
+/* We enter the loop if r1 is at least 1. */ ;\
 /* r1 = r1 - 1 will happen in the epilogue */ ;\
 /* of the loop */ ;\
 CMP r1, 1 ;\
@@ -222,7 +222,7 @@
 /* Thus we will load accumulators back in q0, q1, q2, q3, q4, q5, q6, q7 */ ;\
 /* When nr < 4, extra q values will be fetched from stack which may overlap */ ;\
 /* with other parts of stack storing local variables. To avoid that we just */ ;\
-/* create a buffer of 128 bytes inbetween to make sure pointer increment */ ;\
+/* create a buffer of 128 bytes in between to make sure pointer increment */ ;\
 /* never produces address that is beyond the stack frame of this function. */ ;\
 SUB r9, sp, 140 ;\
 /* Each iteration produce 4 values each of 4 bytes */ ;\

@@ -46,7 +46,7 @@
 # |b | 12
 # |c | 16
 # |c_stride | 20
-# |out ch indx | 24
+# |out ch index | 24
 # |params | 28
 # |----------------|
 #
@@ -61,7 +61,7 @@
 # |b | 108
 # |c | 112
 # |c_stride | 116
-# |out ch indx | 120
+# |out ch index | 120
 # |params | 124
 # |----------------|
 #

@@ -32,7 +32,7 @@
 #

 # Packed A format.
-# 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+# 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 # Original A
 # --------- K ----------- -- (K + 4 - 1) / 4 --
 # | | | |
@@ -53,7 +53,7 @@
 # This locality helps in loading 8kx8m blocks of activations
 # Note when M is not multiple of 8, the rest can contain arbitrary
 # data in packed A as we will not be writing those out.
-# This wil be taken care by just copying the appropriate valid data
+# This will be taken care by just copying the appropriate valid data

 # void pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch32_neon(
 # size_t mr,
@@ -125,7 +125,7 @@ k_loop:
 VLD1.8 {d6}, [r10]!
 VLD1.8 {d7}, [r11]!

-# Now we have 8x8 block of values that we will tranpose
+# Now we have 8x8 block of values that we will transpose
 # A matrix
 # --------------------------------
 # | |
@@ -189,7 +189,7 @@ k_loop:
 VTRN.32 q0, q2
 VTRN.32 q1, q3

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 # then d4, d5, d6, d7 contiguously
 VST1.8 {q0}, [r2]!
@@ -213,7 +213,7 @@ k_loop:
 VLD1.32 {d6[]}, [r7]
 VLD1.32 {d7[]}, [r11]

-# Now we have 4x8 block of values that we will tranpose
+# Now we have 4x8 block of values that we will transpose
 # _d{0-3} are arm neon vector registers
 # va04 = _d0 = a0 a1 a2 a3 e0 e1 e2 e3
 # va15 = _d1 = b0 b1 b2 b3 f0 f1 f2 f3
@@ -260,7 +260,7 @@ k_loop:
 VTRN.16 d0, d2
 VTRN.16 d1, d3

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 # then d4, d5, d6, d7 contiguously
 VST1.8 {q0}, [r2]!

@@ -9,7 +9,7 @@
 #include <qnnpack/assembly.h>

 # Packed A format.
-# 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+# 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 # Original A
 # --------- K ----------- -- (K + 4 - 1) / 4 --
 # | | | |
@@ -30,7 +30,7 @@
 # This locality helps in loading 8kx8m blocks of activations
 # Note when M is not multiple of 8, the rest can contain arbitrary
 # data in packed A as we will not be writing those out.
-# This wil be taken care by just copying the appropriate valid data
+# This will be taken care by just copying the appropriate valid data

 # void pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch32_neon(
 # size_t mr,
@@ -93,7 +93,7 @@ k_loop:
 LD1 {v3.d}[0], [x7], 8
 LD1 {v3.d}[1], [x11], 8

-# Now we have 8x8 block of values that we will tranpose
+# Now we have 8x8 block of values that we will transpose
 # A matrix
 # ------------------------
 # | |
@@ -180,7 +180,7 @@ k_loop:
 LD1 {v3.s}[0], [x7]
 LD1 {v3.s}[1], [x11]

-# Now we have 8x4 block of values that we will tranpose
+# Now we have 8x4 block of values that we will transpose
 # A matrix
 # ----------------------------
 # | |

@@ -14,7 +14,7 @@
 #include "8x4c1x4-packed-sse2.h"

 // This is a super slow kernel in that it does not use intrinsics to
-// tranpose. Since this is for x86 we are not optimizing it.
+// transpose. Since this is for x86 we are not optimizing it.
 // For ARM this will be optimized.
 void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
 const size_t mr,
@@ -24,7 +24,7 @@ void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
 uint8_t* a_packed) {

 // Packed A format.
-// 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+// 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 // Original A
 // --------- K ----------- -- (K + 4 - 1) / 4 --
 // | | | |
@@ -45,7 +45,7 @@ void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
 // This locality helps in loading 8kx8m blocks of activations
 // Note when M is not multiple of 8, the rest can contain arbitrary
 // data in packed A as we will not be writing those out.
-// This wil be taken care by just copying the appropriate valid data
+// This will be taken care by just copying the appropriate valid data

 // Note that parts of A that are not filled are:
 // Remainder of M blocks. So some m values are random. This is ok

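To make the packed-A layout discussed in these kernels concrete, here is a simplified, hypothetical C++ sketch of blocked packing with a zero-padded remainder; block sizes are parameters here, whereas the real ukernels hard-code them and may leave the remainder uninitialized as noted above:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Copy an m x k row-major matrix (leading dimension lda) into contiguous
    // mb x kb tiles so each tile's elements are adjacent in memory. Remainder
    // rows/columns are zero-padded here; the kernels above merely never write
    // those positions back out.
    std::vector<uint8_t> pack_blocked(const uint8_t* a, size_t m, size_t k,
                                      size_t lda, size_t mb, size_t kb) {
      const size_t m_blocks = (m + mb - 1) / mb;
      const size_t k_blocks = (k + kb - 1) / kb;
      std::vector<uint8_t> packed(m_blocks * k_blocks * mb * kb, 0);
      for (size_t bm = 0; bm < m_blocks; ++bm) {
        for (size_t bk = 0; bk < k_blocks; ++bk) {
          // Tiles are laid out block-row by block-row.
          uint8_t* dst = packed.data() + (bm * k_blocks + bk) * mb * kb;
          for (size_t i = 0; i < mb && bm * mb + i < m; ++i) {
            for (size_t j = 0; j < kb && bk * kb + j < k; ++j) {
              dst[i * kb + j] = a[(bm * mb + i) * lda + (bk * kb + j)];
            }
          }
        }
      }
      return packed;
    }
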
@@ -47,7 +47,7 @@ void KERNEL_NAME(
 const __m128i vzero = _mm_setzero_si128();

 // Packed A format.
-// 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+// 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 // Original A
 // --------- K ----------- -- (K + 4 - 1) / 4 --
 // | | | |
@@ -68,7 +68,7 @@ void KERNEL_NAME(
 // This locality helps in loading 8kx8m blocks of activations
 // Note when M is not multiple of 8, the rest can contain arbitrary
 // data in packed A as we will not be writing those out.
-// This wil be taken care by just copying the appropriate valid data
+// This will be taken care by just copying the appropriate valid data

 __m128i vacc_low[4];
 __m128i vacc_high[4];

@@ -42,11 +42,11 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |c_stride | 0
-# |out ch indx| 8
-# |params | 16
-# |-----------|
+# |------------|
+# |c_stride | 0
+# |out ch index| 8
+# |params | 16
+# |------------|

 # void pytorch_q8gemm_dq_sparse_1x4_ukernel_8x8_packedA_w##W_INDEX_DTYPE_NUM_BITS##__aarch64_neon(
 # size_t mr,
@@ -234,7 +234,7 @@
 /* v16, v17, v18, v19, v20, v21, v22, v23 */ XX\
 /* When nr < 8, say nr = 1, extra v values will be fetched from stack which may overlap */ XX\
 /* with other parts of stack storing local variables. To avoid that we just */ XX\
-/* create a buffer of 256 bytes inbetween to make sure pointer increment */ XX\
+/* create a buffer of 256 bytes in between to make sure pointer increment */ XX\
 /* never produces address that is beyond the stack frame of this function. */ XX\
 SUB x9, sp, 320 XX\
 /* Each iteration produce 8 values each of 4 bytes */ XX\
@@ -287,7 +287,7 @@
 LD1 {v22.4s}, [x9], 16 XX\
 LD1 {v23.4s}, [x9] XX\
 XX\
-/* We can tranpose one 4x4 block using macro */ XX\
+/* We can transpose one 4x4 block using macro */ XX\
 /* TRANSPOSE_4X4_S32 v8, v10, v12, v14, v0, v1, v2, v3 */ XX\
 /* After this we have */ XX\
 /* v8 : x00, x01, x02, x03 */ XX\
@@ -302,7 +302,7 @@
 /* v20 : x24, x25, x26, x27 */ XX\
 /* v22 : x34, x35, x36, x37 */ XX\
 /* Similarly we can transpose other two 4x4 blocks and we get */ XX\
-/* tranposed 8x8 */ XX\
+/* transposed 8x8 */ XX\
 XX\
 TRANSPOSE_4X4_S32 v8, v10, v12, v14, v0, v1, v2, v3 XX\
 TRANSPOSE_4X4_S32 v16, v18, v20, v22, v4, v5, v6, v7 XX\

@@ -31,11 +31,11 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |c_stride | 0
-# |out ch indx| 8
-# |params | 16
-# |-----------|
+# |------------|
+# |c_stride | 0
+# |out ch index| 8
+# |params | 16
+# |------------|

 # void pytorch_q8gemm_dq_sparse_8x1_ukernel_8x8_packedA_w##W_INDEX_DTYPE_NUM_BITS##__aarch64_neon(
 # size_t mr,

@@ -238,7 +238,7 @@ static inline void pytorch_pack_q8conv_wrq(
 }
 }
 if (kzp != 0) {
-// This part fills the packed wights with zero points for output channels
+// This part fills the packed weights with zero points for output channels
 // when they are not divisible by nr blocking parameter.
 // In that case
 for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size);
@@ -360,7 +360,7 @@ static inline void pytorch_pack_q8deconv_wrq(
 }
 }
 if (kzp != 0) {
-// This part fills the packed wights with zero points for output channels
+// This part fills the packed weights with zero points for output channels
 // when they are not divisible by nr blocking parameter.
 // In that case
 for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size);

@@ -93,7 +93,7 @@ void pytorch_qnnp_requantize_q31__scalar(
 * overflow is possible only when input is positive, and even when addition
 * of a rounding constant overflows 32-bit signed integer, it still doesn't
 * overflow 32-bit unsigned integer. Thus, in case of signed overflow, we
-* can compute the result using unsigned arithmetics, specifically using
+* can compute the result using unsigned arithmetic, specifically using
 * logical shift right instead of arithmetic shift right.
 * 3. Performs arithmetic shift as is, which will produce division result
 * rounded down. Then compute remainder of this division by a power of 2,

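The unsigned-arithmetic fallback described above can be illustrated with a small standalone sketch (not the QNNPACK implementation): a rounding right shift for a non-negative value, where the rounding addend could overflow int32_t but never uint32_t:

    #include <cstdint>

    // Rounding right shift for a non-negative value x. Adding the rounding
    // constant may overflow int32_t when x is near INT32_MAX, but cannot
    // overflow uint32_t, so the sum is formed in unsigned arithmetic and a
    // logical shift is used; for non-negative x this matches the intended
    // arithmetic-shift result.
    int32_t rounding_shift_right_nonneg(int32_t x, uint32_t shift) {
      const uint32_t rounding = shift > 0 ? (uint32_t{1} << (shift - 1)) : 0;
      return static_cast<int32_t>((static_cast<uint32_t>(x) + rounding) >> shift);
    }
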
@@ -17,7 +17,7 @@
 #include "requantization-tester.h"

 /*
-* Precise scalar implementation using unsigned 32-bit arithmetics.
+* Precise scalar implementation using unsigned 32-bit arithmetic.
 */

 TEST(PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
@@ -83,7 +83,7 @@ TEST(PRECISE__SCALAR_UNSIGNED32, random_cases) {
 }

 /*
-* Precise scalar implementation using unsigned 64-bit arithmetics.
+* Precise scalar implementation using unsigned 64-bit arithmetic.
 */

 TEST(PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
@@ -149,7 +149,7 @@ TEST(PRECISE__SCALAR_UNSIGNED64, random_cases) {
 }

 /*
-* Precise scalar implementation using signed 64-bit arithmetics.
+* Precise scalar implementation using signed 64-bit arithmetic.
 */

 TEST(PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
@@ -302,7 +302,7 @@ TEST(GEMMLOWP__SCALAR, random_cases) {
 }

 /*
-* Precise PSIMD implementation using unsigned 32-bit arithmetics.
+* Precise PSIMD implementation using unsigned 32-bit arithmetic.
 */

 TEST(PRECISE__PSIMD, exact_divide_by_po2) {

@@ -171,7 +171,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_output,
 return;
 }

-// linear_op computes act_int8 * tranpose(w_int8) (matrix multiplication)
+// linear_op computes act_int8 * transpose(w_int8) (matrix multiplication)
 // where act_int8 and w_int8 are the input and weight variables, resp.
 // output is a fp32 tensor
 auto linear_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR)

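As general background for the comment above (a hedged sketch, not the cuDNN path itself): once the int8 activation-times-transposed-weight product is accumulated, the result is typically requantized with a combined scale; the sketch below assumes per-tensor scales and that zero points were already folded into the accumulator:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Requantize one accumulator value: scale by act_scale * w_scale / out_scale,
    // add the output zero point, and clamp to the int8 range. Assumes input and
    // weight zero points were already handled (e.g. folded into a bias term).
    int8_t requantize(int32_t acc, float act_scale, float w_scale,
                      float out_scale, int32_t out_zero_point) {
      const float multiplier = act_scale * w_scale / out_scale;
      const int32_t q =
          static_cast<int32_t>(std::lround(acc * multiplier)) + out_zero_point;
      return static_cast<int8_t>(std::min(127, std::max(-128, q)));
    }
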
@@ -54,7 +54,7 @@ void check_maxpool2d_params(
 Tensor adaptive_avg_pool2d_quantized_cuda(
 const at::Tensor& input,
 IntArrayRef output_size) {
-// TODO: renable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn
+// TODO: re-enable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn
 #ifdef USE_CUDA
 // #if AT_CUDNN_ENABLED()
 // TODO: limit this to per tensor quantized tensors for now, though should be easy to adapt

@@ -22,6 +22,7 @@ froms
Halfs
hsa
indexT
inH
inp
inps
inpt
@@ -57,5 +58,6 @@ strat
supercede
supercedes
te
THW
tne
WONT