[BE][3/5] fix typos in aten/ (aten/src/ATen/native/) (#157552)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157552
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637, #157550, #157551
Author: Xuehai Pan, 2025-07-17 14:55:18 +08:00 (committed by PyTorch MergeBot)
Parent: f57ef62ebc
Commit: d5af0eca8d
37 changed files with 134 additions and 133 deletions

View File

@ -1162,7 +1162,6 @@ exclude_patterns = [
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/q*/**',
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',

View File

@ -81,7 +81,7 @@ DynamicQuantMatmul::DynamicQuantMatmul(
auto src_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_0, m),
1,
// ACL dyanamically quantized matmuls only support (signed) int8_t
// ACL dynamically quantized matmuls only support (signed) int8_t
arm_compute::DataType::QASYMM8_SIGNED,
// TODO: setting the initial offset value to int8_t max instead of zero,
// because ACL currently skips MatrixBReduction calculation if the

View File

@ -456,7 +456,7 @@ make_zero_points_and_scales_tensor(
uint32_t groups = 1) {
const int out_ch_idx = transpose ? 1 : 0;
const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
// Add 8 to account for bufferring needed by QNNPACK.
// Add 8 to account for buffering needed by QNNPACK.
const auto num_output_channels_padded = num_output_channels + kPaddingChannels;
const auto qtype = weight_contig.qscheme();
std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);
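Context for the hunk above: the kPaddingChannels padding lets QNNPACK kernels, which process output channels in fixed-size blocks, read a full block of per-channel zero points and scales without running past the end of the buffers. A minimal sketch of that allocation pattern follows; the helper name is hypothetical and the padding amount is taken from the comment, not from a verified constant.

#include <cstdint>
#include <utility>
#include <vector>

constexpr int64_t kPaddingChannels = 8;  // "Add 8 to account for buffering needed by QNNPACK."

// Hypothetical helper: allocate per-channel quantization parameters with extra
// tail slots (zero/one filled) so block-wise reads stay in bounds.
std::pair<std::vector<uint8_t>, std::vector<float>> make_padded_qparams(
    int64_t num_output_channels) {
  const int64_t padded = num_output_channels + kPaddingChannels;
  std::vector<uint8_t> zero_points(static_cast<size_t>(padded), 0);
  std::vector<float> scales(static_cast<size_t>(padded), 1.0f);
  return {std::move(zero_points), std::move(scales)};
}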

View File

@ -366,7 +366,7 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>(
#endif // USE_FBGEMM
namespace {
// This is really terrible, but couldnt figure out a better way to constexpr convert int to
// This is really terrible, but couldn't figure out a better way to constexpr convert int to
// string and then perform string concatenation on/with it
constexpr const char* _hack_int_to_class_name(int x) {
switch(x) {

View File

@ -1277,7 +1277,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
float sum_scale = has_accum ? accum.value().q_scale() : 1.0;
int32_t sum_zero_point = has_accum ? accum.value().q_zero_point() : 0;
if (has_accum) {
// Just tells we have these post op, the actual value such as scale and zero point will be setted later.
// Just tells we have these post op, the actual value such as scale and zero point will be set later.
op_attr = kReluFused ? ideep::attr_t::residual_with_sum_zero_point() : ideep::attr_t::fuse_sum();
const ideep::scale_t accum_scale = ideep::scale_t(1, 1.0/sum_scale);
const ideep::zero_point_t accum_zero_points = ideep::zero_point_t(1, sum_zero_point);

View File

@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
CMAKE_ARGS+=("-DANDROID_STL=c++_static")
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/android/arm64-v8a && cmake ../../.. \

View File

@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
CMAKE_ARGS+=("-DANDROID_STL=c++_static")
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/android/armeabi-v7a && cmake ../../.. \

View File

@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
CMAKE_ARGS+=("-DANDROID_STL=c++_static")
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/android/x86 && cmake ../../.. \

View File

@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=arm64")
CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
CMAKE_ARGS+=("-DENABLE_ARC=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/ios/arm64 && cmake ../../.. \

View File

@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=arm64e")
CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
CMAKE_ARGS+=("-DENABLE_ARC=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/ios/arm64e && cmake ../../.. \

View File

@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=armv7")
CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
CMAKE_ARGS+=("-DENABLE_ARC=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/ios/armv7 && cmake ../../.. \

View File

@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=armv7s")
CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
CMAKE_ARGS+=("-DENABLE_ARC=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/ios/armv7s && cmake ../../.. \

View File

@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=i386")
CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
CMAKE_ARGS+=("-DENABLE_ARC=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/ios/i386 && cmake ../../.. \

View File

@ -45,7 +45,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=x86_64")
CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
CMAKE_ARGS+=("-DENABLE_ARC=OFF")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/ios/x86_64 && cmake ../../.. \

View File

@ -27,7 +27,7 @@ CMAKE_ARGS+=("-DPYTORCH_QNNPACK_LIBRARY_TYPE=static")
CMAKE_ARGS+=("-DPYTORCH_QNNPACK_BUILD_BENCHMARKS=ON")
CMAKE_ARGS+=("-DPYTORCH_QNNPACK_BUILD_TESTS=ON")
# Use-specified CMake arguments go last to allow overridding defaults
# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)
cd build/local && cmake ../.. \

View File

@ -368,7 +368,7 @@ static enum pytorch_qnnp_status pytorch_qnnp_create_convolution_ndhwc_q8(
case pytorch_qnnp_ukernel_type_xzp_gemm: {
// TODO: XZP kernels won't be supporting per channel quantization.
// For now we dont use XZP kernels anywhere. Probably deprecate it for now
// and ressurrect later if needed.
// and resurrect later if needed.
const uint32_t nr = pytorch_qnnp_params.q8conv_xzp.nr;
const uint32_t kr = pytorch_qnnp_params.q8conv_xzp.kr;
const uint32_t sr = pytorch_qnnp_params.q8conv_xzp.kc;

View File

@ -20,28 +20,28 @@
# Args passed via stack.
# TOS
# |-----------|
# |a | 0
# |w | 4
# |c | 8
# |c_stride | 12
# |out ch indx| 16
# |params | 20
# |-----------|
# |------------|
# |a | 0
# |w | 4
# |c | 8
# |c_stride | 12
# |out ch index| 16
# |params | 20
# |------------|
#
# After loading w pointer in ip reg.
# And after pushing r4-r8 and d8-d15 on stack
# |-----------|
# |d8 - d15 | 0
# |r4 - r11 | 64
# |a | 96
# |w | 100
# |c | 104
# |c_stride | 108
# |out ch indx| 112
# |params | 116
# |-----------|
# |------------|
# |d8 - d15 | 0
# |r4 - r11 | 64
# |a | 96
# |w | 100
# |c | 104
# |c_stride | 108
# |out ch index| 112
# |params | 116
# |------------|
#
# void pytorch_q8conv_ukernel_4x8__aarch32_neon(

View File

@ -23,10 +23,10 @@
# Args passed via stack.
# TOS
# |-----------|
# |out ch indx| 0
# |params | 8
# |-----------|
# |------------|
# |out ch index| 0
# |params | 8
# |------------|
# void pytorch_q8conv_ukernel_8x8__aarch64_neon(
# size_t mr,

View File

@ -20,28 +20,28 @@
# Args passed via stack.
# TOS
# |-----------|
# |a_stride | 0
# |w | 4
# |c | 8
# |c_stride | 12
# |out ch indx| 16
# |params | 20
# |-----------|
# |------------|
# |a_stride | 0
# |w | 4
# |c | 8
# |c_stride | 12
# |out ch index| 16
# |params | 20
# |------------|
#
# After loading w pointer in ip reg.
# And after pushing r4-r9 and d8-d15 on stack
# |-----------|
# |d8 - d15 | 0
# |r4 - r9 | 64
# |a_stride | 88
# |w | 92
# |c | 96
# |c_stride | 100
# |out ch indx| 104
# |params | 108
# |-----------|
# |------------|
# |d8 - d15 | 0
# |r4 - r9 | 64
# |a_stride | 88
# |w | 92
# |c | 96
# |c_stride | 100
# |out ch index| 104
# |params | 108
# |------------|
#
#

View File

@ -33,29 +33,29 @@
# Args passed via stack.
# TOS
# |-----------|
# |a_stride | 0
# |w | 4
# |c | 8
# |c_stride | 12
# |out ch indx| 16
# |params | 20
# |-----------|
# |------------|
# |a_stride | 0
# |w | 4
# |c | 8
# |c_stride | 12
# |out ch index| 16
# |params | 20
# |------------|
#
# After loading w pointer in ip reg.
# And after pushing r4-r8 and d8-d15 on stack
# |-----------|
# |d8 - d15 | 0
# |r4 - r7 | 64
# |a_stride | 80
# |w | 84
# |b | 88
# |c | 92
# |c_stride | 96
# |out ch indx| 100
# |params | 104
# |-----------|
# |------------|
# |d8 - d15 | 0
# |r4 - r7 | 64
# |a_stride | 80
# |w | 84
# |b | 88
# |c | 92
# |c_stride | 96
# |out ch index| 100
# |params | 104
# |------------|
#
# void pytorch_q8gemm_ukernel_4x8__aarch32_neon(

View File

@ -22,10 +22,10 @@
# Args passed via stack.
# TOS
# |-----------|
# |out ch indx| 0
# |params | 8
# |-----------|
# |------------|
# |out ch index| 0
# |params | 8
# |------------|
# void pytorch_q8gemm_ukernel_8x8__aarch64_neon(
# size_t mr,

View File

@ -14,11 +14,11 @@
# Args passed via stack.
# TOS
# |-----------|
# |c_stride | 0
# |out ch indx| 8
# |params | 16
# |-----------|
# |------------|
# |c_stride | 0
# |out ch index| 8
# |params | 16
# |------------|
# void pytorch_q8gemm_dq_ukernel_8x8__aarch64_neon(
# size_t mr,

View File

@ -32,7 +32,7 @@
#
# Packed A format.
# 4kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
# 4kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
# Original A
# --------- K ----------- -- (K + 4 - 1) / 4 --
# | | | |
@ -53,7 +53,7 @@
# This locality helps in loading 8kx4m blocks of activations
# Note when M is not multiple of 4, the rest can contain arbitrary
# data in packed A as we will not be writing those out.
# This wil be taken care by just copying the appropriate valid data
# This will be taken care by just copying the appropriate valid data
# Also note that this packing is same as taking for 4x1 pattern.
# This is because all the adjacent k's are laid next to each other
@ -109,7 +109,7 @@ k_loop:
VLD1.8 {d2}, [r6]!
VLD1.8 {d3}, [r7]!
# Now we have 4x8 block of values that we will tranpose
# Now we have 4x8 block of values that we will transpose
# A matrix
# --------------------------------
# | |
@ -155,7 +155,7 @@ k_loop:
VTRN.32 d2, d3
VSWP d1, d2
# Now store the tranposed values
# Now store the transposed values
# d0, d1, d2, d3
VST1.8 {q0}, [r2]!
VST1.8 {q1}, [r2]!
@ -172,7 +172,7 @@ k_loop:
VLD1.32 {d2[]}, [r6]
VLD1.32 {d3[]}, [r7]
# Now we have 4x8 block of values that we will tranpose
# Now we have 4x8 block of values that we will transpose
# _d{0-3} are arm neon vector registers
# va0 = _d0 = a0 a1 a2 a3
# va1 = _d1 = b0 b1 b2 b3
@ -218,7 +218,7 @@ k_loop:
VEXT.8 d0, d0, d1, #4
VEXT.8 d1, d2, d3, #4
# Now store the tranposed values
# Now store the transposed values
# d0, d1, d2, d3
VST1.8 {q0}, [r2]
.p2align 4

View File

@ -46,7 +46,7 @@
# |b | 12
# |c | 16
# |c_stride | 20
# |out ch indx | 24
# |out ch index | 24
# |params | 28
# |----------------|
#
@ -61,7 +61,7 @@
# |b | 108
# |c | 112
# |c_stride | 116
# |out ch indx | 120
# |out ch index | 120
# |params | 124
# |----------------|
#
@ -101,7 +101,7 @@
/* Add output_channel_index to the b_zero_point pointer */ ;\
ADD r4, r4, r5 ;\
;\
/* We enter the loop if r1 is atleast 1. */ ;\
/* We enter the loop if r1 is at least 1. */ ;\
/* r1 = r1 - 1 will happen in the epilogue */ ;\
/* of the loop */ ;\
CMP r1, 1 ;\
@ -222,7 +222,7 @@
/* Thus we will load accumulators back in q0, q1, q2, q3, q4, q5, q6, q7 */ ;\
/* When nr < 4, extra q values will be fetched from stack which may overlap */ ;\
/* with other parts of stack storing local variables. To avoid that we just */ ;\
/* create a buffer of 128 bytes inbetween to make sure pointer increment */ ;\
/* create a buffer of 128 bytes in between to make sure pointer increment */ ;\
/* never produces address that is beyond the stack frame of this function. */ ;\
SUB r9, sp, 140 ;\
/* Each iteration produce 4 values each of 4 bytes */ ;\

View File

@ -46,7 +46,7 @@
# |b | 12
# |c | 16
# |c_stride | 20
# |out ch indx | 24
# |out ch index | 24
# |params | 28
# |----------------|
#
@ -61,7 +61,7 @@
# |b | 108
# |c | 112
# |c_stride | 116
# |out ch indx | 120
# |out ch index | 120
# |params | 124
# |----------------|
#

View File

@ -32,7 +32,7 @@
#
# Packed A format.
# 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
# 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
# Original A
# --------- K ----------- -- (K + 4 - 1) / 4 --
# | | | |
@ -53,7 +53,7 @@
# This locality helps in loading 8kx8m blocks of activations
# Note when M is not multiple of 8, the rest can contain arbitrary
# data in packed A as we will not be writing those out.
# This wil be taken care by just copying the appropriate valid data
# This will be taken care by just copying the appropriate valid data
# void pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch32_neon(
# size_t mr,
@ -125,7 +125,7 @@ k_loop:
VLD1.8 {d6}, [r10]!
VLD1.8 {d7}, [r11]!
# Now we have 8x8 block of values that we will tranpose
# Now we have 8x8 block of values that we will transpose
# A matrix
# --------------------------------
# | |
@ -189,7 +189,7 @@ k_loop:
VTRN.32 q0, q2
VTRN.32 q1, q3
# Now store the tranposed values
# Now store the transposed values
# d0, d1, d2, d3
# then d4, d5, d6, d7 contiguously
VST1.8 {q0}, [r2]!
@ -213,7 +213,7 @@ k_loop:
VLD1.32 {d6[]}, [r7]
VLD1.32 {d7[]}, [r11]
# Now we have 4x8 block of values that we will tranpose
# Now we have 4x8 block of values that we will transpose
# _d{0-3} are arm neon vector registers
# va04 = _d0 = a0 a1 a2 a3 e0 e1 e2 e3
# va15 = _d1 = b0 b1 b2 b3 f0 f1 f2 f3
@ -260,7 +260,7 @@ k_loop:
VTRN.16 d0, d2
VTRN.16 d1, d3
# Now store the tranposed values
# Now store the transposed values
# d0, d1, d2, d3
# then d4, d5, d6, d7 contiguously
VST1.8 {q0}, [r2]!

View File

@ -9,7 +9,7 @@
#include <qnnpack/assembly.h>
# Packed A format.
# 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
# 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
# Original A
# --------- K ----------- -- (K + 4 - 1) / 4 --
# | | | |
@ -30,7 +30,7 @@
# This locality helps in loading 8kx8m blocks of activations
# Note when M is not multiple of 8, the rest can contain arbitrary
# data in packed A as we will not be writing those out.
# This wil be taken care by just copying the appropriate valid data
# This will be taken care by just copying the appropriate valid data
# void pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch32_neon(
# size_t mr,
@ -93,7 +93,7 @@ k_loop:
LD1 {v3.d}[0], [x7], 8
LD1 {v3.d}[1], [x11], 8
# Now we have 8x8 block of values that we will tranpose
# Now we have 8x8 block of values that we will transpose
# A matrix
# ------------------------
# | |
@ -180,7 +180,7 @@ k_loop:
LD1 {v3.s}[0], [x7]
LD1 {v3.s}[1], [x11]
# Now we have 8x4 block of values that we will tranpose
# Now we have 8x4 block of values that we will transpose
# A matrix
# ----------------------------
# | |

View File

@ -14,7 +14,7 @@
#include "8x4c1x4-packed-sse2.h"
// This is a super slow kernel in that it does not use intrinsics to
// tranpose. Since this is for x86 we are not optimizing it.
// transpose. Since this is for x86 we are not optimizing it.
// For ARM this will be optimized.
void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
const size_t mr,
@ -24,7 +24,7 @@ void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
uint8_t* a_packed) {
// Packed A format.
// 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
// 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
// Original A
// --------- K ----------- -- (K + 4 - 1) / 4 --
// | | | |
@ -45,7 +45,7 @@ void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
// This locality helps in loading 8kx8m blocks of activations
// Note when M is not multiple of 8, the rest can contain arbitrary
// data in packed A as we will not be writing those out.
// This wil be taken care by just copying the appropriate valid data
// This will be taken care by just copying the appropriate valid data
// Note that parts of A that are not filled are:
// Remainder of M blocks. So some m values are random. This is ok
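As a plain scalar companion to the packed-A layout comments in the hunk above: A is split into tiles of MB rows ("m") by KB columns ("k"), each tile written out contiguously, with the tiles covering the same MB rows following one another in k order. The intra-tile element order used here (row-major) is an illustrative assumption; positions past M or K are skipped but still occupy space, matching the note that such bytes hold arbitrary data and are never read back. This is a sketch, not the actual QNNPACK packing routine.

#include <cstddef>
#include <cstdint>

template <size_t MB, size_t KB>
void pack_a_reference(size_t M, size_t K,
                      const uint8_t* a, size_t a_stride,
                      uint8_t* a_packed) {
  const size_t k_blocks = (K + KB - 1) / KB;
  for (size_t m0 = 0; m0 < M; m0 += MB) {
    for (size_t kb = 0; kb < k_blocks; ++kb) {
      for (size_t mi = 0; mi < MB; ++mi) {
        for (size_t ki = 0; ki < KB; ++ki) {
          const size_t m = m0 + mi;
          const size_t k = kb * KB + ki;
          if (m < M && k < K) {
            *a_packed = a[m * a_stride + k];
          }
          ++a_packed;  // each tile always occupies MB * KB bytes
        }
      }
    }
  }
}
// e.g. pack_a_reference<4, 8>(M, K, a, K, a_packed) for the "8kx4m" layout above.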

View File

@ -47,7 +47,7 @@ void KERNEL_NAME(
const __m128i vzero = _mm_setzero_si128();
// Packed A format.
// 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
// 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
// Original A
// --------- K ----------- -- (K + 4 - 1) / 4 --
// | | | |
@ -68,7 +68,7 @@ void KERNEL_NAME(
// This locality helps in loading 8kx8m blocks of activations
// Note when M is not multiple of 8, the rest can contain arbitrary
// data in packed A as we will not be writing those out.
// This wil be taken care by just copying the appropriate valid data
// This will be taken care by just copying the appropriate valid data
__m128i vacc_low[4];
__m128i vacc_high[4];

View File

@ -42,11 +42,11 @@
# Args passed via stack.
# TOS
# |-----------|
# |c_stride | 0
# |out ch indx| 8
# |params | 16
# |-----------|
# |------------|
# |c_stride | 0
# |out ch index| 8
# |params | 16
# |------------|
# void pytorch_q8gemm_dq_sparse_1x4_ukernel_8x8_packedA_w##W_INDEX_DTYPE_NUM_BITS##__aarch64_neon(
# size_t mr,
@ -234,7 +234,7 @@
/* v16, v17, v18, v19, v20, v21, v22, v23 */ XX\
/* When nr < 8, say nr = 1, extra v values will be fetched from stack which may overlap */ XX\
/* with other parts of stack storing local variables. To avoid that we just */ XX\
/* create a buffer of 256 bytes inbetween to make sure pointer increment */ XX\
/* create a buffer of 256 bytes in between to make sure pointer increment */ XX\
/* never produces address that is beyond the stack frame of this function. */ XX\
SUB x9, sp, 320 XX\
/* Each iteration produce 8 values each of 4 bytes */ XX\
@ -287,7 +287,7 @@
LD1 {v22.4s}, [x9], 16 XX\
LD1 {v23.4s}, [x9] XX\
XX\
/* We can tranpose one 4x4 block using macro */ XX\
/* We can transpose one 4x4 block using macro */ XX\
/* TRANSPOSE_4X4_S32 v8, v10, v12, v14, v0, v1, v2, v3 */ XX\
/* After this we have */ XX\
/* v8 : x00, x01, x02, x03 */ XX\
@ -302,7 +302,7 @@
/* v20 : x24, x25, x26, x27 */ XX\
/* v22 : x34, x35, x36, x37 */ XX\
/* Similarly we can transpose other two 4x4 blocks and we get */ XX\
/* tranposed 8x8 */ XX\
/* transposed 8x8 */ XX\
XX\
TRANSPOSE_4X4_S32 v8, v10, v12, v14, v0, v1, v2, v3 XX\
TRANSPOSE_4X4_S32 v16, v18, v20, v22, v4, v5, v6, v7 XX\
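The macro usage in the hunk above composes an 8x8 transpose out of 4x4 block transposes. A hedged scalar equivalent of that decomposition (no NEON, hypothetical names): transpose each 4x4 block and write it to the mirrored block position, so the diagonal blocks stay in place and the off-diagonal blocks swap.

#include <cstdint>

// Transpose the 4x4 block starting at src into the 4x4 block starting at dst.
static void transpose_4x4_block(const int32_t* src, int32_t* dst, int stride) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      dst[j * stride + i] = src[i * stride + j];
    }
  }
}

void transpose_8x8(const int32_t src[8][8], int32_t dst[8][8]) {
  for (int bi = 0; bi < 2; ++bi) {
    for (int bj = 0; bj < 2; ++bj) {
      // Block (bi, bj) of src lands, transposed, at block (bj, bi) of dst.
      transpose_4x4_block(&src[bi * 4][bj * 4], &dst[bj * 4][bi * 4], 8);
    }
  }
}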

View File

@ -31,11 +31,11 @@
# Args passed via stack.
# TOS
# |-----------|
# |c_stride | 0
# |out ch indx| 8
# |params | 16
# |-----------|
# |------------|
# |c_stride | 0
# |out ch index| 8
# |params | 16
# |------------|
# void pytorch_q8gemm_dq_sparse_8x1_ukernel_8x8_packedA_w##W_INDEX_DTYPE_NUM_BITS##__aarch64_neon(
# size_t mr,

View File

@ -238,7 +238,7 @@ static inline void pytorch_pack_q8conv_wrq(
}
}
if (kzp != 0) {
// This part fills the packed wights with zero points for output channels
// This part fills the packed weights with zero points for output channels
// when they are not divisible by nr blocking parameter.
// In that case
for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size);
@ -360,7 +360,7 @@ static inline void pytorch_pack_q8deconv_wrq(
}
}
if (kzp != 0) {
// This part fills the packed wights with zero points for output channels
// This part fills the packed weights with zero points for output channels
// when they are not divisible by nr blocking parameter.
// In that case
for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size);
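A hedged illustration of the tail-filling described in the comment above: when the last block of output channels has fewer than nr real channels, the leftover channel slots of that block are written with the kernel zero point kzp so the unused accumulators see neutral data. The flat block layout and the names here are simplifying assumptions, not the actual packed-weight layout.

#include <cstddef>
#include <cstdint>

// `block` points at one nr x kr packed block; the first nr_block_size channels
// hold real weights, the remaining channel slots are padded with kzp.
void fill_unused_channel_slots(uint8_t* block,
                               size_t nr, size_t kr,
                               size_t nr_block_size,
                               uint8_t kzp) {
  if (kzp == 0) {
    return;  // mirrors the `if (kzp != 0)` guard in the source
  }
  for (size_t c = nr_block_size; c < nr; ++c) {
    for (size_t k = 0; k < kr; ++k) {
      block[c * kr + k] = kzp;
    }
  }
}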

View File

@ -93,7 +93,7 @@ void pytorch_qnnp_requantize_q31__scalar(
* overflow is possible only when input is positive, and even when addition
* of a rounding constant overflows 32-bit signed integer, it still doesn't
* overflow 32-bit unsigned integer. Thus, in case of signed overflow, we
* can compute the result using unsigned arithmetics, specifically using
* can compute the result using unsigned arithmetic, specifically using
* logical shift right instead of arithmetic shift right.
* 3. Performs arithmetic shift as is, which will produce division result
* rounded down. Then compute remainder of this division by a power of 2,
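A worked sketch of point 3 in the comment above: an arithmetic right shift divides by a power of two rounding toward negative infinity, and the rounded-to-nearest result is recovered by comparing the remainder against a threshold (half the divisor, bumped by one for negative inputs so that ties round away from zero). This is a standalone illustration of the rounding trick, not the exact QNNPACK routine.

#include <cstdint>

// Hedged sketch, assuming signed right shift is arithmetic (the same assumption
// the comment above relies on). Divides x by 2^shift, rounding to nearest with
// ties away from zero.
static inline int32_t rounding_shift_right(int32_t x, uint32_t shift) {
  const int32_t mask = (int32_t(1) << shift) - 1;
  const int32_t quotient = x >> shift;                          // rounded toward -inf
  const int32_t remainder = x & mask;                           // non-negative remainder
  const int32_t threshold = (mask >> 1) + static_cast<int32_t>(x < 0);
  return quotient + static_cast<int32_t>(remainder > threshold);
}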

View File

@ -17,7 +17,7 @@
#include "requantization-tester.h"
/*
* Precise scalar implementation using unsigned 32-bit arithmetics.
* Precise scalar implementation using unsigned 32-bit arithmetic.
*/
TEST(PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
@ -83,7 +83,7 @@ TEST(PRECISE__SCALAR_UNSIGNED32, random_cases) {
}
/*
* Precise scalar implementation using unsigned 64-bit arithmetics.
* Precise scalar implementation using unsigned 64-bit arithmetic.
*/
TEST(PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
@ -149,7 +149,7 @@ TEST(PRECISE__SCALAR_UNSIGNED64, random_cases) {
}
/*
* Precise scalar implementation using signed 64-bit arithmetics.
* Precise scalar implementation using signed 64-bit arithmetic.
*/
TEST(PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
@ -302,7 +302,7 @@ TEST(GEMMLOWP__SCALAR, random_cases) {
}
/*
* Precise PSIMD implementation using unsigned 32-bit arithmetics.
* Precise PSIMD implementation using unsigned 32-bit arithmetic.
*/
TEST(PRECISE__PSIMD, exact_divide_by_po2) {

View File

@ -171,7 +171,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp
return;
}
// linear_op computes act_int8 * tranpose(w_int8) (matrix multiplication)
// linear_op computes act_int8 * transpose(w_int8) (matrix multiplication)
// where act_int8 and w_int8 are the input and weight variables, resp.
// output is a fp32 tensor
auto linear_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR)
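The comment above names the math the cuDNN matmul descriptor is configured to perform. As a hedged scalar reference (layout and names assumed, no cuDNN involved): each output element is the int32 dot product of a row of act_int8 with a row of w_int8 (i.e. a column of transpose(w_int8)), emitted as fp32; any scaling is applied outside this sketch.

#include <cstdint>
#include <vector>

std::vector<float> int8_linear_reference(const std::vector<int8_t>& act,  // M x K, row-major
                                         const std::vector<int8_t>& w,    // N x K, row-major
                                         int M, int N, int K) {
  std::vector<float> out(static_cast<size_t>(M) * N, 0.0f);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      int32_t acc = 0;
      for (int k = 0; k < K; ++k) {
        acc += static_cast<int32_t>(act[static_cast<size_t>(m) * K + k]) *
               static_cast<int32_t>(w[static_cast<size_t>(n) * K + k]);
      }
      out[static_cast<size_t>(m) * N + n] = static_cast<float>(acc);
    }
  }
  return out;
}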

View File

@ -54,7 +54,7 @@ void check_maxpool2d_params(
Tensor adaptive_avg_pool2d_quantized_cuda(
const at::Tensor& input,
IntArrayRef output_size) {
// TODO: renable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn
// TODO: re-enable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn
#ifdef USE_CUDA
// #if AT_CUDNN_ENABLED()
// TODO: limit this to per tensor quantized tensors for now, though should be easy to adapt

View File

@ -22,6 +22,7 @@ froms
Halfs
hsa
indexT
inH
inp
inps
inpt
@ -57,5 +58,6 @@ strat
supercede
supercedes
te
THW
tne
WONT