Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
[BE][3/5] fix typos in aten/ (aten/src/ATen/native/) (#157552)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157552
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637, #157550, #157551
This commit is contained in: parent f57ef62ebc, commit d5af0eca8d

@@ -1162,7 +1162,6 @@ exclude_patterns = [
 # These files are all grandfathered in, feel free to remove from this list
 # as necessary
 # NOTE: remove the patterns in the order they are listed
-'aten/src/ATen/native/q*/**',
 'aten/src/ATen/native/[a-pA-P]*/**',
 'aten/src/ATen/[a-mA-M]*/**',
 'test/**',

@@ -81,7 +81,7 @@ DynamicQuantMatmul::DynamicQuantMatmul(
 auto src_q_tensor_info = arm_compute::TensorInfo(
 arm_compute::TensorShape(weight_dim_0, m),
 1,
-// ACL dyanamically quantized matmuls only support (signed) int8_t
+// ACL dynamically quantized matmuls only support (signed) int8_t
 arm_compute::DataType::QASYMM8_SIGNED,
 // TODO: setting the initial offset value to int8_t max instead of zero,
 // because ACL currently skips MatrixBReduction calculation if the

@@ -456,7 +456,7 @@ make_zero_points_and_scales_tensor(
 uint32_t groups = 1) {
 const int out_ch_idx = transpose ? 1 : 0;
 const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
-// Add 8 to account for bufferring needed by QNNPACK.
+// Add 8 to account for buffering needed by QNNPACK.
 const auto num_output_channels_padded = num_output_channels + kPaddingChannels;
 const auto qtype = weight_contig.qscheme();
 std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);

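As a side note on the hunk above: a minimal, hypothetical sketch of the padding it describes, assuming kPaddingChannels is the 8 extra entries QNNPACK reads past the last output channel (names are illustrative, not the actual PyTorch helper):

    #include <cstdint>
    #include <vector>

    // Illustrative only: size the zero-point buffer with 8 extra entries so
    // QNNPACK's over-reads past the last output channel stay in bounds.
    constexpr uint32_t kPaddingChannels = 8;  // assumed value, per the comment above

    std::vector<uint8_t> make_padded_zero_points(uint32_t num_output_channels,
                                                 uint8_t zero_point) {
      const uint32_t padded = num_output_channels + kPaddingChannels;
      // Real channels and padding all get the same zero point, so any read of
      // the padded tail is still well defined.
      return std::vector<uint8_t>(padded, zero_point);
    }
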
@@ -366,7 +366,7 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>(
 #endif // USE_FBGEMM

 namespace {
-// This is really terrible, but couldnt figure out a better way to constexpr convert int to
+// This is really terrible, but couldn't figure out a better way to constexpr convert int to
 // string and then perform string concatenation on/with it
 constexpr const char* _hack_int_to_class_name(int x) {
 switch(x) {

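For readers unfamiliar with the trick referenced in this hunk, here is a minimal sketch of a switch-based constexpr int-to-string mapping (hypothetical names, not the real _hack_int_to_class_name):

    #include <cstdio>

    // Map a small integer to a string literal at compile time via a switch.
    constexpr const char* spatial_dim_name(int x) {
      switch (x) {
        case 2: return "2d";
        case 3: return "3d";
        default: return "?";
      }
    }

    int main() {
      // Evaluated at compile time; usable wherever a constexpr const char* fits.
      constexpr const char* name = spatial_dim_name(3);
      std::printf("Conv%s\n", name);  // prints "Conv3d"
    }
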
@@ -1277,7 +1277,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
 float sum_scale = has_accum ? accum.value().q_scale() : 1.0;
 int32_t sum_zero_point = has_accum ? accum.value().q_zero_point() : 0;
 if (has_accum) {
-// Just tells we have these post op, the actual value such as scale and zero point will be setted later.
+// Just tells we have these post op, the actual value such as scale and zero point will be set later.
 op_attr = kReluFused ? ideep::attr_t::residual_with_sum_zero_point() : ideep::attr_t::fuse_sum();
 const ideep::scale_t accum_scale = ideep::scale_t(1, 1.0/sum_scale);
 const ideep::zero_point_t accum_zero_points = ideep::zero_point_t(1, sum_zero_point);

@@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
 CMAKE_ARGS+=("-DANDROID_STL=c++_static")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/android/arm64-v8a && cmake ../../.. \

@@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
 CMAKE_ARGS+=("-DANDROID_STL=c++_static")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/android/armeabi-v7a && cmake ../../.. \

@@ -53,7 +53,7 @@ CMAKE_ARGS+=("-DANDROID_PIE=ON")
 CMAKE_ARGS+=("-DANDROID_STL=c++_static")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/android/x86 && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=arm64")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/arm64 && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=arm64e")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/arm64e && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=armv7")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/armv7 && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=armv7s")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/armv7s && cmake ../../.. \

@@ -40,7 +40,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=i386")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/i386 && cmake ../../.. \

@@ -45,7 +45,7 @@ CMAKE_ARGS+=("-DIOS_ARCH=x86_64")
 CMAKE_ARGS+=("-DENABLE_BITCODE=OFF")
 CMAKE_ARGS+=("-DENABLE_ARC=OFF")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/ios/x86_64 && cmake ../../.. \

@@ -27,7 +27,7 @@ CMAKE_ARGS+=("-DPYTORCH_QNNPACK_LIBRARY_TYPE=static")
 CMAKE_ARGS+=("-DPYTORCH_QNNPACK_BUILD_BENCHMARKS=ON")
 CMAKE_ARGS+=("-DPYTORCH_QNNPACK_BUILD_TESTS=ON")

-# Use-specified CMake arguments go last to allow overridding defaults
+# Use-specified CMake arguments go last to allow overriding defaults
 CMAKE_ARGS+=($@)

 cd build/local && cmake ../.. \

@@ -368,7 +368,7 @@ static enum pytorch_qnnp_status pytorch_qnnp_create_convolution_ndhwc_q8(
 case pytorch_qnnp_ukernel_type_xzp_gemm: {
 // TODO: XZP kernels won't be supporting per channel quantization.
 // For now we dont use XZP kernels anywhere. Probably deprecate it for now
-// and ressurrect later if needed.
+// and resurrect later if needed.
 const uint32_t nr = pytorch_qnnp_params.q8conv_xzp.nr;
 const uint32_t kr = pytorch_qnnp_params.q8conv_xzp.kr;
 const uint32_t sr = pytorch_qnnp_params.q8conv_xzp.kc;

@@ -20,28 +20,28 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |a | 0
-# |w | 4
-# |c | 8
-# |c_stride | 12
-# |out ch indx| 16
-# |params | 20
-# |-----------|
+# |------------|
+# |a | 0
+# |w | 4
+# |c | 8
+# |c_stride | 12
+# |out ch index| 16
+# |params | 20
+# |------------|
 #

 # After loading w pointer in ip reg.
 # And after pushing r4-r8 and d8-d15 on stack
-# |-----------|
-# |d8 - d15 | 0
-# |r4 - r11 | 64
-# |a | 96
-# |w | 100
-# |c | 104
-# |c_stride | 108
-# |out ch indx| 112
-# |params | 116
-# |-----------|
+# |------------|
+# |d8 - d15 | 0
+# |r4 - r11 | 64
+# |a | 96
+# |w | 100
+# |c | 104
+# |c_stride | 108
+# |out ch index| 112
+# |params | 116
+# |------------|
 #

 # void pytorch_q8conv_ukernel_4x8__aarch32_neon(

@@ -23,10 +23,10 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |out ch indx| 0
-# |params | 8
-# |-----------|
+# |------------|
+# |out ch index| 0
+# |params | 8
+# |------------|

 # void pytorch_q8conv_ukernel_8x8__aarch64_neon(
 # size_t mr,

@@ -20,28 +20,28 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |a_stride | 0
-# |w | 4
-# |c | 8
-# |c_stride | 12
-# |out ch indx| 16
-# |params | 20
-# |-----------|
+# |------------|
+# |a_stride | 0
+# |w | 4
+# |c | 8
+# |c_stride | 12
+# |out ch index| 16
+# |params | 20
+# |------------|
 #

 # After loading w pointer in ip reg.
 # And after pushing r4-r9 and d8-d15 on stack
-# |-----------|
-# |d8 - d15 | 0
-# |r4 - r9 | 64
-# |a_stride | 88
-# |w | 92
-# |c | 96
-# |c_stride | 100
-# |out ch indx| 104
-# |params | 108
-# |-----------|
+# |------------|
+# |d8 - d15 | 0
+# |r4 - r9 | 64
+# |a_stride | 88
+# |w | 92
+# |c | 96
+# |c_stride | 100
+# |out ch index| 104
+# |params | 108
+# |------------|
 #

 #

@@ -33,29 +33,29 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |a_stride | 0
-# |w | 4
-# |c | 8
-# |c_stride | 12
-# |out ch indx| 16
-# |params | 20
-# |-----------|
+# |------------|
+# |a_stride | 0
+# |w | 4
+# |c | 8
+# |c_stride | 12
+# |out ch index| 16
+# |params | 20
+# |------------|
 #

 # After loading w pointer in ip reg.
 # And after pushing r4-r8 and d8-d15 on stack
-# |-----------|
-# |d8 - d15 | 0
-# |r4 - r7 | 64
-# |a_stride | 80
-# |w | 84
-# |b | 88
-# |c | 92
-# |c_stride | 96
-# |out ch indx| 100
-# |params | 104
-# |-----------|
+# |------------|
+# |d8 - d15 | 0
+# |r4 - r7 | 64
+# |a_stride | 80
+# |w | 84
+# |b | 88
+# |c | 92
+# |c_stride | 96
+# |out ch index| 100
+# |params | 104
+# |------------|
 #

 # void pytorch_q8gemm_ukernel_4x8__aarch32_neon(

@@ -22,10 +22,10 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |out ch indx| 0
-# |params | 8
-# |-----------|
+# |------------|
+# |out ch index| 0
+# |params | 8
+# |------------|

 # void pytorch_q8gemm_ukernel_8x8__aarch64_neon(
 # size_t mr,

@@ -14,11 +14,11 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |c_stride | 0
-# |out ch indx| 8
-# |params | 16
-# |-----------|
+# |------------|
+# |c_stride | 0
+# |out ch index| 8
+# |params | 16
+# |------------|

 # void pytorch_q8gemm_dq_ukernel_8x8__aarch64_neon(
 # size_t mr,

@@ -32,7 +32,7 @@
 #

 # Packed A format.
-# 4kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+# 4kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 # Original A
 # --------- K ----------- -- (K + 4 - 1) / 4 --
 # | | | |
@@ -53,7 +53,7 @@
 # This locality helps in loading 8kx4m blocks of activations
 # Note when M is not multiple of 4, the rest can contain arbitrary
 # data in packed A as we will not be writing those out.
-# This wil be taken care by just copying the appropriate valid data
+# This will be taken care by just copying the appropriate valid data

 # Also note that this packing is same as taking for 4x1 pattern.
 # This is because all the adjacent k's are laid next to each other
@@ -109,7 +109,7 @@ k_loop:
 VLD1.8 {d2}, [r6]!
 VLD1.8 {d3}, [r7]!

-# Now we have 4x8 block of values that we will tranpose
+# Now we have 4x8 block of values that we will transpose
 # A matrix
 # --------------------------------
 # | |
@@ -155,7 +155,7 @@ k_loop:
 VTRN.32 d2, d3
 VSWP d1, d2

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 VST1.8 {q0}, [r2]!
 VST1.8 {q1}, [r2]!
@@ -172,7 +172,7 @@ k_loop:
 VLD1.32 {d2[]}, [r6]
 VLD1.32 {d3[]}, [r7]

-# Now we have 4x8 block of values that we will tranpose
+# Now we have 4x8 block of values that we will transpose
 # _d{0-3} are arm neon vector registers
 # va0 = _d0 = a0 a1 a2 a3
 # va1 = _d1 = b0 b1 b2 b3
@@ -218,7 +218,7 @@ k_loop:
 VEXT.8 d0, d0, d1, #4
 VEXT.8 d1, d2, d3, #4

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 VST1.8 {q0}, [r2]
 .p2align 4

@@ -46,7 +46,7 @@
 # |b | 12
 # |c | 16
 # |c_stride | 20
-# |out ch indx | 24
+# |out ch index | 24
 # |params | 28
 # |----------------|
 #
@@ -61,7 +61,7 @@
 # |b | 108
 # |c | 112
 # |c_stride | 116
-# |out ch indx | 120
+# |out ch index | 120
 # |params | 124
 # |----------------|
 #
@@ -101,7 +101,7 @@
 /* Add output_channel_index to the b_zero_point pointer */ ;\
 ADD r4, r4, r5 ;\
 ;\
-/* We enter the loop if r1 is atleast 1. */ ;\
+/* We enter the loop if r1 is at least 1. */ ;\
 /* r1 = r1 - 1 will happen in the epilogue */ ;\
 /* of the loop */ ;\
 CMP r1, 1 ;\
@@ -222,7 +222,7 @@
 /* Thus we will load accumulators back in q0, q1, q2, q3, q4, q5, q6, q7 */ ;\
 /* When nr < 4, extra q values will be fetched from stack which may overlap */ ;\
 /* with other parts of stack storing local variables. To avoid that we just */ ;\
-/* create a buffer of 128 bytes inbetween to make sure pointer increment */ ;\
+/* create a buffer of 128 bytes in between to make sure pointer increment */ ;\
 /* never produces address that is beyond the stack frame of this function. */ ;\
 SUB r9, sp, 140 ;\
 /* Each iteration produce 4 values each of 4 bytes */ ;\

@@ -46,7 +46,7 @@
 # |b | 12
 # |c | 16
 # |c_stride | 20
-# |out ch indx | 24
+# |out ch index | 24
 # |params | 28
 # |----------------|
 #
@@ -61,7 +61,7 @@
 # |b | 108
 # |c | 112
 # |c_stride | 116
-# |out ch indx | 120
+# |out ch index | 120
 # |params | 124
 # |----------------|
 #

@@ -32,7 +32,7 @@
 #

 # Packed A format.
-# 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+# 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 # Original A
 # --------- K ----------- -- (K + 4 - 1) / 4 --
 # | | | |
@@ -53,7 +53,7 @@
 # This locality helps in loading 8kx8m blocks of activations
 # Note when M is not multiple of 8, the rest can contain arbitrary
 # data in packed A as we will not be writing those out.
-# This wil be taken care by just copying the appropriate valid data
+# This will be taken care by just copying the appropriate valid data

 # void pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch32_neon(
 # size_t mr,
@@ -125,7 +125,7 @@ k_loop:
 VLD1.8 {d6}, [r10]!
 VLD1.8 {d7}, [r11]!

-# Now we have 8x8 block of values that we will tranpose
+# Now we have 8x8 block of values that we will transpose
 # A matrix
 # --------------------------------
 # | |
@@ -189,7 +189,7 @@ k_loop:
 VTRN.32 q0, q2
 VTRN.32 q1, q3

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 # then d4, d5, d6, d7 contiguously
 VST1.8 {q0}, [r2]!
@@ -213,7 +213,7 @@ k_loop:
 VLD1.32 {d6[]}, [r7]
 VLD1.32 {d7[]}, [r11]

-# Now we have 4x8 block of values that we will tranpose
+# Now we have 4x8 block of values that we will transpose
 # _d{0-3} are arm neon vector registers
 # va04 = _d0 = a0 a1 a2 a3 e0 e1 e2 e3
 # va15 = _d1 = b0 b1 b2 b3 f0 f1 f2 f3
@@ -260,7 +260,7 @@ k_loop:
 VTRN.16 d0, d2
 VTRN.16 d1, d3

-# Now store the tranposed values
+# Now store the transposed values
 # d0, d1, d2, d3
 # then d4, d5, d6, d7 contiguously
 VST1.8 {q0}, [r2]!

@@ -9,7 +9,7 @@
 #include <qnnpack/assembly.h>

 # Packed A format.
-# 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+# 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 # Original A
 # --------- K ----------- -- (K + 4 - 1) / 4 --
 # | | | |
@@ -30,7 +30,7 @@
 # This locality helps in loading 8kx8m blocks of activations
 # Note when M is not multiple of 8, the rest can contain arbitrary
 # data in packed A as we will not be writing those out.
-# This wil be taken care by just copying the appropriate valid data
+# This will be taken care by just copying the appropriate valid data

 # void pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch32_neon(
 # size_t mr,
@@ -93,7 +93,7 @@ k_loop:
 LD1 {v3.d}[0], [x7], 8
 LD1 {v3.d}[1], [x11], 8

-# Now we have 8x8 block of values that we will tranpose
+# Now we have 8x8 block of values that we will transpose
 # A matrix
 # ------------------------
 # | |
@@ -180,7 +180,7 @@ k_loop:
 LD1 {v3.s}[0], [x7]
 LD1 {v3.s}[1], [x11]

-# Now we have 8x4 block of values that we will tranpose
+# Now we have 8x4 block of values that we will transpose
 # A matrix
 # ----------------------------
 # | |

@@ -14,7 +14,7 @@
 #include "8x4c1x4-packed-sse2.h"

 // This is a super slow kernel in that it does not use intrinsics to
-// tranpose. Since this is for x86 we are not optimizing it.
+// transpose. Since this is for x86 we are not optimizing it.
 // For ARM this will be optimized.
 void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
 const size_t mr,
@@ -24,7 +24,7 @@ void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
 uint8_t* a_packed) {

 // Packed A format.
-// 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+// 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 // Original A
 // --------- K ----------- -- (K + 4 - 1) / 4 --
 // | | | |
@@ -45,7 +45,7 @@ void pytorch_q8gemm_sparse_packA_ukernel_8x4__sse2(
 // This locality helps in loading 8kx8m blocks of activations
 // Note when M is not multiple of 8, the rest can contain arbitrary
 // data in packed A as we will not be writing those out.
-// This wil be taken care by just copying the appropriate valid data
+// This will be taken care by just copying the appropriate valid data

 // Note that parts of A that are not filled are:
 // Remainder of M blocks. So some m values are random. This is ok

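To make the packed-A layout discussed in these kernels concrete, here is a simplified, hypothetical C++ sketch of blocked packing with a zero-padded remainder; block sizes are parameters here, whereas the real ukernels hard-code them and may leave the remainder uninitialized as noted above:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Copy an m x k row-major matrix (leading dimension lda) into contiguous
    // mb x kb tiles so each tile's elements are adjacent in memory. Remainder
    // rows/columns are zero-padded here; the kernels above merely never write
    // those positions back out.
    std::vector<uint8_t> pack_blocked(const uint8_t* a, size_t m, size_t k,
                                      size_t lda, size_t mb, size_t kb) {
      const size_t m_blocks = (m + mb - 1) / mb;
      const size_t k_blocks = (k + kb - 1) / kb;
      std::vector<uint8_t> packed(m_blocks * k_blocks * mb * kb, 0);
      for (size_t bm = 0; bm < m_blocks; ++bm) {
        for (size_t bk = 0; bk < k_blocks; ++bk) {
          // Tiles are laid out block-row by block-row.
          uint8_t* dst = packed.data() + (bm * k_blocks + bk) * mb * kb;
          for (size_t i = 0; i < mb && bm * mb + i < m; ++i) {
            for (size_t j = 0; j < kb && bk * kb + j < k; ++j) {
              dst[i * kb + j] = a[(bm * mb + i) * lda + (bk * kb + j)];
            }
          }
        }
      }
      return packed;
    }
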
@@ -47,7 +47,7 @@ void KERNEL_NAME(
 const __m128i vzero = _mm_setzero_si128();

 // Packed A format.
-// 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory.
+// 8kx4m blocks for all blocks given 4 rows (4m) are placed in contiguous memory.
 // Original A
 // --------- K ----------- -- (K + 4 - 1) / 4 --
 // | | | |
@@ -68,7 +68,7 @@ void KERNEL_NAME(
 // This locality helps in loading 8kx8m blocks of activations
 // Note when M is not multiple of 8, the rest can contain arbitrary
 // data in packed A as we will not be writing those out.
-// This wil be taken care by just copying the appropriate valid data
+// This will be taken care by just copying the appropriate valid data

 __m128i vacc_low[4];
 __m128i vacc_high[4];

@@ -42,11 +42,11 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |c_stride | 0
-# |out ch indx| 8
-# |params | 16
-# |-----------|
+# |------------|
+# |c_stride | 0
+# |out ch index| 8
+# |params | 16
+# |------------|

 # void pytorch_q8gemm_dq_sparse_1x4_ukernel_8x8_packedA_w##W_INDEX_DTYPE_NUM_BITS##__aarch64_neon(
 # size_t mr,
@@ -234,7 +234,7 @@
 /* v16, v17, v18, v19, v20, v21, v22, v23 */ XX\
 /* When nr < 8, say nr = 1, extra v values will be fetched from stack which may overlap */ XX\
 /* with other parts of stack storing local variables. To avoid that we just */ XX\
-/* create a buffer of 256 bytes inbetween to make sure pointer increment */ XX\
+/* create a buffer of 256 bytes in between to make sure pointer increment */ XX\
 /* never produces address that is beyond the stack frame of this function. */ XX\
 SUB x9, sp, 320 XX\
 /* Each iteration produce 8 values each of 4 bytes */ XX\
@@ -287,7 +287,7 @@
 LD1 {v22.4s}, [x9], 16 XX\
 LD1 {v23.4s}, [x9] XX\
 XX\
-/* We can tranpose one 4x4 block using macro */ XX\
+/* We can transpose one 4x4 block using macro */ XX\
 /* TRANSPOSE_4X4_S32 v8, v10, v12, v14, v0, v1, v2, v3 */ XX\
 /* After this we have */ XX\
 /* v8 : x00, x01, x02, x03 */ XX\
@@ -302,7 +302,7 @@
 /* v20 : x24, x25, x26, x27 */ XX\
 /* v22 : x34, x35, x36, x37 */ XX\
 /* Similarly we can transpose other two 4x4 blocks and we get */ XX\
-/* tranposed 8x8 */ XX\
+/* transposed 8x8 */ XX\
 XX\
 TRANSPOSE_4X4_S32 v8, v10, v12, v14, v0, v1, v2, v3 XX\
 TRANSPOSE_4X4_S32 v16, v18, v20, v22, v4, v5, v6, v7 XX\

@@ -31,11 +31,11 @@

 # Args passed via stack.
 # TOS
-# |-----------|
-# |c_stride | 0
-# |out ch indx| 8
-# |params | 16
-# |-----------|
+# |------------|
+# |c_stride | 0
+# |out ch index| 8
+# |params | 16
+# |------------|

 # void pytorch_q8gemm_dq_sparse_8x1_ukernel_8x8_packedA_w##W_INDEX_DTYPE_NUM_BITS##__aarch64_neon(
 # size_t mr,

@@ -238,7 +238,7 @@ static inline void pytorch_pack_q8conv_wrq(
 }
 }
 if (kzp != 0) {
-// This part fills the packed wights with zero points for output channels
+// This part fills the packed weights with zero points for output channels
 // when they are not divisible by nr blocking parameter.
 // In that case
 for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size);
@@ -360,7 +360,7 @@ static inline void pytorch_pack_q8deconv_wrq(
 }
 }
 if (kzp != 0) {
-// This part fills the packed wights with zero points for output channels
+// This part fills the packed weights with zero points for output channels
 // when they are not divisible by nr blocking parameter.
 // In that case
 for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size);

@@ -93,7 +93,7 @@ void pytorch_qnnp_requantize_q31__scalar(
 * overflow is possible only when input is positive, and even when addition
 * of a rounding constant overflows 32-bit signed integer, it still doesn't
 * overflow 32-bit unsigned integer. Thus, in case of signed overflow, we
-* can compute the result using unsigned arithmetics, specifically using
+* can compute the result using unsigned arithmetic, specifically using
 * logical shift right instead of arithmetic shift right.
 * 3. Performs arithmetic shift as is, which will produce division result
 * rounded down. Then compute remainder of this division by a power of 2,

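The unsigned-arithmetic fallback described above can be illustrated with a small standalone sketch (not the QNNPACK implementation): a rounding right shift for a non-negative value, where the rounding addend could overflow int32_t but never uint32_t:

    #include <cstdint>

    // Rounding right shift for a non-negative value x. Adding the rounding
    // constant may overflow int32_t when x is near INT32_MAX, but cannot
    // overflow uint32_t, so the sum is formed in unsigned arithmetic and a
    // logical shift is used; for non-negative x this matches the intended
    // arithmetic-shift result.
    int32_t rounding_shift_right_nonneg(int32_t x, uint32_t shift) {
      const uint32_t rounding = shift > 0 ? (uint32_t{1} << (shift - 1)) : 0;
      return static_cast<int32_t>((static_cast<uint32_t>(x) + rounding) >> shift);
    }
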
@@ -17,7 +17,7 @@
 #include "requantization-tester.h"

 /*
-* Precise scalar implementation using unsigned 32-bit arithmetics.
+* Precise scalar implementation using unsigned 32-bit arithmetic.
 */

 TEST(PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
@@ -83,7 +83,7 @@ TEST(PRECISE__SCALAR_UNSIGNED32, random_cases) {
 }

 /*
-* Precise scalar implementation using unsigned 64-bit arithmetics.
+* Precise scalar implementation using unsigned 64-bit arithmetic.
 */

 TEST(PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
@@ -149,7 +149,7 @@ TEST(PRECISE__SCALAR_UNSIGNED64, random_cases) {
 }

 /*
-* Precise scalar implementation using signed 64-bit arithmetics.
+* Precise scalar implementation using signed 64-bit arithmetic.
 */

 TEST(PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
@@ -302,7 +302,7 @@ TEST(GEMMLOWP__SCALAR, random_cases) {
 }

 /*
-* Precise PSIMD implementation using unsigned 32-bit arithmetics.
+* Precise PSIMD implementation using unsigned 32-bit arithmetic.
 */

 TEST(PRECISE__PSIMD, exact_divide_by_po2) {

@@ -171,7 +171,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_output,
 return;
 }

-// linear_op computes act_int8 * tranpose(w_int8) (matrix multiplication)
+// linear_op computes act_int8 * transpose(w_int8) (matrix multiplication)
 // where act_int8 and w_int8 are the input and weight variables, resp.
 // output is a fp32 tensor
 auto linear_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR)

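As general background for the comment above (a hedged sketch, not the cuDNN path itself): once the int8 activation-times-transposed-weight product is accumulated, the result is typically requantized with a combined scale; the sketch below assumes per-tensor scales and that zero points were already folded into the accumulator:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Requantize one accumulator value: scale by act_scale * w_scale / out_scale,
    // add the output zero point, and clamp to the int8 range. Assumes input and
    // weight zero points were already handled (e.g. folded into a bias term).
    int8_t requantize(int32_t acc, float act_scale, float w_scale,
                      float out_scale, int32_t out_zero_point) {
      const float multiplier = act_scale * w_scale / out_scale;
      const int32_t q =
          static_cast<int32_t>(std::lround(acc * multiplier)) + out_zero_point;
      return static_cast<int8_t>(std::min(127, std::max(-128, q)));
    }
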
@@ -54,7 +54,7 @@ void check_maxpool2d_params(
 Tensor adaptive_avg_pool2d_quantized_cuda(
 const at::Tensor& input,
 IntArrayRef output_size) {
-// TODO: renable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn
+// TODO: re-enable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn
 #ifdef USE_CUDA
 // #if AT_CUDNN_ENABLED()
 // TODO: limit this to per tensor quantized tensors for now, though should be easy to adapt

@@ -22,6 +22,7 @@ froms
Halfs
hsa
indexT
inH
inp
inps
inpt
@@ -57,5 +58,6 @@ strat
supercede
supercedes
te
THW
tne
WONT