mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Add cppcoreguidelines-avoid-magic-numbers exclusion to clang-tidy. Remove existing NOLINT warnings using the following script: ``` for file in `git ls-files | grep -v \.py`; do gsed '/^ *\/\/ NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)/d' -i $file; done ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/57841 Reviewed By: samestep Differential Revision: D28295045 Pulled By: malfet fbshipit-source-id: 7c6e8d1213c9593f169ed3df6a916498f1a97163
65 lines
2.3 KiB
C++
65 lines
2.3 KiB
C++
#include "transpose.h"
|
|
|
|
#include <immintrin.h>
|
|
|
|
namespace fbgemm {

/// Transposes a 4 x N row-major byte matrix into an N x 4 row-major matrix.
///
/// For every column j in [0, N) and row i in [0, 4) this stores
/// `dst[j * 4 + i] = src[i * N + j]`.
///
/// @param N   Number of columns in `src` (bytes per source row). Values
///            <= 0 result in no reads or writes.
/// @param src Input buffer; 4 * N bytes are read. Unaligned loads are used.
/// @param dst Output buffer; 4 * N bytes are written. Unaligned stores are
///            used.
void transpose_4rows(int N, const std::uint8_t* src, std::uint8_t* dst) {
  constexpr int M = 4; // number of source rows
  constexpr int kVecBytes = 32; // bytes held by one 256-bit vector

  // All offsets are computed in 64 bits: expressions such as 3 * N or j * M
  // overflow `int` (undefined behaviour) once N grows past ~5e8.
  const std::int64_t n = N;
  // Largest multiple of the vector width that does not exceed N.
  const std::int64_t vec_end = n / kVecBytes * kVecBytes;

  std::int64_t j = 0;

  // vectorized loop: 32 columns (128 output bytes) per iteration
  for (; j < vec_end; j += kVecBytes) {
    const std::uint8_t* in = src + j;

    // a : a0 a1 ... a31
    // b : b0 b1 ... b31
    // c : c0 c1 ... c31
    // d : d0 d1 ... d31
    const __m256i a =
        _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(in + 0 * n));
    const __m256i b =
        _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(in + 1 * n));
    const __m256i c =
        _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(in + 2 * n));
    const __m256i d =
        _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(in + 3 * n));

    // even-odd interleaving
    // ab_lo : a0 b0 a1 b1 ... a7 b7 | a16 b16 ... a23 b23
    // ab_hi : a8 b8 a9 b9 ... a15 b15 | a24 b24 ... a31 b31
    // cd_lo : c0 d0 c1 d1 ... c7 d7 | c16 d16 ... c23 d23
    // cd_hi : c8 d8 c9 d9 ... c15 d15 | c24 d24 ... c31 d31
    const __m256i ab_lo = _mm256_unpacklo_epi8(a, b);
    const __m256i ab_hi = _mm256_unpackhi_epi8(a, b);
    const __m256i cd_lo = _mm256_unpacklo_epi8(c, d);
    const __m256i cd_hi = _mm256_unpackhi_epi8(c, d);

    // 4-row interleaving but permuted at 128-bit granularity
    // y0 : a0 b0 c0 d0 ... a-d3 | a-d16 ... a-d19
    // y1 : a4 b4 c4 d4 ... a-d7 | a-d20 ... a-d23
    // y2 : a8 b8 c8 d8 ... a-d11 | a-d24 ... a-d27
    // y3 : a12 b12 c12 d12 ... a-d15 | a-d28 ... a-d31
    const __m256i y0 = _mm256_unpacklo_epi16(ab_lo, cd_lo);
    const __m256i y1 = _mm256_unpackhi_epi16(ab_lo, cd_lo);
    const __m256i y2 = _mm256_unpacklo_epi16(ab_hi, cd_hi);
    const __m256i y3 = _mm256_unpackhi_epi16(ab_hi, cd_hi);

    // The 128-bit lanes are permuted while storing so that the output ends up
    // in column order: 0x20 pairs the two low lanes, 0x31 the two high lanes.
    std::uint8_t* out = dst + j * M;
    _mm256_storeu_si256(
        reinterpret_cast<__m256i*>(out + 0 * kVecBytes),
        _mm256_permute2f128_si256(y0, y1, 0x20));
    _mm256_storeu_si256(
        reinterpret_cast<__m256i*>(out + 1 * kVecBytes),
        _mm256_permute2f128_si256(y2, y3, 0x20));
    _mm256_storeu_si256(
        reinterpret_cast<__m256i*>(out + 2 * kVecBytes),
        _mm256_permute2f128_si256(y0, y1, 0x31));
    _mm256_storeu_si256(
        reinterpret_cast<__m256i*>(out + 3 * kVecBytes),
        _mm256_permute2f128_si256(y2, y3, 0x31));
  }

  // scalar loop for the remaining (N % 32) columns
  for (; j < n; ++j) {
    for (int i = 0; i < M; ++i) {
      dst[j * M + i] = src[j + i * n];
    }
  }
}

} // namespace fbgemm
|