#include "caffe2/core/operator.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"

#include <limits> // for std::numeric_limits, used throughout this file

#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif

namespace caffe2 {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {

//
// ARM Neon code utilities
//

inline float32x4_t to_v4_f32(uint16x4_t v) {
  return vcvtq_f32_u32(vmovl_u16(v));
}

inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
  float32x4x4_t out;

  uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));

  out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
  out.val[1] = to_v4_f32(vget_high_u16(lo_u16));

  uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));

  out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
  out.val[3] = to_v4_f32(vget_high_u16(hi_u16));

  return out;
}

inline void clamp(float32x4_t& v) {
  v = vmaxq_f32(v, vdupq_n_f32(0));
  v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max()));
}

inline void addMeanAndClamp(float32x4_t& v, float mean) {
  v = vaddq_f32(v, vdupq_n_f32(mean));
  clamp(v);
}

inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
  uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
  uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
  uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
  return vmovn_u16(u16_01);
}

} // unnamed namespace
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

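// PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp (summarizing the
// implementation below): takes a packed 8-bit BGRA image in NHWC order
// plus a 3-element per-channel mean, and produces a float BGR tensor in
// NCHW order with the mean subtracted and cached Gaussian noise added
// to each value.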
class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
    : public Operator<CPUContext> {
 public:
  // Expect this many channels as input
  static constexpr int kInputChannels = 4;

  // Expect this many channels as output
  static constexpr int kOutputChannels = 3;

  // We read this much noise per vectorized cycle
  static constexpr int kNeonNoiseReadSize = kOutputChannels * 16;

  USE_OPERATOR_FUNCTIONS(CPUContext);
  explicit PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp(
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<CPUContext>(operator_def, ws), ws_(ws) {}

  bool RunOnDevice() override {
    const auto& X = Input(0);
    const auto& mean = Input(1);

    auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__");
    auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
        "noise_size", 491 /* prime to avoid artifacts */);

    if (!BlobIsTensorType(*noiseBlob, CPU)) {
      // Initialize random noise on first use.
      // Cache it to maintain temporal consistency.
      auto* t = BlobGetMutableTensor(noiseBlob, CPU);

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Noise space is larger for vectorized code due to the
      // vectorized load
      initNoiseCPUNeon(t, defaultNoiseSize);
#else
      initNoiseCPU(t, defaultNoiseSize);
#endif
    }
    const auto& noise = noiseBlob->template Get<TensorCPU>();
    CAFFE_ENFORCE(noise.numel() >= defaultNoiseSize);

    CAFFE_ENFORCE(X.dim() == 4);
    const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
    // Assume BGR or BGRA
    CAFFE_ENFORCE(mean.numel() == kOutputChannels);

    CAFFE_ENFORCE(C == kInputChannels);
    auto* Y = Output(0, {N, kOutputChannels, H, W}, at::dtype<float>());

    runBatch(
        N,
        C,
        H,
        W,
        defaultNoiseSize,
        X.data<uint8_t>(),
        mean.data<float>(),
        noise.data<float>(),
        Y->template mutable_data<float>());

    return true;
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void initNoiseCPU(Tensor* noise, int size) {
    noise->Resize(size);

    math::RandGaussian<float, CPUContext>(
        size,
        0.0,
        OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
        noise->template mutable_data<float>(),
        &context_);
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void initNoiseCPUNeon(Tensor* noise, int size) {
    // For ARM NEON, we read in multiples of kNeonNoiseReadSize since
    // the inner loop is vectorized. Round up to the next highest
    // multiple of kNeonNoiseReadSize
    size = math::RoundUp(size, kNeonNoiseReadSize) + size;
    noise->Resize(size);

    math::RandGaussian<float, CPUContext>(
        size,
        0.0,
        OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
        noise->template mutable_data<float>(),
        &context_);
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

  void runBatch(
      int N,
      int /*C*/,
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    int planeSize = H * W;

    for (int n = 0; n < N; ++n) {
      auto curInput = input + n * kInputChannels * planeSize;
      auto curOutput = output + n * kOutputChannels * planeSize;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#else
      runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    }
  }
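  // Scalar reference path. The index math used below, per pixel p in the
  // H*W plane and output channel c, is:
  //   output[c * planeSize + p] =
  //       (float)input[p * kInputChannels + c] - meanChannel[c] + noise[off]
  // where off cycles through [0, noiseCycle).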
#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void runCPU(
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    int planeSize = H * W;
    int noiseOffset = 0;

    for (int point = 0; point < planeSize; ++point) {
      for (int c = 0; c < kOutputChannels; ++c) {
        float v = (float)input[point * kInputChannels + c];
        output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset];

        if (++noiseOffset >= noiseCycle) {
          noiseOffset = 0;
        }
      }
    }
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void runCPUNeon(
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    // Vectorized load parameters:

    // Loop unroll factor
    // FIXME: this doesn't actually unroll; clang has per-loop unroll
    // pragmas but GCC does not
    constexpr int kUnroll = 1;

    // How much data we load for each inner loop
    constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);

    // What we write out
    constexpr int kInnerStoreSize = sizeof(float32x4_t);

    // We load 16 pixels at a time, with 4 channels each
    constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
    static_assert(kLoadPixels == 16, "unexpected");
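    // Note: sizeof(uint8x16x4_t) is 64 bytes, so kLoadPixels is
    // 64 / kInputChannels = 16 packed BGRA pixels per vld4q_u8.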

    // How many pixels we load per loop
    constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;

    // We need at least this much noise each loop through
    CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);
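    // With the defaults, noiseCycle is 491 and each pass through the
    // vectorized loop consumes kOutputChannels * kLoadPixelsPerLoop =
    // 3 * 16 = 48 noise floats; initNoiseCPUNeon() allocates
    // RoundUp(noiseCycle, kNeonNoiseReadSize) + noiseCycle floats so a
    // full vectorized read never runs off the end of the cached buffer.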

    int noiseUsed = 0;
    const float* curNoise = noise;

    float mean[kOutputChannels] = {
        meanChannel[0], meanChannel[1], meanChannel[2]};
    int planeSize = H * W;

    // Vectorized portion
    int point = 0;

    // If the slice is not aligned, then we have to use the
    // un-vectorized version
    bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
        isPointerAligned(output, kInnerStoreSize) &&
        // Because we are writing to output at offsets of planeSize,
        // planeSize has to be an even multiple of kInnerStoreSize
        (planeSize % kInnerStoreSize == 0);

    // What portion the vectorized loop will handle
    int limit =
        isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;

    for (; point < limit; point += kLoadPixelsPerLoop) {
      // Unroll load/update/store by kUnroll
      for (int j = 0; j < kUnroll; ++j) {
        // We load 16 pixels x 4 channels at a time
        const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
            input + (point + j * kLoadPixels) * kInputChannels,
            sizeof(uint8x16x4_t));
        uint8x16x4_t loadV = vld4q_u8(inputAligned);

        // The compiler doesn't want to unroll this when we put it in a
        // loop, and in GCC there's no per-loop unroll pragma, so we do
        // it manually.
        // This seems to involve no register spillage, crossing fingers
        // that it remains that way.
        {
          constexpr int kChannel = 0;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }

        {
          constexpr int kChannel = 1;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }

        {
          constexpr int kChannel = 2;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }
      }

      curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
      noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;

      if (noiseUsed >= noiseCycle) {
        noiseUsed = 0;
        curNoise = noise + ((curNoise - noise) % noiseCycle);
      }
    }

    // Epilogue: non-vectorized remainder
    for (; point < planeSize; ++point) {
      for (int c = 0; c < kOutputChannels; ++c) {
        float v = (float)input[point * kInputChannels + c];
        output[c * planeSize + point] = v - mean[c] + *curNoise++;
        ++noiseUsed;
      }

      if (noiseUsed >= noiseCycle) {
        noiseUsed = 0;
        curNoise = noise + ((curNoise - noise) % noiseCycle);
      }
    }
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

 private:
  Workspace* ws_;
};

namespace {
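
// Saturating float -> integer conversion: clamps to T's representable
// range (for uint8_t, [0, 255]) before casting, mirroring what the NEON
// clamp() helper above does with vmin/vmax.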
template <typename T>
static inline T clamped_cast(float f) {
  if (f >= std::numeric_limits<T>::max()) {
    return std::numeric_limits<T>::max();
  }
  if (f <= std::numeric_limits<T>::min()) {
    return std::numeric_limits<T>::min();
  }
  return static_cast<T>(f);
}

} // unnamed namespace
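
// BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp (summarizing the
// implementation below): the inverse of the preprocess op. It takes an
// NCHW float tensor with 3 channels plus the per-channel mean, adds the
// mean back, saturates to [0, 255], and emits a packed 8-bit NHWC BGRA
// image with the alpha channel set to 255.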
class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
    : public Operator<CPUContext> {
 public:
  using Operator<CPUContext>::Operator;

  // Expect this many channels as input
  static constexpr int kInputChannels = 3;

  // Expect this many channels as output
  static constexpr int kOutputChannels = 4;

  bool RunOnDevice() override {
    const auto& X = Input(0);
    const auto& mean = Input(1);

    CAFFE_ENFORCE(X.dim() == 4);
    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
    // Assume BGR or BGRA
    CAFFE_ENFORCE(mean.numel() == kInputChannels);
    CAFFE_ENFORCE(C == kInputChannels);
    // Packed 8-bit BGRA output
    auto* Y = Output(0, {N, H, W, kOutputChannels}, at::dtype<uint8_t>());

    runBatch(
        N,
        C,
        H,
        W,
        X.data<float>(),
        mean.data<float>(),
        Y->template mutable_data<uint8_t>());

    return true;
  }

  void runBatch(
      int N,
      int /*C*/,
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    int planeSize = H * W;

    for (int n = 0; n < N; ++n) {
      auto curInput = input + n * kInputChannels * planeSize;
      auto curOutput = output + n * kOutputChannels * planeSize;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      runCPUNeon(H, W, curInput, meanChannel, curOutput);
#else
      runCPU(H, W, curInput, meanChannel, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    }
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void runCPU(
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    int planeSize = H * W;

    for (int point = 0; point < planeSize; ++point) {
      for (int c = 0; c < kInputChannels; ++c) {
        uint8_t v = clamped_cast<uint8_t>(
            input[c * planeSize + point] + meanChannel[c]);
        output[point * kOutputChannels + c] = v;
      }

      // alpha
      output[point * kOutputChannels + (kOutputChannels - 1)] =
          std::numeric_limits<uint8_t>::max();
    }
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void runCPUNeon(
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    // Vectorized load parameters:

    // We load in chunks of this size
    constexpr int kLoadUnit = sizeof(float32x4_t);
    constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float));

    // We store in chunks of this size
    constexpr int kStoreUnit = sizeof(uint8x8x4_t);

    // The vector portion loads this many f32 pixels at a time (8)
    constexpr int kLoadPixels = 2 * kLoadFloats;

    float mean[kInputChannels] = {
        meanChannel[0], meanChannel[1], meanChannel[2]};
    int planeSize = H * W;

    // Vectorized portion
    int point = 0;

    // If the slice is not aligned, then we have to use the
    // un-vectorized version
    bool isAligned = isPointerAligned(input, kLoadUnit) &&
        isPointerAligned(output, kStoreUnit) &&
        // Because we are reading from input at offsets of planeSize,
        // planeSize has to be an even multiple of kLoadUnit
        (planeSize % kLoadUnit == 0);

    // What portion the vectorized loop will handle
    int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;

    for (; point < limit; point += kLoadPixels) {
      // Load 8 f32 pixels from each channel; loading 16 involves
      // register spills it seems
      float32x4_t inputc0_0 =
          vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc0_1 =
          vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);

      float32x4_t inputc1_0 =
          vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc1_1 =
          vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);

      float32x4_t inputc2_0 =
          vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc2_1 =
          vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);

      addMeanAndClamp(inputc0_0, mean[0]);
      addMeanAndClamp(inputc0_1, mean[0]);
      uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);

      addMeanAndClamp(inputc1_0, mean[1]);
      addMeanAndClamp(inputc1_1, mean[1]);
      uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);

      addMeanAndClamp(inputc2_0, mean[2]);
      addMeanAndClamp(inputc2_1, mean[2]);
      uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);

      // This is the alpha channel
      uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());

      // We now have 8 bytes of each channel in a separate vector
      // Write BGRA interleaved to output
      uint8x8x4_t u8_out = {{u8_c0, u8_c1, u8_c2, u8_c3}};
      vst4_u8_aligned(output + kOutputChannels * point, u8_out);
    }

    // Epilogue: non-vectorized remainder
    for (; point < planeSize; ++point) {
      for (int c = 0; c < kInputChannels; ++c) {
        uint8_t v =
            clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
        output[point * kOutputChannels + c] = v;
      }

      // alpha
      output[point * kOutputChannels + (kOutputChannels - 1)] =
          std::numeric_limits<uint8_t>::max();
    }
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
};

namespace {

REGISTER_CPU_OPERATOR(
    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
    PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp);
OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
    .NumInputs(2)
    .NumOutputs(1);
REGISTER_CPU_OPERATOR(
    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
    BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
    .NumInputs(2)
    .NumOutputs(1);
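
// Both operators take two inputs (the image tensor and the per-channel
// mean) and produce one output. The MKLDNN/IDEEP build has no native
// implementation of these ops and falls back to the CPU operators
// registered above (an observation based on the IDEEPFallbackOp
// registrations below).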
#ifdef CAFFE2_USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
    IDEEPFallbackOp<BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
    IDEEPFallbackOp<PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp>);
#endif
} // namespace
} // namespace caffe2