#include "caffe2/operators/prelu_op.h"
|
|
#include "caffe2/utils/eigen_utils.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
#include "caffe2/core/types.h"
|
|
#include "caffe2/utils/cpu_neon.h"
|
|
|
|
namespace caffe2 {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {

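// Applies the pointwise PReLU, y = x > 0 ? x : w * x, over a contiguous
// buffer of `size` floats with a single shared slope w, using NEON intrinsics.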
void runNeonPrelu(float* out, const float* in, int size, float w) {
  float32x4_t vZero = vdupq_n_f32(0.0f);
  float32x4_t vW = vdupq_n_f32(w);

  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

  if (size < kVecSizeInFloat) {
    for (int i = 0; i < size; ++i) {
      float v = in[i];
      out[i] = v > 0 ? v : v * w;
    }
    return;
  }

  // We want to load aligned from the input, but assume the output is unaligned
  int prologue =
      kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t)in) % (sizeof(float32x4_t))) / sizeof(float);
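  // For example, with 16-byte float32x4_t vectors: an input 8 bytes past a
  // 16-byte boundary has a remainder of 2 floats, so prologue = 4 - 2 = 2
  // scalar iterations reach an aligned address. Note that an already-aligned
  // input still runs a full prologue of kVecSizeInFloat scalar iterations.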

  int i = 0;

  // Prologue loop
  for (; i < prologue; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }

  // The loop is manually unrolled by 6; seems to be the limit for
  // armv7 to avoid register spills
  constexpr int kUnroll = 6;
  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
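  // Each unrolled iteration therefore consumes kUnroll * 4 = 24 floats.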

  int remainder = size - prologue;
  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

  for (; i < vectorizable; i += kFloatsPerLoop) {
    float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
    float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
    float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
    float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
    float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
    float32x4_t v5 = vld1q_f32_aligned(in + i + 20);

    uint32x4_t gz0 = vcgtq_f32(v0, vZero);
    uint32x4_t gz1 = vcgtq_f32(v1, vZero);
    uint32x4_t gz2 = vcgtq_f32(v2, vZero);
    uint32x4_t gz3 = vcgtq_f32(v3, vZero);
    uint32x4_t gz4 = vcgtq_f32(v4, vZero);
    uint32x4_t gz5 = vcgtq_f32(v5, vZero);
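
    // vcgtq_f32 produces an all-ones lane mask where v > 0; vbslq_f32 below
    // uses that mask as a bitwise select between the original and scaled lanes.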

    float32x4_t v0neg = vmulq_f32(v0, vW);
    float32x4_t v1neg = vmulq_f32(v1, vW);
    float32x4_t v2neg = vmulq_f32(v2, vW);
    float32x4_t v3neg = vmulq_f32(v3, vW);
    float32x4_t v4neg = vmulq_f32(v4, vW);
    float32x4_t v5neg = vmulq_f32(v5, vW);

    // v0 > 0 ? v0 : v0 * w
    v0 = vbslq_f32(gz0, v0, v0neg);
    v1 = vbslq_f32(gz1, v1, v1neg);
    v2 = vbslq_f32(gz2, v2, v2neg);
    v3 = vbslq_f32(gz3, v3, v3neg);
    v4 = vbslq_f32(gz4, v4, v4neg);
    v5 = vbslq_f32(gz5, v5, v5neg);

    vst1q_f32(out + i + 0, v0);
    vst1q_f32(out + i + 4, v1);
    vst1q_f32(out + i + 8, v2);
    vst1q_f32(out + i + 12, v3);
    vst1q_f32(out + i + 16, v4);
    vst1q_f32(out + i + 20, v5);
  }

  for (; i < size; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
}

} // namespace

#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& W = Input(1);

  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  const auto* Xdata = X.template data<float>();
  const auto* Wdata = W.template data<float>();
  auto* Ydata = Y->template mutable_data<float>();

  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
  const auto C_shared = (W.numel() == 1);

  if (!C_shared) {
    CAFFE_ENFORCE_EQ(C, W.numel());
  }

  if (C_shared) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    // The function is completely pointwise
    runNeonPrelu(Ydata, Xdata, X.numel(), Wdata[0]);
#else
    ConstEigenVectorMap<float> Xvec(Xdata, X.numel());
    EigenVectorMap<float> Yvec(Ydata, Y->numel());
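    // PReLU via a max/min split: x > 0 ? x : w * x == max(x, 0) + w * min(x, 0)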
    Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    return true;
  }

  // non-shared case.
  switch (order_) {
    case StorageOrder::NCHW: {
      const auto N = X.size(0);
      const auto dim = X.size_from_dim(2);

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Pointwise for each channel
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          runNeonPrelu(
              Ydata + (n * C + c) * dim,
              Xdata + (n * C + c) * dim,
              dim,
              Wdata[c]);
        }
      }
#else
      int nc = 0;
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
          EigenVectorMap<float>(Ydata + nc * dim, dim) =
              Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
          nc++;
        }
      }
#endif
      break;
    }
    case StorageOrder::NHWC: {
      // View the NHWC data as a (C, NHW) column-major matrix; each column is
      // one spatial position's channel vector, scaled per-channel by W.
      const auto NHW = X.numel() / C;
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
      EigenArrayMap<float> Ymat(Ydata, C, NHW);
      Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }
  return true;
}

template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& dY = Input(1);
  auto& X = Input(2);
  auto& W = Input(3);

  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");

  DCHECK_EQ(dY.numel(), Y.numel());
  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
  auto* dW = Output(1, W.sizes(), at::dtype<float>());

  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
  const auto C_shared = (W.numel() == 1);

  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  const float* Xdata = X.data<float>();
  const float* Wdata = W.data<float>();
  float* dXdata = dX->template mutable_data<float>();
  float* dWdata = dW->template mutable_data<float>();

  switch (order_) {
    case StorageOrder::NCHW: {
      const auto dim = X.size_from_dim(2);
      const auto div_factor = C_shared ? C : 1;
      for (auto c = 0; c < W.numel(); ++c) {
        dWdata[c] = 0;
      }

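      // Slope gradient: only non-positive inputs contribute, since there
      // y = w * x, so dL/dw accumulates dY * X.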
      for (int i = 0; i < Y.numel(); ++i) {
        if (Xdata[i] <= 0) {
          int c = (i / dim) % C / div_factor;
          dWdata[c] += dYdata[i] * Xdata[i];
        }
      }

      for (int i = 0; i < Y.numel(); ++i) {
        if (Xdata[i] > 0) {
          dXdata[i] = dYdata[i];
        } else {
          int c = (i / dim) % C / div_factor;
          dXdata[i] = Wdata[c] * dYdata[i];
        }
      }
      break;
    }
    case StorageOrder::NHWC: {
      const auto NHW = X.numel() / C;
      ConstEigenVectorArrayMap<float> Wvec(Wdata, W.numel());
      EigenVectorArrayMap<float> dWvec(dWdata, dW->numel());

      ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
      ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      EigenArrayMap<float> dXmat(dXdata, C, NHW);

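      // In the selects below, the X > 0 branch picks Xmat.cwiseMin(0.f),
      // which is exactly zero there, so only non-positive inputs contribute
      // dY * X to the slope gradient.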
      if (C_shared) {
        dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
        dWdata[0] =
            (Xmat > 0)
                .select(
                    Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
                    dYmat * Xmat)
                .sum();
      } else {
        dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
        dWvec = (Xmat > 0)
                    .select(
                        Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
                        dYmat * Xmat)
                    .rowwise()
                    .sum();
      }
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    PReluGradient,
    PReluGradientOp<float, CPUContext>);

// Input: X, Slope; output: Y
OPERATOR_SCHEMA(PRelu)
    .NumInputs(2)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(

The *PRelu* op takes an input data tensor $X$ and an input slope tensor $slope$, and produces one output tensor $Y$ of the same shape as $X$. The op performs the element-wise *PRelu* operation, defined as

$$y=prelu(x) =\begin{cases}slope * x & x < 0\\x & otherwise\end{cases}$$

Note that if $slope$ is of size 1, its value is shared across the channels; otherwise $slope$ must be a 1D tensor with one value per channel of $X$. See [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852) for more information.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator(
    "PRelu",
    ["X", "Slope"],
    ["Y"],
)

workspace.FeedBlob("X", np.random.randn(3, 3).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"), "\n")

workspace.FeedBlob("Slope", np.array([0.1]).astype(np.float32))
print("Slope:\n", workspace.FetchBlob("Slope"), "\n")

workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```

**Result**

```
X:
 [[ 0.3957382  -0.19725518 -0.26991343]
 [ 1.5513182  -0.27427664 -0.14584002]
 [-0.4121164   0.9292345   0.96426094]]

Slope:
 [0.1]

Y:
 [[ 0.3957382  -0.01972552 -0.02699134]
 [ 1.5513182  -0.02742766 -0.014584  ]
 [-0.04121164  0.9292345   0.96426094]]
```

</details>

)DOC")
.Input(0, "X", "Input tensor of data to be operated on.")
|
|
.Input(
|
|
1,
|
|
"Slope",
|
|
"1D input slope tensor. If `Slope` is of size 1, the value is shared across different channels")
|
|
.Output(0, "Y", "Output tensor, with same shape as $X$.")
|
|
.InheritOnnxSchema();
|
|
|
|
// Input: Y, dY, X, W; output: dX, dW
GRADIENT_OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(

PReluGradient takes Y and dY, along with X and the slope W, and computes dX and
dW according to the chain rule and the derivative of the PReLU function:
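
$$\frac{\partial L}{\partial x_i}=\begin{cases}\frac{\partial L}{\partial y_i} & x_i > 0\\slope \cdot \frac{\partial L}{\partial y_i} & otherwise\end{cases} \qquad \frac{\partial L}{\partial slope}=\sum_{i:\, x_i \leq 0} x_i \cdot \frac{\partial L}{\partial y_i}$$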

)DOC");

class GetPReluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{O(0), GO(0), I(0), I(1)},
        vector<string>{GI(0), GI(1)});
  }
};

REGISTER_GRADIENT(PRelu, GetPReluGradient);

} // namespace caffe2