#include "caffe2/operators/prelu_op.h"
|
|
#include "caffe2/utils/eigen_utils.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
#include "caffe2/core/types.h"
|
|
#include "caffe2/utils/cpu_neon.h"
|
|
|
|
namespace caffe2 {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {

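// Applies the pointwise PReLU, y = x > 0 ? x : w * x, over a contiguous
// buffer of `size` floats with a single shared slope w, using NEON intrinsics.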
void runNeonPrelu(float* out, const float* in, int size, float w) {
  float32x4_t vZero = vdupq_n_f32(0.0f);
  float32x4_t vW = vdupq_n_f32(w);

  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

  if (size < kVecSizeInFloat) {
    for (int i = 0; i < size; ++i) {
      float v = in[i];
      out[i] = v > 0 ? v : v * w;
    }
    return;
  }

  // We want to load aligned from the input, but assume the output is unaligned
  int prologue =
      kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t)in) % (sizeof(float32x4_t))) / sizeof(float);
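  // For example, with 16-byte float32x4_t vectors: an input 8 bytes past a
  // 16-byte boundary has a remainder of 2 floats, so prologue = 4 - 2 = 2
  // scalar iterations reach an aligned address. Note that an already-aligned
  // input still runs a full prologue of kVecSizeInFloat scalar iterations.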

  int i = 0;

  // Prologue loop
  for (; i < prologue; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }

  // The loop is manually unrolled by 6; seems to be the limit for
  // armv7 to avoid register spills
  constexpr int kUnroll = 6;
  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
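  // Each unrolled iteration therefore consumes kUnroll * 4 = 24 floats.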

  int remainder = size - prologue;
  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

  for (; i < vectorizable; i += kFloatsPerLoop) {
    float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
    float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
    float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
    float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
    float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
    float32x4_t v5 = vld1q_f32_aligned(in + i + 20);

    uint32x4_t gz0 = vcgtq_f32(v0, vZero);
    uint32x4_t gz1 = vcgtq_f32(v1, vZero);
    uint32x4_t gz2 = vcgtq_f32(v2, vZero);
    uint32x4_t gz3 = vcgtq_f32(v3, vZero);
    uint32x4_t gz4 = vcgtq_f32(v4, vZero);
    uint32x4_t gz5 = vcgtq_f32(v5, vZero);
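
    // vcgtq_f32 produces an all-ones lane mask where v > 0; vbslq_f32 below
    // uses that mask as a bitwise select between the original and scaled lanes.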

    float32x4_t v0neg = vmulq_f32(v0, vW);
    float32x4_t v1neg = vmulq_f32(v1, vW);
    float32x4_t v2neg = vmulq_f32(v2, vW);
    float32x4_t v3neg = vmulq_f32(v3, vW);
    float32x4_t v4neg = vmulq_f32(v4, vW);
    float32x4_t v5neg = vmulq_f32(v5, vW);

    // v0 > 0 ? v0 : v0 * w
    v0 = vbslq_f32(gz0, v0, v0neg);
    v1 = vbslq_f32(gz1, v1, v1neg);
    v2 = vbslq_f32(gz2, v2, v2neg);
    v3 = vbslq_f32(gz3, v3, v3neg);
    v4 = vbslq_f32(gz4, v4, v4neg);
    v5 = vbslq_f32(gz5, v5, v5neg);

    vst1q_f32(out + i + 0, v0);
    vst1q_f32(out + i + 4, v1);
    vst1q_f32(out + i + 8, v2);
    vst1q_f32(out + i + 12, v3);
    vst1q_f32(out + i + 16, v4);
    vst1q_f32(out + i + 20, v5);
  }

  for (; i < size; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
}

} // namespace

#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& W = Input(1);

  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  const auto* Xdata = X.template data<float>();
  const auto* Wdata = W.template data<float>();
  auto* Ydata = Y->template mutable_data<float>();

  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
  const auto C_shared = (W.numel() == 1);

  if (!C_shared) {
    CAFFE_ENFORCE_EQ(C, W.numel());
  }

  if (C_shared) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    // The function is completely pointwise
    runNeonPrelu(Ydata, Xdata, X.numel(), Wdata[0]);
#else
    ConstEigenVectorMap<float> Xvec(Xdata, X.numel());
    EigenVectorMap<float> Yvec(Ydata, Y->numel());
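    // PReLU via a max/min split: x > 0 ? x : w * x == max(x, 0) + w * min(x, 0)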
    Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    return true;
  }

  // non-shared case.
  switch (order_) {
    case StorageOrder::NCHW: {
      const auto N = X.size(0);
      const auto dim = X.size_from_dim(2);

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Pointwise for each channel
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          runNeonPrelu(
              Ydata + (n * C + c) * dim,
              Xdata + (n * C + c) * dim,
              dim,
              Wdata[c]);
        }
      }
#else
      int nc = 0;
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
          EigenVectorMap<float>(Ydata + nc * dim, dim) =
              Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
          nc++;
        }
      }
#endif
      break;
    }
    case StorageOrder::NHWC: {
      // View the NHWC data as a (C, NHW) column-major matrix; each column is
      // one spatial position's channel vector, scaled per-channel by W.
      const auto NHW = X.numel() / C;
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
      EigenArrayMap<float> Ymat(Ydata, C, NHW);
      Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }
  return true;
}

template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& dY = Input(1);
  auto& X = Input(2);
  auto& W = Input(3);

  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");

  DCHECK_EQ(dY.numel(), Y.numel());
  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
  auto* dW = Output(1, W.sizes(), at::dtype<float>());

  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
  const auto C_shared = (W.numel() == 1);

  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  const float* Xdata = X.data<float>();
  const float* Wdata = W.data<float>();
  float* dXdata = dX->template mutable_data<float>();
  float* dWdata = dW->template mutable_data<float>();

  switch (order_) {
    case StorageOrder::NCHW: {
      const auto dim = X.size_from_dim(2);
      const auto div_factor = C_shared ? C : 1;
      for (auto c = 0; c < W.numel(); ++c) {
        dWdata[c] = 0;
      }

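      // Slope gradient: only non-positive inputs contribute, since there
      // y = w * x, so dL/dw accumulates dY * X.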
      for (int i = 0; i < Y.numel(); ++i) {
        if (Xdata[i] <= 0) {
          int c = (i / dim) % C / div_factor;
          dWdata[c] += dYdata[i] * Xdata[i];
        }
      }

      for (int i = 0; i < Y.numel(); ++i) {
        if (Xdata[i] > 0) {
          dXdata[i] = dYdata[i];
        } else {
          int c = (i / dim) % C / div_factor;
          dXdata[i] = Wdata[c] * dYdata[i];
        }
      }
      break;
    }
    case StorageOrder::NHWC: {
      const auto NHW = X.numel() / C;
      ConstEigenVectorArrayMap<float> Wvec(Wdata, W.numel());
      EigenVectorArrayMap<float> dWvec(dWdata, dW->numel());

      ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
      ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      EigenArrayMap<float> dXmat(dXdata, C, NHW);

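      // In the selects below, the X > 0 branch picks Xmat.cwiseMin(0.f),
      // which is exactly zero there, so only non-positive inputs contribute
      // dY * X to the slope gradient.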
      if (C_shared) {
        dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
        dWdata[0] =
            (Xmat > 0)
                .select(
                    Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
                    dYmat * Xmat)
                .sum();
      } else {
        dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
        dWvec = (Xmat > 0)
                    .select(
                        Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
                        dYmat * Xmat)
                    .rowwise()
                    .sum();
      }
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    PReluGradient,
    PReluGradientOp<float, CPUContext>);

// Input: X, Slope; output: Y
OPERATOR_SCHEMA(PRelu)
    .NumInputs(2)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(

The *PRelu* op takes an input data tensor $X$ and an input slope tensor $slope$, and produces one output tensor $Y$ of the same shape as $X$. The op performs the element-wise *PRelu* operation, defined as

$$y=prelu(x) =\begin{cases}slope * x & x < 0\\x & otherwise\end{cases}$$

Note that if $slope$ is of size 1, its value is shared across the channels; otherwise $slope$ must be a 1D tensor with one value per channel of $X$. See [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852) for more information.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator(
    "PRelu",
    ["X", "Slope"],
    ["Y"],
)

workspace.FeedBlob("X", np.random.randn(3, 3).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"), "\n")

workspace.FeedBlob("Slope", np.array([0.1]).astype(np.float32))
print("Slope:\n", workspace.FetchBlob("Slope"), "\n")

workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```

**Result**

```
X:
 [[ 0.3957382  -0.19725518 -0.26991343]
 [ 1.5513182  -0.27427664 -0.14584002]
 [-0.4121164   0.9292345   0.96426094]]

Slope:
 [0.1]

Y:
 [[ 0.3957382  -0.01972552 -0.02699134]
 [ 1.5513182  -0.02742766 -0.014584  ]
 [-0.04121164  0.9292345   0.96426094]]
```

</details>

)DOC")
.Input(0, "X", "Input tensor of data to be operated on.")
|
|
.Input(
|
|
1,
|
|
"Slope",
|
|
"1D input slope tensor. If `Slope` is of size 1, the value is shared across different channels")
|
|
.Output(0, "Y", "Output tensor, with same shape as $X$.")
|
|
.InheritOnnxSchema();
|
|
|
|
// Input: Y, dY, X, W; output: dX, dW
GRADIENT_OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(

PReluGradient takes Y and dY, along with X and the slope W, and computes dX and
dW according to the chain rule and the derivative of the PReLU function:
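
$$\frac{\partial L}{\partial x_i}=\begin{cases}\frac{\partial L}{\partial y_i} & x_i > 0\\slope \cdot \frac{\partial L}{\partial y_i} & otherwise\end{cases} \qquad \frac{\partial L}{\partial slope}=\sum_{i:\, x_i \leq 0} x_i \cdot \frac{\partial L}{\partial y_i}$$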

)DOC");

class GetPReluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{O(0), GO(0), I(0), I(1)},
        vector<string>{GI(0), GI(1)});
  }
};

REGISTER_GRADIENT(PRelu, GetPReluGradient);

} // namespace caffe2