// Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-07 12:21:27 +01:00).
// Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/19713
// Adds elementwise_affine for layer_norm_op.
// Reviewed By: houseroad
// Differential Revision: D15075454
// fbshipit-source-id: e8a7d3da1c81e49fa55323f5e74a68bc4ef8d83f
#include "caffe2/operators/layer_norm_op.h"
|
|
|
|
#include "caffe2/core/operator_c10wrapper.h"
|
|
#include "caffe2/utils/eigen_utils.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
template <>
|
|
template <typename T>
|
|
void LayerNormOp<CPUContext>::ComputeSigmaAndFusedParams(
|
|
const int N,
|
|
const float eps,
|
|
const T* mean,
|
|
const T* var,
|
|
T* sigma,
|
|
T* scale,
|
|
T* bias,
|
|
CPUContext* context) {
|
|
ConstEigenVectorArrayMap<T> var_arr(var, N);
|
|
EigenVectorArrayMap<T> sigma_arr(sigma, N);
|
|
sigma_arr = var_arr + static_cast<T>(eps);
|
|
math::Rsqrt<T>(N, sigma, scale, context);
|
|
math::Mul<T>(N, scale, sigma, sigma, context);
|
|
EigenVectorArrayMap<T>(bias, N) = -ConstEigenVectorArrayMap<T>(scale, N) *
|
|
ConstEigenVectorArrayMap<T>(mean, N);
|
|
}
|
|
|
|
template <>
|
|
template <typename T>
|
|
void LayerNormOp<CPUContext>::LayerNormForward(
|
|
const int M,
|
|
const int N,
|
|
const T* X,
|
|
const T* scale,
|
|
const T* bias,
|
|
const T* gamma,
|
|
const T* beta,
|
|
T* Y,
|
|
CPUContext* /* context */) {
|
|
ConstEigenArrayMap<T> X_arr(X, N, M);
|
|
ConstEigenVectorArrayMap<T> scale_arr(scale, M);
|
|
ConstEigenVectorArrayMap<T> bias_arr(bias, M);
|
|
EigenArrayMap<T> Y_arr(Y, N, M);
|
|
if (gamma != nullptr && beta != nullptr) {
|
|
ConstEigenVectorArrayMap<T> gamma_arr(gamma, N);
|
|
ConstEigenVectorArrayMap<T> beta_arr(beta, N);
|
|
Y_arr = (((X_arr.rowwise() * scale_arr.transpose()).rowwise() +
|
|
bias_arr.transpose())
|
|
.colwise() *
|
|
gamma_arr)
|
|
.colwise() +
|
|
beta_arr;
|
|
} else {
|
|
CAFFE_ENFORCE(gamma == nullptr);
|
|
CAFFE_ENFORCE(beta == nullptr);
|
|
Y_arr = (X_arr.rowwise() * scale_arr.transpose()).rowwise() +
|
|
bias_arr.transpose();
|
|
}
|
|
}
|
|
|
|
REGISTER_CPU_OPERATOR(LayerNorm, LayerNormOp<CPUContext>);
|
|
|
|
template <>
|
|
template <typename T>
|
|
void LayerNormGradientOp<CPUContext>::ComputeInternalGradients(
|
|
const int M,
|
|
const int N,
|
|
const T* dY,
|
|
const T* X,
|
|
T* ds,
|
|
T* db) {
|
|
ConstEigenArrayMap<T> dY_arr(dY, N, M);
|
|
ConstEigenArrayMap<T> X_arr(X, N, M);
|
|
for (int i = 0; i < M; ++i) {
|
|
ds[i] = (dY_arr.col(i) * X_arr.col(i)).sum();
|
|
db[i] = dY_arr.col(i).sum();
|
|
}
|
|
}
|
|
|
|
template <>
|
|
template <typename T>
|
|
void LayerNormGradientOp<CPUContext>::ComputeFusedParams(
|
|
const int M,
|
|
const int N,
|
|
const T* mean,
|
|
const T* sig,
|
|
const T* ds,
|
|
const T* db,
|
|
T* dY_scale,
|
|
T* X_scale,
|
|
T* bias) {
|
|
const T scale = T(1) / static_cast<T>(N);
|
|
ConstEigenVectorArrayMap<T> mean_arr(mean, M);
|
|
ConstEigenVectorArrayMap<T> ds_arr(ds, M);
|
|
ConstEigenVectorArrayMap<T> db_arr(db, M);
|
|
EigenVectorArrayMap<T> rsig_arr(dY_scale, M);
|
|
EigenVectorArrayMap<T> X_scale_arr(X_scale, M);
|
|
rsig_arr = ConstEigenVectorArrayMap<T>(sig, M).inverse();
|
|
X_scale_arr = (db_arr * mean_arr - ds_arr) * rsig_arr.cube() * scale;
|
|
EigenVectorArrayMap<T>(bias, M) =
|
|
-X_scale_arr * mean_arr - db_arr * rsig_arr * scale;
|
|
}
|
|
|
|
template <>
|
|
template <typename T>
|
|
void LayerNormGradientOp<CPUContext>::LayerNormBackward(
|
|
const int M,
|
|
const int N,
|
|
const T* dY_scale,
|
|
const T* dY,
|
|
const T* X_scale,
|
|
const T* X,
|
|
const T* bias,
|
|
T* dX) {
|
|
EigenArrayMap<T>(dX, N, M) =
|
|
(ConstEigenArrayMap<T>(dY, N, M).rowwise() *
|
|
ConstEigenVectorArrayMap<T>(dY_scale, M).transpose() +
|
|
ConstEigenArrayMap<T>(X, N, M).rowwise() *
|
|
ConstEigenVectorArrayMap<T>(X_scale, M).transpose())
|
|
.rowwise() +
|
|
ConstEigenVectorArrayMap<T>(bias, M).transpose();
|
|
}
|
|
|
|
OPERATOR_SCHEMA(LayerNormGradient).NumInputs(5).NumOutputs(1);
|
|
|
|
REGISTER_CPU_OPERATOR(LayerNormGradient, LayerNormGradientOp<CPUContext>);
|
|
|
|
namespace {
|
|
|
|
class GetLayerNormGradient : public GradientMakerBase {
|
|
using GradientMakerBase::GradientMakerBase;
|
|
std::vector<OperatorDef> GetGradientDefs() override {
|
|
return SingleGradientDef(
|
|
"LayerNormGradient",
|
|
"",
|
|
std::vector<std::string>{GO(0), O(0), O(1), O(2), I(0)},
|
|
std::vector<std::string>{GI(0)});
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
REGISTER_GRADIENT(LayerNorm, GetLayerNormGradient);
|
|
|
|
OPERATOR_SCHEMA(LayerNorm)
|
|
.NumInputs({1, 3})
|
|
.NumOutputs(3)
|
|
.TensorInferenceFunction([](const OperatorDef& def,
|
|
const vector<TensorShape>& in) {
|
|
std::vector<TensorShape> out(3);
|
|
auto input_dims_long = GetDimsVector(in[0]);
|
|
std::vector<int> input_dims(
|
|
input_dims_long.begin(), input_dims_long.end());
|
|
out[0] = CreateTensorShape(input_dims, TensorProto::FLOAT);
|
|
|
|
ArgumentHelper helper(def);
|
|
|
|
auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
|
|
const auto canonical_axis =
|
|
canonical_axis_index_(axis, in[0].dims().size());
|
|
std::vector<int> stat_dims(
|
|
input_dims.begin(), input_dims.begin() + canonical_axis);
|
|
stat_dims.push_back(1);
|
|
out[1] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
|
|
out[2] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
|
|
return out;
|
|
})
|
|
.SetDoc(R"DOC(
|
|
Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf.
|
|
Given an input vector x \in [a_0, a_1, ...,a_{k-1}, a_k, ..., a_{n-1}],
|
|
this op treats dimensions a_k through a_{n-1} as feature vectors. For each
|
|
feature vector, the op contains the mean and standard deviation. Then,
|
|
it returns the normalized values (with respect to the feature vector).
|
|
|
|
Note that this op does not contain the scale an bias terms described in the
|
|
paper. Simply follow this op with an FC op to add those. Concretely, this op
|
|
implements:
|
|
|
|
h = \frac{1}{\sigma}(a - \mu)
|
|
where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i
|
|
and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2}
|
|
where H is the number of hidden units (i.e. product of dimensions from 'axis'
|
|
to the end.)
|
|
)DOC")
|
|
.Arg(
|
|
"axis",
|
|
"(int) default to 1; Describes axis of the inputs. Defaults to one "
|
|
"because the 0th axis most likely describes the batch size")
|
|
.Arg(
|
|
"epsilon",
|
|
"(float) default to 0.001. Small value to be added to the stdev when"
|
|
" dividing out by that value. This prevents division by zero.")
|
|
.Arg(
|
|
"elementwise_affine",
|
|
"(bool) default to False; If true, this op will do affine "
|
|
"transformation after normalization.")
|
|
.Input(
|
|
0,
|
|
"input",
|
|
"Input tensor which layer normalization will be applied to")
|
|
.Input(
|
|
1,
|
|
"gamma",
|
|
"scale tensor for elementwise_affine, the shape should be the same as "
|
|
"the dimensions of X begin from axis")
|
|
.Input(
|
|
2,
|
|
"beta",
|
|
"bias tensor for elementwise_affine, the shape should be the same as "
|
|
"the dimensions of X begin from axis")
|
|
.Output(0, "output", "Normalized values")
|
|
.Output(1, "mean", "Mean values for each feature vector")
|
|
.Output(2, "stddev", "Standard deviations for each feature vector");
|
|
|
|
} // namespace caffe2
|
|
|
|
C10_REGISTER_CAFFE2_OPERATOR_CPU(
|
|
LayerNorm,
|
|
"_caffe2::LayerNorm("
|
|
" Tensor X,"
|
|
" Tensor? gamma,"
|
|
" Tensor? beta,"
|
|
" int axis = 1,"
|
|
" float epsilon = 1e-5,"
|
|
" bool elementwise_affine = False"
|
|
") -> (Tensor Y, Tensor mean, Tensor std)",
|
|
caffe2::LayerNormOp<caffe2::CPUContext>)
|
|
|
|
namespace caffe2 {
|
|
|
|
REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU(
|
|
"_caffe2::LayerNorm",
|
|
C10LayerNorm_DontUseThisOpYet);
|
|
|
|
} // namespace caffe2
|