#include "Eigen/Core"
|
|
#include "caffe2/utils/eigen_utils.h"
|
|
|
|
#if EIGEN_VERSION_AT_LEAST(3, 3, 0)
|
|
|
|
#include "caffe2/core/context.h"
|
|
#include "caffe2/core/operator.h"
|
|
#include "caffe2/operators/conv_pool_op_base.h"
|
|
|
|
#include "unsupported/Eigen/CXX11/Tensor"
|
|
|
|
namespace caffe2 {
|
|
|
|

// A Conv operator implemented on top of Eigen's Tensor module. Only
// group == 1 convolutions are supported.
template <typename T>
class EigenConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  explicit EigenConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(group_ == 1, "Group convolution not supported yet.");
  }
  ~EigenConvOp() override {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

 private:
  INPUT_TAGS(INPUT, FILTER, BIAS);
};
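
// Usage sketch: this op is registered at the bottom of the file under the
// "EIGEN" engine, so a Conv OperatorDef that sets engine = "EIGEN" dispatches
// here. A minimal, illustrative OperatorDef (the blob names are
// hypothetical):
//
//   OperatorDef def;
//   def.set_type("Conv");
//   def.set_engine("EIGEN");
//   def.add_input("X");       // data, NCHW or NHWC depending on "order"
//   def.add_input("filter");
//   def.add_input("bias");    // optional third input
//   def.add_output("Y");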

// The NCHW implementation: we do explicit transposes before and after the
// Eigen call. This is not ideal, but it provides a compatible path rather
// than erroring out on NCHW inputs.
template <typename T>
bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto* Y = Output(0);
  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(4 == filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == C);
  CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  // Shuffle the filter from (M, C, kH, kW) to (kH, kW, C, M) and the input
  // from NCHW to NHWC.
  Eigen::array<int64_t, 4> kernel_shuffles{
      {int64_t(2), int64_t(3), int64_t(1), int64_t(0)}};
  Eigen::array<int64_t, 4> input_shuffles{
      {int64_t(0), int64_t(2), int64_t(3), int64_t(1)}};
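
  // Note on shuffle semantics (Eigen convention): output dimension i of
  // Tensor::shuffle takes input dimension shuffle[i]. For example, applying
  // input_shuffles = {0, 2, 3, 1} to dims (N, C, H, W) yields (N, H, W, C),
  // i.e. an NCHW -> NHWC transpose.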

  Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
      Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
          const_cast<T*>(filter.template data<T>()),
          M,
          C,
          kernel_h(),
          kernel_w())
          .shuffle(kernel_shuffles);
  Eigen::Tensor<T, 4, Eigen::RowMajor> X_tensor =
      Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
          const_cast<T*>(X.template data<T>()), N, C, H, W)
          .shuffle(input_shuffles);

  // In Eigen's image-patch extraction, "row" and "col" correspond to width
  // and height respectively, not the other way round; note the order in which
  // the kernel, stride, pad, and dilation values are passed below (e.g.
  // kernel_w() goes in the row slot and pad_l()/pad_r() in the top/bottom
  // padding slots).
  typedef typename Eigen::internal::traits<
      Eigen::Tensor<T, 4, Eigen::RowMajor>>::Index TensorIndex;
  Eigen::array<Eigen::IndexPair<TensorIndex>, 1> contract_dims;
  contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);

  Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
  pre_contract_dims[1] = kernel_h() * kernel_w() * C;
  pre_contract_dims[0] = Y->numel() / M;

  Eigen::DSizes<TensorIndex, 2> kernel_dims;
  kernel_dims[0] = kernel_h() * kernel_w() * C;
  kernel_dims[1] = M;
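
  // Worked shape example (illustrative numbers, not taken from the code):
  // with N = 1, C = 3, H = W = 5, a 3x3 kernel, stride 1, and no padding,
  // the output is 3x3 spatially, so pre_contract_dims = {1 * 3 * 3,
  // 3 * 3 * 3} = {9, 27} and kernel_dims = {27, M}. The contraction below is
  // then a plain (9 x 27) * (27 x M) matrix product, one M-vector per output
  // pixel.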

  // Note: bcast_dims is set up here but not used below.
  Eigen::array<TensorIndex, 4> bcast_dims;
  bcast_dims[0] = N;
  bcast_dims[1] = Y->dim32(1);
  bcast_dims[2] = Y->dim32(2);
  bcast_dims[3] = 1;

  // Intermediate output held in NHWC order; it is transposed back to NCHW at
  // the end of this function.
  Eigen::Tensor<T, 4, Eigen::RowMajor> Y_tensor(
      Y->dim32(0), Y->dim32(2), Y->dim32(3), Y->dim32(1));
  Y_tensor = X_tensor
                 .extract_image_patches(
                     kernel_w(),
                     kernel_h(),
                     stride_w(),
                     stride_h(),
                     dilation_w(),
                     dilation_h(),
                     1,
                     1,
                     pad_l(),
                     pad_r(),
                     pad_t(),
                     pad_b(),
                     0)
                 .reshape(pre_contract_dims)
                 .contract(filter_tensor.reshape(kernel_dims), contract_dims)
                 .reshape(Y_tensor.dimensions());
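
  // The chain above is effectively im2col + GEMM: extract_image_patches
  // materializes one (kH * kW * C)-long patch per output pixel, the first
  // reshape flattens the patches into a (numPixels) x (kH * kW * C) matrix,
  // and contract multiplies it by the (kH * kW * C) x M filter matrix.
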
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(1 == bias.dim());
    CAFFE_ENFORCE(bias.dim32(0) == M);
    // The Eigen broadcast for the bias still appears to be slower, so do the
    // column-wise add below for now.
    EigenArrayMap<T> Y_arr(
        Y_tensor.data(), static_cast<int64_t>(M), Y->numel() / M);
    ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
    Y_arr = Y_arr.colwise() + bias_arr;
  }
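
  // Why the column-wise add works: Y_tensor is NHWC, so the channel index
  // moves fastest in memory. Mapping the buffer as a column-major
  // M x (numel / M) array therefore puts the M channel values of each output
  // pixel into one column, and colwise() + bias_arr adds bias[m] to channel m
  // of every pixel.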

  // Finally, transpose the NHWC result back into the NCHW output blob.
  Eigen::array<int64_t, 4> output_shuffles{
      {int64_t(0), int64_t(3), int64_t(1), int64_t(2)}};

  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
      Y->template mutable_data<T>(), N, M, Y->dim32(2), Y->dim32(3)) =
      Y_tensor.shuffle(output_shuffles);
  return true;
}

template <typename T>
bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto* Y = Output(0);
  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
  CAFFE_ENFORCE(4 == filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
  CAFFE_ENFORCE(filter.dim32(3) == C);
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  // Eigen expects the filter in (kernel_h, kernel_w, C, M) layout for
  // performance reasons, so build a transposed temporary copy of it.
  Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
      M, kernel_h() * kernel_w() * C);
  temp_filter = ConstEigenArrayMap<T>(
                    filter.template data<T>(), kernel_h() * kernel_w() * C, M)
                    .transpose();
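
  // Layout note: the raw filter is (M, kH, kW, C) row-major. Mapping it as a
  // column-major (kH * kW * C) x M array makes each column one filter; the
  // transpose yields an M x (kH * kW * C) column-major array whose storage
  // can be re-viewed as a row-major (kH, kW, C, M) tensor below.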

  // Create tensor maps, then run the convolution as patch extraction plus
  // contraction.
  // TODO(jiayq): right now we const_cast away the const pointer, but we will
  // need to figure out how to properly do a const TensorMap.
  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> X_tensor(
      const_cast<T*>(X.template data<T>()), N, H, W, C);
  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> Y_tensor(
      Y->template mutable_data<T>(), N, Y->dim32(1), Y->dim32(2), M);
  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> filter_tensor(
      const_cast<T*>(temp_filter.data()), kernel_h(), kernel_w(), C, M);

  // As in the NCHW path: Eigen's "row" and "col" correspond to width and
  // height respectively, so the stride, pad, and dilation values are passed
  // accordingly below.
  typedef typename Eigen::internal::traits<
      Eigen::Tensor<T, 4, Eigen::RowMajor>>::Index TensorIndex;
  Eigen::array<Eigen::IndexPair<TensorIndex>, 1> contract_dims;
  contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);

  Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
  pre_contract_dims[1] = kernel_h() * kernel_w() * C;
  pre_contract_dims[0] = Y->numel() / M;

  Eigen::DSizes<TensorIndex, 2> kernel_dims;
  kernel_dims[0] = kernel_h() * kernel_w() * C;
  kernel_dims[1] = M;

  Eigen::array<TensorIndex, 4> bcast_dims;
  bcast_dims[0] = N;
  bcast_dims[1] = Y->dim32(1);
  bcast_dims[2] = Y->dim32(2);
  bcast_dims[3] = 1;

  Y_tensor = X_tensor
                 .extract_image_patches(
                     kernel_w(),
                     kernel_h(),
                     stride_w(),
                     stride_h(),
                     dilation_w(),
                     dilation_h(),
                     1,
                     1,
                     pad_l(),
                     pad_r(),
                     pad_t(),
                     pad_b(),
                     0)
                 .reshape(pre_contract_dims)
                 .contract(filter_tensor.reshape(kernel_dims), contract_dims)
                 .reshape(Y_tensor.dimensions());

  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(1 == bias.dim());
    CAFFE_ENFORCE(bias.dim32(0) == M);
    Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> bias_tensor(
        const_cast<T*>(bias.template data<T>()), 1, 1, 1, M);
    // The Eigen broadcast for the bias still appears to be slower, so do the
    // same column-wise add as in the NCHW path for now (bias_tensor above is
    // left unused).
    EigenArrayMap<T> Y_arr(
        Y->template mutable_data<T>(),
        static_cast<int64_t>(M),
        Y->numel() / M);
    ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
    Y_arr = Y_arr.colwise() + bias_arr;
  }
  return true;
}

REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, EIGEN, EigenConvOp<float>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv1D, EIGEN, EigenConvOp<float>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv2D, EIGEN, EigenConvOp<float>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv3D, EIGEN, EigenConvOp<float>);

} // namespace caffe2

#endif // EIGEN_VERSION_AT_LEAST(3, 3, 0)