mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Back out "[pt1][tensor] Change ConvPoolOp<Context>::SetOutputSize to ConvPoolOp<Context>::GetOutputSize" (#16516)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16516 Original commit changeset: 64abce3dbaed Reviewed By: dzhulgakov Differential Revision: D13863715 fbshipit-source-id: f1923fdca4a1a82768d9c280a8493ff15a7eb2ba
This commit is contained in:
parent
cdbd388206
commit
2af95d8e3e
|
|
@ -119,6 +119,7 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
auto& filter = Input(1);
|
auto& filter = Input(1);
|
||||||
auto& bias = Input(2);
|
auto& bias = Input(2);
|
||||||
|
auto* Y = Output(0);
|
||||||
|
|
||||||
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
|
|
@ -132,8 +133,7 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
|
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
|
||||||
CAFFE_ENFORCE(bias.numel() == M, "");
|
CAFFE_ENFORCE(bias.numel() == M, "");
|
||||||
|
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
const int oH = Y->dim32(2), oW = Y->dim32(3);
|
const int oH = Y->dim32(2), oW = Y->dim32(3);
|
||||||
|
|
||||||
if (N > 1) {
|
if (N > 1) {
|
||||||
|
|
@ -250,10 +250,10 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
|
|
||||||
bool RunOnDeviceWithOrderNCHW() override {
|
bool RunOnDeviceWithOrderNCHW() override {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
CAFFE_ENFORCE(X.dim() == 4, "");
|
CAFFE_ENFORCE(X.dim() == 4, "");
|
||||||
const int H = X.dim32(2), W = X.dim32(3);
|
const int H = X.dim32(2), W = X.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
std::vector<int> pads(
|
std::vector<int> pads(
|
||||||
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
|
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
|
||||||
std::vector<int> stride({this->stride_h(), this->stride_w()});
|
std::vector<int> stride({this->stride_h(), this->stride_w()});
|
||||||
|
|
|
||||||
|
|
@ -196,8 +196,8 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
|
||||||
|
|
||||||
bool RunOnDeviceWithOrderNCHW() override {
|
bool RunOnDeviceWithOrderNCHW() override {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
|
auto* Y = Output(0);
|
||||||
auto* Y = Output(0, output_sizes, at::dtype<float>());
|
ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));
|
||||||
|
|
||||||
if (input_dims_ != X.sizes()) {
|
if (input_dims_ != X.sizes()) {
|
||||||
// recompile
|
// recompile
|
||||||
|
|
|
||||||
|
|
@ -257,10 +257,11 @@ void computeOutputHW(
|
||||||
int* OH,
|
int* OH,
|
||||||
int* OW) {
|
int* OW) {
|
||||||
Tensor input = caffe2::empty({1, 1, H, W}, at::dtype<float>().device(CPU));
|
Tensor input = caffe2::empty({1, 1, H, W}, at::dtype<float>().device(CPU));
|
||||||
auto sizes = op->GetOutputSize(input, 1);
|
Tensor output(CPU);
|
||||||
CAFFE_ENFORCE_EQ(sizes.size(), 4);
|
op->SetOutputSize(input, &output, 1);
|
||||||
*OH = sizes[2];
|
CAFFE_ENFORCE_EQ(output.dim(), 4);
|
||||||
*OW = sizes[3];
|
*OH = output.size(2);
|
||||||
|
*OW = output.size(3);
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr int computeMPSAlignOffset(int kernel, int pad) {
|
constexpr int computeMPSAlignOffset(int kernel, int pad) {
|
||||||
|
|
|
||||||
|
|
@ -516,13 +516,13 @@ template <typename T_X, typename T_W, typename T_B, typename T_Y>
|
||||||
bool CudnnConvOp::DoRunWithType() {
|
bool CudnnConvOp::DoRunWithType() {
|
||||||
auto& X = Input(INPUT);
|
auto& X = Input(INPUT);
|
||||||
auto& filter = Input(FILTER);
|
auto& filter = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
|
|
||||||
// Figure out the output shape
|
// Figure out the output shape
|
||||||
CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5);
|
CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5);
|
||||||
CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5);
|
CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5);
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, M);
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, M);
|
||||||
auto* Y = Output(0, output_sizes, at::dtype<T_Y>());
|
|
||||||
int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
|
int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
|
||||||
int group_offset_X = 0, group_offset_Y = 0;
|
int group_offset_X = 0, group_offset_Y = 0;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,14 +34,14 @@ template <typename T>
|
||||||
bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
|
bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& X = Input(INPUT);
|
auto& X = Input(INPUT);
|
||||||
auto& filter = Input(FILTER);
|
auto& filter = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
||||||
CAFFE_ENFORCE(4 == filter.dim());
|
CAFFE_ENFORCE(4 == filter.dim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
CAFFE_ENFORCE(filter.dim32(1) == C);
|
CAFFE_ENFORCE(filter.dim32(1) == C);
|
||||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
||||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
Eigen::array<int64_t, 4> kernel_shuffles
|
Eigen::array<int64_t, 4> kernel_shuffles
|
||||||
{ {int64_t(2), int64_t(3), int64_t(1), int64_t(0)} };
|
{ {int64_t(2), int64_t(3), int64_t(1), int64_t(0)} };
|
||||||
Eigen::array<int64_t, 4> input_shuffles
|
Eigen::array<int64_t, 4> input_shuffles
|
||||||
|
|
@ -128,14 +128,14 @@ template <typename T>
|
||||||
bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
|
bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
|
||||||
auto& X = Input(INPUT);
|
auto& X = Input(INPUT);
|
||||||
auto& filter = Input(FILTER);
|
auto& filter = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
|
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
|
||||||
CAFFE_ENFORCE(4 == filter.dim());
|
CAFFE_ENFORCE(4 == filter.dim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
|
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
|
||||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
|
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
|
||||||
CAFFE_ENFORCE(filter.dim32(3) == C);
|
CAFFE_ENFORCE(filter.dim32(3) == C);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
// Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for
|
// Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for
|
||||||
// optimization purposes, so we will create a temp one.
|
// optimization purposes, so we will create a temp one.
|
||||||
Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
|
Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ template <typename T, class Context>
|
||||||
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
const auto& X = Input(INPUT);
|
const auto& X = Input(INPUT);
|
||||||
const auto& filter = Input(FILTER);
|
const auto& filter = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int N = X.dim32(0);
|
const int N = X.dim32(0);
|
||||||
const int C = X.dim32(1);
|
const int C = X.dim32(1);
|
||||||
const int G = group_;
|
const int G = group_;
|
||||||
|
|
@ -43,8 +44,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
|
CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
|
||||||
kernel_size *= kernel_[i];
|
kernel_size *= kernel_[i];
|
||||||
}
|
}
|
||||||
auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, M);
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);
|
||||||
auto* Y = Output(0, output_sizes, at::dtype<T>());
|
|
||||||
const vector<int> X_dims = GetDims(X);
|
const vector<int> X_dims = GetDims(X);
|
||||||
const vector<int> Y_dims = GetDims(*Y);
|
const vector<int> Y_dims = GetDims(*Y);
|
||||||
const int X_HxW = X.numel() / (N * C);
|
const int X_HxW = X.numel() / (N * C);
|
||||||
|
|
@ -190,6 +190,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||||
"Only 1-3d convolution is supported for NHWC storage type");
|
"Only 1-3d convolution is supported for NHWC storage type");
|
||||||
const Tensor& X = Input(INPUT);
|
const Tensor& X = Input(INPUT);
|
||||||
const auto& filter = Input(FILTER);
|
const auto& filter = Input(FILTER);
|
||||||
|
Tensor* Y = Output(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
|
const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
|
||||||
const int G = group_;
|
const int G = group_;
|
||||||
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
|
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
|
||||||
|
|
@ -211,8 +212,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||||
CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
|
CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
|
||||||
kernel_size *= kernel_[i];
|
kernel_size *= kernel_[i];
|
||||||
}
|
}
|
||||||
auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, M);
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);
|
||||||
auto* Y = Output(0, output_sizes, at::dtype<T>());
|
|
||||||
const vector<int> Y_dims = GetDims(*Y);
|
const vector<int> Y_dims = GetDims(*Y);
|
||||||
const int X_HxW = X.numel() / (N * C);
|
const int X_HxW = X.numel() / (N * C);
|
||||||
const int Y_HxW = Y->numel() / (N * M);
|
const int Y_HxW = Y->numel() / (N * M);
|
||||||
|
|
|
||||||
|
|
@ -207,7 +207,7 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Gets the output size. The output channel is manually provided since
|
// Sets the output size. The output channel is manually provided since
|
||||||
// it may not be identical to the input channels.
|
// it may not be identical to the input channels.
|
||||||
// This function can be used in the forward functions to obtain the output
|
// This function can be used in the forward functions to obtain the output
|
||||||
// sizes.
|
// sizes.
|
||||||
|
|
@ -215,7 +215,8 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||||
// implementations that do not use first-class Tensor objects, such as the
|
// implementations that do not use first-class Tensor objects, such as the
|
||||||
// MKL operator. One can still call this function with dummy
|
// MKL operator. One can still call this function with dummy
|
||||||
// Tensor objects in order to obtain the sizes.
|
// Tensor objects in order to obtain the sizes.
|
||||||
std::vector<int64_t> GetOutputSize(const Tensor& input, int output_channel) {
|
// TODO: passing sizes directly rather than Tensor
|
||||||
|
void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) {
|
||||||
CAFFE_ENFORCE(input.numel() > 0);
|
CAFFE_ENFORCE(input.numel() > 0);
|
||||||
vector<int> output_dims;
|
vector<int> output_dims;
|
||||||
int N = input.dim32(0);
|
int N = input.dim32(0);
|
||||||
|
|
@ -240,7 +241,7 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||||
output_dims.insert(output_dims.begin(), N);
|
output_dims.insert(output_dims.begin(), N);
|
||||||
output_dims.push_back(output_channel);
|
output_dims.push_back(output_channel);
|
||||||
}
|
}
|
||||||
return std::vector<int64_t>(output_dims.cbegin(), output_dims.cend());
|
output->Resize(output_dims);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function that is also called from OperatorSchema. Modified
|
// Helper function that is also called from OperatorSchema. Modified
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
const Tensor& X = Input(INPUT);
|
const Tensor& X = Input(INPUT);
|
||||||
const Tensor& offset = Input(OFFSET);
|
const Tensor& offset = Input(OFFSET);
|
||||||
auto& filter = Input(FILTER);
|
auto& filter = Input(FILTER);
|
||||||
|
Tensor* Y = Output(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(1);
|
const int N = X.dim32(0), C = X.dim32(1);
|
||||||
CAFFE_ENFORCE_EQ(X.dim(), filter.ndim());
|
CAFFE_ENFORCE_EQ(X.dim(), filter.ndim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
|
|
@ -81,8 +82,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
kernel_dims_size *= kernel_[i];
|
kernel_dims_size *= kernel_[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
auto* Y = Output(0, output_sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
const vector<int> input_dims = GetDims(X);
|
const vector<int> input_dims = GetDims(X);
|
||||||
const vector<int> output_dims = GetDims(*Y);
|
const vector<int> output_dims = GetDims(*Y);
|
||||||
|
|
@ -196,8 +196,8 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& offset = Input(OFFSET);
|
auto& offset = Input(OFFSET);
|
||||||
auto& filter = Input(FILTER);
|
auto& filter = Input(FILTER);
|
||||||
auto& dY = Input(OUTPUT_GRAD);
|
auto& dY = Input(OUTPUT_GRAD);
|
||||||
|
|
||||||
|
|
||||||
const int N = X.dim32(0), C = X.dim32(1);
|
const int N = X.dim32(0), C = X.dim32(1);
|
||||||
|
|
||||||
const vector<int> input_dims = this->GetDims(X);
|
const vector<int> input_dims = this->GetDims(X);
|
||||||
|
|
@ -303,7 +303,7 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
|
|
||||||
T* dbias_data = nullptr;
|
T* dbias_data = nullptr;
|
||||||
if (!no_bias_) {
|
if (!no_bias_) {
|
||||||
|
|
||||||
auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
|
auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
|
||||||
if (bias_multiplier_.size() != output_image_size) {
|
if (bias_multiplier_.size() != output_image_size) {
|
||||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||||
|
|
@ -323,7 +323,7 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
|
|
||||||
T* dXdata = nullptr;
|
T* dXdata = nullptr;
|
||||||
if (OutputSize() == 4 || (no_bias_ && (OutputSize() == 3))) {
|
if (OutputSize() == 4 || (no_bias_ && (OutputSize() == 3))) {
|
||||||
|
|
||||||
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
|
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
|
||||||
dXdata = dX->template mutable_data<T>();
|
dXdata = dX->template mutable_data<T>();
|
||||||
math::Set<T, Context>(dX->size(), 0, dXdata, &context_);
|
math::Set<T, Context>(dX->size(), 0, dXdata, &context_);
|
||||||
|
|
|
||||||
|
|
@ -288,6 +288,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CUDAContext> {
|
||||||
bool RunOnDeviceWithOrderNCHW() override {
|
bool RunOnDeviceWithOrderNCHW() override {
|
||||||
const Tensor& X = Input(0);
|
const Tensor& X = Input(0);
|
||||||
auto& filter = Input(1);
|
auto& filter = Input(1);
|
||||||
|
Tensor* Y = Output(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(1);
|
const int N = X.dim32(0), C = X.dim32(1);
|
||||||
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
|
|
@ -299,8 +300,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CUDAContext> {
|
||||||
CAFFE_ENFORCE_EQ(this->kernel_w(), 3);
|
CAFFE_ENFORCE_EQ(this->kernel_w(), 3);
|
||||||
CAFFE_ENFORCE_EQ(this->kernel_h(), 3);
|
CAFFE_ENFORCE_EQ(this->kernel_h(), 3);
|
||||||
CAFFE_ENFORCE_EQ(this->stride_h(), this->stride_w());
|
CAFFE_ENFORCE_EQ(this->stride_h(), this->stride_w());
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
DepthwiseArgs args;
|
DepthwiseArgs args;
|
||||||
args.batch = X.dim32(0);
|
args.batch = X.dim32(0);
|
||||||
args.in_rows = X.dim32(2);
|
args.in_rows = X.dim32(2);
|
||||||
|
|
@ -455,7 +455,7 @@ class Depthwise3x3ConvGradientOp final : public ConvPoolOpBase<CUDAContext> {
|
||||||
M,
|
M,
|
||||||
dY.dim32(2),
|
dY.dim32(2),
|
||||||
dY.dim32(3)));
|
dY.dim32(3)));
|
||||||
|
|
||||||
auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<float>());
|
auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<float>());
|
||||||
CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
|
CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
|
||||||
cudnn_wrapper_.inline_cudnn_handle(),
|
cudnn_wrapper_.inline_cudnn_handle(),
|
||||||
|
|
|
||||||
|
|
@ -205,6 +205,7 @@ template <typename T_X, typename T_W, typename T_B, typename MATH, typename T_Y>
|
||||||
bool MIOPENConvOp::DoRunWithType() {
|
bool MIOPENConvOp::DoRunWithType() {
|
||||||
auto& X = Input(INPUT);
|
auto& X = Input(INPUT);
|
||||||
auto& Weight = Input(FILTER);
|
auto& Weight = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
|
|
||||||
// Figure out the output shape
|
// Figure out the output shape
|
||||||
CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
|
CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
|
||||||
|
|
@ -213,8 +214,7 @@ bool MIOPENConvOp::DoRunWithType() {
|
||||||
"Conv op with MIOpen engine is supported only for 2D convolutions");
|
"Conv op with MIOpen engine is supported only for 2D convolutions");
|
||||||
|
|
||||||
const int M = Weight.dim32(0);
|
const int M = Weight.dim32(0);
|
||||||
auto sizes = ConvPoolOpBase<HIPContext>::GetOutputSize(X, M);
|
ConvPoolOpBase<HIPContext>::SetOutputSize(X, Y, M);
|
||||||
auto* Y = Output(0, sizes, at::dtype<T_Y>());
|
|
||||||
|
|
||||||
int N = X.dim32(0);
|
int N = X.dim32(0);
|
||||||
int C = X.dim32(1);
|
int C = X.dim32(1);
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,7 @@ class MIOPENPoolOp : public ConvPoolOpBase<HIPContext> {
|
||||||
template <typename T, typename M>
|
template <typename T, typename M>
|
||||||
bool DoRunWithType() {
|
bool DoRunWithType() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
int N = 0, C = 0, H = 0, W = 0, D = 0;
|
int N = 0, C = 0, H = 0, W = 0, D = 0;
|
||||||
int N_out = 0, C_out = 0, H_out = 0, W_out = 0;
|
int N_out = 0, C_out = 0, H_out = 0, W_out = 0;
|
||||||
CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5);
|
CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5);
|
||||||
|
|
@ -68,8 +69,7 @@ class MIOPENPoolOp : public ConvPoolOpBase<HIPContext> {
|
||||||
C = X.dim32(1);
|
C = X.dim32(1);
|
||||||
H = X.dim32(2);
|
H = X.dim32(2);
|
||||||
W = X.ndim() > 3 ? X.dim32(3) : 1;
|
W = X.ndim() > 3 ? X.dim32(3) : 1;
|
||||||
auto sizes = ConvPoolOpBase::GetOutputSize(X, C);
|
ConvPoolOpBase::SetOutputSize(X, Y, C);
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
N_out = Y->dim32(0);
|
N_out = Y->dim32(0);
|
||||||
C_out = Y->dim32(1);
|
C_out = Y->dim32(1);
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ template <typename T, class Context>
|
||||||
bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
const auto& X = Input(INPUT);
|
const auto& X = Input(INPUT);
|
||||||
const auto& filter = Input(FILTER);
|
const auto& filter = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int image_ndim = X.dim() - 2;
|
const int image_ndim = X.dim() - 2;
|
||||||
CAFFE_ENFORCE_EQ(X.dim() + image_ndim, filter.dim());
|
CAFFE_ENFORCE_EQ(X.dim() + image_ndim, filter.dim());
|
||||||
lc_op_util::ShapeParams shape;
|
lc_op_util::ShapeParams shape;
|
||||||
|
|
@ -40,8 +41,7 @@ bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||||
0,
|
0,
|
||||||
"The number of output channels is not divisible by group.");
|
"The number of output channels is not divisible by group.");
|
||||||
|
|
||||||
auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, shape.M);
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, shape.M);
|
||||||
auto* Y = Output(0, output_sizes, at::dtype<T>());
|
|
||||||
shape.input_image_size = GetDimsSize(X);
|
shape.input_image_size = GetDimsSize(X);
|
||||||
shape.output_image_size = GetDimsSize(*Y);
|
shape.output_image_size = GetDimsSize(*Y);
|
||||||
const std::vector<int> output_image_dims = GetDims(*Y);
|
const std::vector<int> output_image_dims = GetDims(*Y);
|
||||||
|
|
@ -109,6 +109,7 @@ template <typename T, class Context>
|
||||||
bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||||
const auto& X = Input(INPUT);
|
const auto& X = Input(INPUT);
|
||||||
const auto& filter = Input(FILTER);
|
const auto& filter = Input(FILTER);
|
||||||
|
auto* Y = Output(0);
|
||||||
CAFFE_ENFORCE_EQ(
|
CAFFE_ENFORCE_EQ(
|
||||||
kernel_.size(),
|
kernel_.size(),
|
||||||
2,
|
2,
|
||||||
|
|
@ -123,8 +124,7 @@ bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||||
CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 1), kernel_h());
|
CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 1), kernel_h());
|
||||||
CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 2), kernel_w());
|
CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 2), kernel_w());
|
||||||
CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 3), shape.C);
|
CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 3), shape.C);
|
||||||
auto sizes = ConvPoolOpBase<Context>::GetOutputSize(X, shape.M);
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, shape.M);
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
shape.input_image_size = GetDimsSize(X);
|
shape.input_image_size = GetDimsSize(X);
|
||||||
shape.output_image_size = GetDimsSize(*Y);
|
shape.output_image_size = GetDimsSize(*Y);
|
||||||
|
|
|
||||||
|
|
@ -13,8 +13,8 @@ struct LpPoolFunctor {
|
||||||
template <>
|
template <>
|
||||||
bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
|
bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1));
|
auto* Y = Output(0);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));
|
||||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||||
const auto inv_p = 1.0 / p;
|
const auto inv_p = 1.0 / p;
|
||||||
|
|
||||||
|
|
@ -59,11 +59,11 @@ bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
|
||||||
template <>
|
template <>
|
||||||
bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
|
bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
int height = X.dim32(1);
|
int height = X.dim32(1);
|
||||||
int width = X.dim32(2);
|
int width = X.dim32(2);
|
||||||
int channels = X.dim32(3);
|
int channels = X.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase::GetOutputSize(X, channels);
|
ConvPoolOpBase::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
|
|
||||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||||
const auto inv_p = 1.0 / p;
|
const auto inv_p = 1.0 / p;
|
||||||
|
|
|
||||||
|
|
@ -215,9 +215,8 @@ __global__ void LpPoolBackwardNHWC(
|
||||||
template <>
|
template <>
|
||||||
bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
|
bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
|
auto* Y = Output(0);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||||
|
|
||||||
int output_size = Y->size();
|
int output_size = Y->size();
|
||||||
LpPoolForwardNCHW<float>
|
LpPoolForwardNCHW<float>
|
||||||
<<<CAFFE_GET_BLOCKS(output_size),
|
<<<CAFFE_GET_BLOCKS(output_size),
|
||||||
|
|
@ -246,9 +245,8 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
|
||||||
template <>
|
template <>
|
||||||
bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
|
bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(3));
|
auto* Y = Output(0);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
|
||||||
|
|
||||||
int output_size = Y->size();
|
int output_size = Y->size();
|
||||||
LpPoolForwardNHWC<float>
|
LpPoolForwardNHWC<float>
|
||||||
<<<CAFFE_GET_BLOCKS(output_size),
|
<<<CAFFE_GET_BLOCKS(output_size),
|
||||||
|
|
|
||||||
|
|
@ -108,11 +108,10 @@ __global__ void MaxPoolBackward(
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool MaxPoolWithIndexOp::DoRunWithType() {
|
bool MaxPoolWithIndexOp::DoRunWithType() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
auto* mask = Output(1);
|
auto* mask = Output(1);
|
||||||
|
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
int output_size = Y->size();
|
int output_size = Y->size();
|
||||||
mask->Resize(output_size);
|
mask->Resize(output_size);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,11 +22,11 @@ using std::max;
|
||||||
template <>
|
template <>
|
||||||
bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
int channels = X.dim32(1);
|
int channels = X.dim32(1);
|
||||||
int height = X.dim32(2);
|
int height = X.dim32(2);
|
||||||
int width = X.dim32(3);
|
int width = X.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase::GetOutputSize(X, channels);
|
ConvPoolOpBase::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
|
|
||||||
const float* Xdata = X.data<float>();
|
const float* Xdata = X.data<float>();
|
||||||
float* Ydata = Y->template mutable_data<float>();
|
float* Ydata = Y->template mutable_data<float>();
|
||||||
|
|
@ -160,11 +160,11 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||||
template <>
|
template <>
|
||||||
bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
int height = X.dim32(1);
|
int height = X.dim32(1);
|
||||||
int width = X.dim32(2);
|
int width = X.dim32(2);
|
||||||
int channels = X.dim32(3);
|
int channels = X.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase::GetOutputSize(X, channels);
|
ConvPoolOpBase::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
const float* Xdata = X.data<float>();
|
const float* Xdata = X.data<float>();
|
||||||
float* Ydata = Y->template mutable_data<float>();
|
float* Ydata = Y->template mutable_data<float>();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -251,13 +251,12 @@ __global__ void PadImageGradientEdgeNHWC(
|
||||||
template <>
|
template <>
|
||||||
bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int num = X.dim32(0);
|
const int num = X.dim32(0);
|
||||||
const int channels = X.dim32(1);
|
const int channels = X.dim32(1);
|
||||||
const int height = X.dim32(2);
|
const int height = X.dim32(2);
|
||||||
const int width = X.dim32(3);
|
const int width = X.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, channels);
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
|
|
||||||
const int output_size = Y->size();
|
const int output_size = Y->size();
|
||||||
const int padded_height = Y->dim32(2);
|
const int padded_height = Y->dim32(2);
|
||||||
const int padded_width = Y->dim32(3);
|
const int padded_width = Y->dim32(3);
|
||||||
|
|
@ -328,13 +327,12 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||||
template<>
|
template<>
|
||||||
bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int num = X.dim32(0);
|
const int num = X.dim32(0);
|
||||||
const int height = X.dim32(1);
|
const int height = X.dim32(1);
|
||||||
const int width = X.dim32(2);
|
const int width = X.dim32(2);
|
||||||
const int channels = X.dim32(3);
|
const int channels = X.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, channels);
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
|
|
||||||
const int output_size = Y->size();
|
const int output_size = Y->size();
|
||||||
const int padded_height = Y->dim32(1);
|
const int padded_height = Y->dim32(1);
|
||||||
const int padded_width = Y->dim32(2);
|
const int padded_width = Y->dim32(2);
|
||||||
|
|
@ -405,7 +403,7 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||||
template<>
|
template<>
|
||||||
bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||||
auto& dY = Input(0);
|
auto& dY = Input(0);
|
||||||
|
|
||||||
auto* dX = Output(0, { dY.dim32(0),
|
auto* dX = Output(0, { dY.dim32(0),
|
||||||
dY.dim32(1),
|
dY.dim32(1),
|
||||||
dY.dim32(2) - pad_t() - pad_b(),
|
dY.dim32(2) - pad_t() - pad_b(),
|
||||||
|
|
@ -485,7 +483,7 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||||
template<>
|
template<>
|
||||||
bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||||
auto& dY = Input(0);
|
auto& dY = Input(0);
|
||||||
|
|
||||||
auto* dX = Output(0, { dY.dim32(0),
|
auto* dX = Output(0, { dY.dim32(0),
|
||||||
dY.dim32(1) - pad_t() - pad_b(),
|
dY.dim32(1) - pad_t() - pad_b(),
|
||||||
dY.dim32(2) - pad_l() - pad_r(),
|
dY.dim32(2) - pad_l() - pad_r(),
|
||||||
|
|
|
||||||
|
|
@ -36,10 +36,10 @@ class PoolOp final : public ConvPoolOpBase<Context> {
|
||||||
|
|
||||||
bool RunOnDeviceWithOrderNCHW() override {
|
bool RunOnDeviceWithOrderNCHW() override {
|
||||||
const auto& X = Input(0);
|
const auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int N = X.dim32(0);
|
const int N = X.dim32(0);
|
||||||
const int C = X.dim32(1);
|
const int C = X.dim32(1);
|
||||||
auto sizes = ConvPoolOpBase<Context>::GetOutputSize(X, C);
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, C);
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
const T* X_data = X.template data<T>();
|
const T* X_data = X.template data<T>();
|
||||||
T* Y_data = Y->template mutable_data<T>();
|
T* Y_data = Y->template mutable_data<T>();
|
||||||
if (global_pooling_) {
|
if (global_pooling_) {
|
||||||
|
|
@ -65,11 +65,11 @@ class PoolOp final : public ConvPoolOpBase<Context> {
|
||||||
|
|
||||||
bool RunOnDeviceWithOrderNHWC() override {
|
bool RunOnDeviceWithOrderNHWC() override {
|
||||||
const auto& X = Input(0);
|
const auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int ndim = X.ndim();
|
const int ndim = X.ndim();
|
||||||
const int N = X.dim32(0);
|
const int N = X.dim32(0);
|
||||||
const int C = X.dim32(ndim - 1);
|
const int C = X.dim32(ndim - 1);
|
||||||
auto sizes = ConvPoolOpBase<Context>::GetOutputSize(X, C);
|
ConvPoolOpBase<Context>::SetOutputSize(X, Y, C);
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
const T* X_data = X.template data<T>();
|
const T* X_data = X.template data<T>();
|
||||||
T* Y_data = Y->template mutable_data<T>();
|
T* Y_data = Y->template mutable_data<T>();
|
||||||
if (global_pooling_) {
|
if (global_pooling_) {
|
||||||
|
|
|
||||||
|
|
@ -99,11 +99,11 @@ class CuDNNPoolOp final : public ConvPoolOpBase<CUDAContext> {
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool DoRunWithType() {
|
bool DoRunWithType() {
|
||||||
const auto& X = Input(0);
|
const auto& X = Input(0);
|
||||||
|
auto* Y = Output(0);
|
||||||
const int ndim = X.ndim();
|
const int ndim = X.ndim();
|
||||||
const int N = X.dim32(0);
|
const int N = X.dim32(0);
|
||||||
const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
|
const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
|
||||||
auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, C);
|
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, C);
|
||||||
auto* Y = Output(0, sizes, at::dtype<T>());
|
|
||||||
const T* X_data = X.template data<T>();
|
const T* X_data = X.template data<T>();
|
||||||
T* Y_data = Y->template mutable_data<T>();
|
T* Y_data = Y->template mutable_data<T>();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,8 +44,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
|
|
||||||
CHECK_EQ(X.t.dim(), 4);
|
CHECK_EQ(X.t.dim(), 4);
|
||||||
const int channels = X.t.dim32(3);
|
const int channels = X.t.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X.t, channels);
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), channels);
|
||||||
ReinitializeTensor(&(Y->t), sizes, at::dtype<uint8_t>().device(CPU));
|
|
||||||
|
|
||||||
initQNNPACK();
|
initQNNPACK();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,8 +43,7 @@ class Int8ConvOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
this->template GetSingleArgument<int>("Y_zero_point", 0);
|
this->template GetSingleArgument<int>("Y_zero_point", 0);
|
||||||
double Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
|
double Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
|
||||||
|
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X.t, W.t.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), W.t.dim32(0));
|
||||||
ReinitializeTensor(&(Y->t), sizes, at::dtype<uint8_t>().device(CPU));
|
|
||||||
Y->scale = Y_scale;
|
Y->scale = Y_scale;
|
||||||
Y->zero_point = Y_offset;
|
Y->zero_point = Y_offset;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -42,8 +42,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
|
|
||||||
CHECK_EQ(X.t.dim(), 4);
|
CHECK_EQ(X.t.dim(), 4);
|
||||||
const int channels = X.t.dim32(3);
|
const int channels = X.t.dim32(3);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X.t, channels);
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), channels);
|
||||||
ReinitializeTensor(&(Y->t), sizes, at::dtype<uint8_t>().device(CPU));
|
|
||||||
|
|
||||||
initQNNPACK();
|
initQNNPACK();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -102,8 +102,8 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
|
||||||
const Tensor& X = InputTensorCPU_(INPUT);
|
const Tensor& X = InputTensorCPU_(INPUT);
|
||||||
int N = X.dim32(0);
|
int N = X.dim32(0);
|
||||||
|
|
||||||
auto sizes = this->GetOutputSize(X, filter.dim32(0));
|
Tensor* Y = OutputTensorCPU_(0);
|
||||||
Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
|
this->SetOutputSize(X, Y, filter.dim32(0));
|
||||||
const int output_image_size = this->GetDimsSize(*Y);
|
const int output_image_size = this->GetDimsSize(*Y);
|
||||||
|
|
||||||
if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) {
|
if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) {
|
||||||
|
|
@ -228,6 +228,7 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
|
||||||
|
|
||||||
const Tensor& X = InputTensorCPU_(INPUT);
|
const Tensor& X = InputTensorCPU_(INPUT);
|
||||||
auto& filter = InputTensorCPU_(FILTER);
|
auto& filter = InputTensorCPU_(FILTER);
|
||||||
|
Tensor* Y = OutputTensorCPU_(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(1);
|
const int N = X.dim32(0), C = X.dim32(1);
|
||||||
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
|
|
@ -245,8 +246,7 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
|
||||||
0,
|
0,
|
||||||
"The number of output channels is not divisible by group.");
|
"The number of output channels is not divisible by group.");
|
||||||
|
|
||||||
auto sizes = this->GetOutputSize(X, filter.dim32(0));
|
this->SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
|
|
||||||
|
|
||||||
const vector<int> input_dims = GetDims(X);
|
const vector<int> input_dims = GetDims(X);
|
||||||
const vector<int> output_dims = GetDims(*Y);
|
const vector<int> output_dims = GetDims(*Y);
|
||||||
|
|
@ -618,14 +618,14 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
|
||||||
|
|
||||||
const Tensor& X = InputTensorCPU_(INPUT);
|
const Tensor& X = InputTensorCPU_(INPUT);
|
||||||
auto& filter = InputTensorCPU_(FILTER);
|
auto& filter = InputTensorCPU_(FILTER);
|
||||||
|
Tensor* Y = OutputTensorCPU_(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(X.ndim() - 1);
|
const int N = X.dim32(0), C = X.dim32(X.ndim() - 1);
|
||||||
|
|
||||||
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
|
CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
|
||||||
|
|
||||||
auto sizes = this->GetOutputSize(X, filter.dim32(0));
|
this->SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
|
|
||||||
// The dimension of each kernel
|
// The dimension of each kernel
|
||||||
const int kernel_dim = this->KernelDim_();
|
const int kernel_dim = this->KernelDim_();
|
||||||
// The output image size is the spatial size of the output.
|
// The output image size is the spatial size of the output.
|
||||||
|
|
|
||||||
|
|
@ -559,6 +559,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() {
|
||||||
|
|
||||||
const Tensor& X = InputTensorCPU_(INPUT);
|
const Tensor& X = InputTensorCPU_(INPUT);
|
||||||
auto& filter = InputTensorCPU_(FILTER);
|
auto& filter = InputTensorCPU_(FILTER);
|
||||||
|
Tensor* Y = OutputTensorCPU_(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(1);
|
const int N = X.dim32(0), C = X.dim32(1);
|
||||||
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
|
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
|
|
@ -576,8 +577,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() {
|
||||||
0,
|
0,
|
||||||
"The number of output channels is not divisible by group.");
|
"The number of output channels is not divisible by group.");
|
||||||
|
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
const vector<int> input_dims = GetDims(X);
|
const vector<int> input_dims = GetDims(X);
|
||||||
const vector<int> output_dims = GetDims(*Y);
|
const vector<int> output_dims = GetDims(*Y);
|
||||||
|
|
@ -1417,6 +1417,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() {
|
||||||
|
|
||||||
const Tensor& X = InputTensorCPU_(INPUT);
|
const Tensor& X = InputTensorCPU_(INPUT);
|
||||||
auto& filter = InputTensorCPU_(FILTER);
|
auto& filter = InputTensorCPU_(FILTER);
|
||||||
|
Tensor* Y = OutputTensorCPU_(0);
|
||||||
const int C = X.dim32(X.dim() - 1);
|
const int C = X.dim32(X.dim() - 1);
|
||||||
const int G = group_;
|
const int G = group_;
|
||||||
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
|
CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
|
||||||
|
|
@ -1433,8 +1434,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() {
|
||||||
CAFFE_ENFORCE_EQ(
|
CAFFE_ENFORCE_EQ(
|
||||||
M % G, 0, "The number of output channels is not divisible by group.");
|
M % G, 0, "The number of output channels is not divisible by group.");
|
||||||
|
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
// The col buffer is stored in HWC order as well - kernel_dim, and the height
|
// The col buffer is stored in HWC order as well - kernel_dim, and the height
|
||||||
// and width.
|
// and width.
|
||||||
|
|
|
||||||
|
|
@ -61,12 +61,6 @@ class ConvPoolDNNLowPOpBase : public ConvPoolOpBase<CPUContext> {
|
||||||
return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
|
return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor* OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) {
|
|
||||||
auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
|
|
||||||
ReinitializeTensor(t, dims, options.device(CPU));
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
|
|
||||||
T* GetQuantizedOutputData_() {
|
T* GetQuantizedOutputData_() {
|
||||||
return OutputTensorCPU_(0)->template mutable_data<T>();
|
return OutputTensorCPU_(0)->template mutable_data<T>();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -115,16 +115,6 @@ class DNNLowPOp : public Operator<CPUContext> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor* OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) {
|
|
||||||
if (dequantize_output_) {
|
|
||||||
return Output(idx, dims, options.device(CPU));
|
|
||||||
} else {
|
|
||||||
auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
|
|
||||||
ReinitializeTensor(t, dims, options.device(CPU));
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
T* GetQuantizedOutputData_() {
|
T* GetQuantizedOutputData_() {
|
||||||
if (dequantize_output_) {
|
if (dequantize_output_) {
|
||||||
out_temp_.resize(Output(0)->numel());
|
out_temp_.resize(Output(0)->numel());
|
||||||
|
|
|
||||||
|
|
@ -100,8 +100,8 @@ class AveragePoolDnnLowPOp final
|
||||||
GetOutputQuantizationParams_();
|
GetOutputQuantizationParams_();
|
||||||
|
|
||||||
auto& X = InputTensorCPU_(0);
|
auto& X = InputTensorCPU_(0);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
|
auto* Y = OutputTensorCPU_(0);
|
||||||
auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||||
|
|
||||||
T* Ydata = GetQuantizedOutputData_();
|
T* Ydata = GetQuantizedOutputData_();
|
||||||
|
|
||||||
|
|
@ -238,9 +238,9 @@ class AveragePoolDnnLowPOp final
|
||||||
GetOutputQuantizationParams_();
|
GetOutputQuantizationParams_();
|
||||||
|
|
||||||
auto& X = InputTensorCPU_(0);
|
auto& X = InputTensorCPU_(0);
|
||||||
|
auto* Y = OutputTensorCPU_(0);
|
||||||
int channels = X.dim32(X.ndim() - 1);
|
int channels = X.dim32(X.ndim() - 1);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels);
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
T* Ydata = GetQuantizedOutputData_();
|
T* Ydata = GetQuantizedOutputData_();
|
||||||
|
|
||||||
|
|
@ -397,8 +397,8 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
|
||||||
const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
|
const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
|
||||||
|
|
||||||
auto& X = InputTensorCPU_(0);
|
auto& X = InputTensorCPU_(0);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
|
auto* Y = OutputTensorCPU_(0);
|
||||||
auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||||
|
|
||||||
T* Ydata = GetQuantizedOutputData_();
|
T* Ydata = GetQuantizedOutputData_();
|
||||||
|
|
||||||
|
|
@ -543,9 +543,9 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
|
||||||
const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
|
const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
|
||||||
|
|
||||||
auto& X = InputTensorCPU_(0);
|
auto& X = InputTensorCPU_(0);
|
||||||
|
auto* Y = OutputTensorCPU_(0);
|
||||||
int channels = X.dim32(X.ndim() - 1);
|
int channels = X.dim32(X.ndim() - 1);
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels);
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, channels);
|
||||||
auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
|
|
||||||
|
|
||||||
T* Ydata = GetQuantizedOutputData_();
|
T* Ydata = GetQuantizedOutputData_();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -442,6 +442,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
bool RunOnDeviceWithOrderNCHW() override {
|
bool RunOnDeviceWithOrderNCHW() override {
|
||||||
const Tensor& X = Input(0);
|
const Tensor& X = Input(0);
|
||||||
auto& filter = Input(1);
|
auto& filter = Input(1);
|
||||||
|
Tensor* Y = Output(0);
|
||||||
const int N = X.dim32(0), C = X.dim32(1);
|
const int N = X.dim32(0), C = X.dim32(1);
|
||||||
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
|
||||||
const int M = filter.dim32(0);
|
const int M = filter.dim32(0);
|
||||||
|
|
@ -451,8 +452,8 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
|
||||||
CAFFE_ENFORCE_EQ(C, this->group_);
|
CAFFE_ENFORCE_EQ(C, this->group_);
|
||||||
CAFFE_ENFORCE_EQ(M, this->group_);
|
CAFFE_ENFORCE_EQ(M, this->group_);
|
||||||
|
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = Output(0, sizes, at::dtype<float>());
|
Y->mutable_data<float>();
|
||||||
|
|
||||||
DepthwiseArgs args;
|
DepthwiseArgs args;
|
||||||
args.batch = X.dim32(0);
|
args.batch = X.dim32(0);
|
||||||
|
|
|
||||||
|
|
@ -147,8 +147,10 @@ NNPACKConvOp::getActivationType() const {
|
||||||
bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
|
bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
|
||||||
/* Global variable with a unique ID of the pre-transformed kernel blob */
|
/* Global variable with a unique ID of the pre-transformed kernel blob */
|
||||||
volatile static uint32_t precomputed_transform_id = 0;
|
volatile static uint32_t precomputed_transform_id = 0;
|
||||||
|
|
||||||
auto& X = Input(0);
|
auto& X = Input(0);
|
||||||
auto& filter = Input(1);
|
auto& filter = Input(1);
|
||||||
|
auto* Y = Output(0);
|
||||||
CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
|
CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
|
||||||
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
||||||
CAFFE_ENFORCE(filter.ndim() == 4, "");
|
CAFFE_ENFORCE(filter.ndim() == 4, "");
|
||||||
|
|
@ -158,8 +160,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
|
||||||
CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
|
CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
|
||||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
|
CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
|
||||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
|
CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
|
||||||
auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
|
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||||
Tensor* Y = Output(0, sizes, at::dtype<float>());
|
|
||||||
const int oH = Y->dim32(2), oW = Y->dim32(3);
|
const int oH = Y->dim32(2), oW = Y->dim32(3);
|
||||||
|
|
||||||
const float* biasData = NULL;
|
const float* biasData = NULL;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user