diff --git a/caffe2/contrib/nnpack/nnpack_ops.cc b/caffe2/contrib/nnpack/nnpack_ops.cc index e6e82f48290..23e52a152ea 100644 --- a/caffe2/contrib/nnpack/nnpack_ops.cc +++ b/caffe2/contrib/nnpack/nnpack_ops.cc @@ -119,6 +119,7 @@ class NNPACKConvOp final : public ConvPoolOpBase { auto& X = Input(0); auto& filter = Input(1); auto& bias = Input(2); + auto* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); const int M = filter.dim32(0); @@ -132,8 +133,7 @@ class NNPACKConvOp final : public ConvPoolOpBase { CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), ""); CAFFE_ENFORCE(bias.numel() == M, ""); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); const int oH = Y->dim32(2), oW = Y->dim32(3); if (N > 1) { @@ -250,10 +250,10 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase { bool RunOnDeviceWithOrderNCHW() override { auto& X = Input(0); + auto* Y = Output(0); CAFFE_ENFORCE(X.dim() == 4, ""); const int H = X.dim32(2), W = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); std::vector pads( {this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()}); std::vector stride({this->stride_h(), this->stride_w()}); diff --git a/caffe2/cuda_rtc/pool_op_rtc_gpu.cc b/caffe2/cuda_rtc/pool_op_rtc_gpu.cc index ab52d0a070f..c27e7e57c05 100644 --- a/caffe2/cuda_rtc/pool_op_rtc_gpu.cc +++ b/caffe2/cuda_rtc/pool_op_rtc_gpu.cc @@ -196,8 +196,8 @@ class MaxPoolRTCOp final : public ConvPoolOpBase { bool RunOnDeviceWithOrderNCHW() override { auto& X = Input(0); - auto output_sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, output_sizes, at::dtype()); + auto* Y = Output(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); if (input_dims_ != X.sizes()) { // recompile diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 7409c273e0f..4cd912e2f2a 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -257,10 +257,11 @@ void computeOutputHW( int* OH, int* OW) { Tensor input = caffe2::empty({1, 1, H, W}, at::dtype().device(CPU)); - auto sizes = op->GetOutputSize(input, 1); - CAFFE_ENFORCE_EQ(sizes.size(), 4); - *OH = sizes[2]; - *OW = sizes[3]; + Tensor output(CPU); + op->SetOutputSize(input, &output, 1); + CAFFE_ENFORCE_EQ(output.dim(), 4); + *OH = output.size(2); + *OW = output.size(3); } constexpr int computeMPSAlignOffset(int kernel, int pad) { diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index 4e7c081b768..9c965368490 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -516,13 +516,13 @@ template bool CudnnConvOp::DoRunWithType() { auto& X = Input(INPUT); auto& filter = Input(FILTER); + auto* Y = Output(0); // Figure out the output shape CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5); CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5); const int M = filter.dim32(0); - auto output_sizes = ConvPoolOpBase::GetOutputSize(X, M); - auto* Y = Output(0, output_sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, M); int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0; int group_offset_X = 0, group_offset_Y = 0; diff --git a/caffe2/operators/conv_op_eigen.cc b/caffe2/operators/conv_op_eigen.cc index 5a0412396a1..c68de091152 100644 --- a/caffe2/operators/conv_op_eigen.cc +++ b/caffe2/operators/conv_op_eigen.cc @@ -34,14 +34,14 @@ template bool EigenConvOp::RunOnDeviceWithOrderNCHW() { auto& X = Input(INPUT); auto& filter = Input(FILTER); + auto* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); CAFFE_ENFORCE(4 == filter.dim()); const int M = filter.dim32(0); CAFFE_ENFORCE(filter.dim32(1) == C); CAFFE_ENFORCE(filter.dim32(2) == kernel_h()); CAFFE_ENFORCE(filter.dim32(3) == kernel_w()); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); Eigen::array kernel_shuffles { {int64_t(2), int64_t(3), int64_t(1), int64_t(0)} }; Eigen::array input_shuffles @@ -128,14 +128,14 @@ template bool EigenConvOp::RunOnDeviceWithOrderNHWC() { auto& X = Input(INPUT); auto& filter = Input(FILTER); + auto* Y = Output(0); const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3); CAFFE_ENFORCE(4 == filter.dim()); const int M = filter.dim32(0); CAFFE_ENFORCE(filter.dim32(1) == kernel_h()); CAFFE_ENFORCE(filter.dim32(2) == kernel_w()); CAFFE_ENFORCE(filter.dim32(3) == C); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); // Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for // optimization purposes, so we will create a temp one. Eigen::Array temp_filter( diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h index fc5300f8dc8..29d98c0b788 100644 --- a/caffe2/operators/conv_op_impl.h +++ b/caffe2/operators/conv_op_impl.h @@ -21,6 +21,7 @@ template bool ConvOp::RunOnDeviceWithOrderNCHW() { const auto& X = Input(INPUT); const auto& filter = Input(FILTER); + auto* Y = Output(0); const int N = X.dim32(0); const int C = X.dim32(1); const int G = group_; @@ -43,8 +44,7 @@ bool ConvOp::RunOnDeviceWithOrderNCHW() { CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); kernel_size *= kernel_[i]; } - auto output_sizes = ConvPoolOpBase::GetOutputSize(X, M); - auto* Y = Output(0, output_sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, M); const vector X_dims = GetDims(X); const vector Y_dims = GetDims(*Y); const int X_HxW = X.numel() / (N * C); @@ -190,6 +190,7 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { "Only 1-3d convolution is supported for NHWC storage type"); const Tensor& X = Input(INPUT); const auto& filter = Input(FILTER); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(X.dim() - 1); const int G = group_; CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); @@ -211,8 +212,7 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); kernel_size *= kernel_[i]; } - auto output_sizes = ConvPoolOpBase::GetOutputSize(X, M); - auto* Y = Output(0, output_sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, M); const vector Y_dims = GetDims(*Y); const int X_HxW = X.numel() / (N * C); const int Y_HxW = Y->numel() / (N * M); diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index 392d813dcf4..b591d3dd46d 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -207,7 +207,7 @@ class ConvPoolOpBase : public Operator { return size; } - // Gets the output size. The output channel is manually provided since + // Sets the output size. The output channel is manually provided since // it may not be identical to the input channels. // This function can be used in the forward functions to obtain the output // sizes. @@ -215,7 +215,8 @@ class ConvPoolOpBase : public Operator { // implementations that do not use first-class Tensor objects, such as the // MKL operator. One can still call this function with dummy // Tensor objects in order to obtain the sizes. - std::vector GetOutputSize(const Tensor& input, int output_channel) { + // TODO: passing sizes directly rather than Tensor + void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) { CAFFE_ENFORCE(input.numel() > 0); vector output_dims; int N = input.dim32(0); @@ -240,7 +241,7 @@ class ConvPoolOpBase : public Operator { output_dims.insert(output_dims.begin(), N); output_dims.push_back(output_channel); } - return std::vector(output_dims.cbegin(), output_dims.cend()); + output->Resize(output_dims); } // Helper function that is also called from OperatorSchema. Modified diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 9bf9df5132d..94dea27f643 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -17,6 +17,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { const Tensor& X = Input(INPUT); const Tensor& offset = Input(OFFSET); auto& filter = Input(FILTER); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.dim(), filter.ndim()); const int M = filter.dim32(0); @@ -81,8 +82,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { kernel_dims_size *= kernel_[i]; } - auto output_sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, output_sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); const vector input_dims = GetDims(X); const vector output_dims = GetDims(*Y); @@ -196,8 +196,8 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { auto& offset = Input(OFFSET); auto& filter = Input(FILTER); auto& dY = Input(OUTPUT_GRAD); - - + + const int N = X.dim32(0), C = X.dim32(1); const vector input_dims = this->GetDims(X); @@ -303,7 +303,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { T* dbias_data = nullptr; if (!no_bias_) { - + auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype()); if (bias_multiplier_.size() != output_image_size) { // If the helper bias multiplier is not M, reshape and fill it with one. @@ -323,7 +323,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { T* dXdata = nullptr; if (OutputSize() == 4 || (no_bias_ && (OutputSize() == 3))) { - + auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype()); dXdata = dX->template mutable_data(); math::Set(dX->size(), 0, dXdata, &context_); diff --git a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu index f4b530ea7f4..beefcfdd665 100644 --- a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu +++ b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu @@ -288,6 +288,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase { bool RunOnDeviceWithOrderNCHW() override { const Tensor& X = Input(0); auto& filter = Input(1); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); @@ -299,8 +300,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase { CAFFE_ENFORCE_EQ(this->kernel_w(), 3); CAFFE_ENFORCE_EQ(this->kernel_h(), 3); CAFFE_ENFORCE_EQ(this->stride_h(), this->stride_w()); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); DepthwiseArgs args; args.batch = X.dim32(0); args.in_rows = X.dim32(2); @@ -455,7 +455,7 @@ class Depthwise3x3ConvGradientOp final : public ConvPoolOpBase { M, dY.dim32(2), dY.dim32(3))); - + auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype()); CUDNN_ENFORCE(cudnnConvolutionBackwardBias( cudnn_wrapper_.inline_cudnn_handle(), diff --git a/caffe2/operators/hip/conv_op_miopen.hip b/caffe2/operators/hip/conv_op_miopen.hip index fe4320bc093..376b49aa2ad 100644 --- a/caffe2/operators/hip/conv_op_miopen.hip +++ b/caffe2/operators/hip/conv_op_miopen.hip @@ -205,6 +205,7 @@ template bool MIOPENConvOp::DoRunWithType() { auto& X = Input(INPUT); auto& Weight = Input(FILTER); + auto* Y = Output(0); // Figure out the output shape CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5); @@ -213,8 +214,7 @@ bool MIOPENConvOp::DoRunWithType() { "Conv op with MIOpen engine is supported only for 2D convolutions"); const int M = Weight.dim32(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, M); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, M); int N = X.dim32(0); int C = X.dim32(1); diff --git a/caffe2/operators/hip/pool_op_miopen.hip b/caffe2/operators/hip/pool_op_miopen.hip index c1d5ee387b3..614b6cf09bc 100644 --- a/caffe2/operators/hip/pool_op_miopen.hip +++ b/caffe2/operators/hip/pool_op_miopen.hip @@ -61,6 +61,7 @@ class MIOPENPoolOp : public ConvPoolOpBase { template bool DoRunWithType() { auto& X = Input(0); + auto* Y = Output(0); int N = 0, C = 0, H = 0, W = 0, D = 0; int N_out = 0, C_out = 0, H_out = 0, W_out = 0; CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5); @@ -68,8 +69,7 @@ class MIOPENPoolOp : public ConvPoolOpBase { C = X.dim32(1); H = X.dim32(2); W = X.ndim() > 3 ? X.dim32(3) : 1; - auto sizes = ConvPoolOpBase::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, C); N_out = Y->dim32(0); C_out = Y->dim32(1); diff --git a/caffe2/operators/locally_connected_op_impl.h b/caffe2/operators/locally_connected_op_impl.h index 60e641eb63e..df05cad403e 100644 --- a/caffe2/operators/locally_connected_op_impl.h +++ b/caffe2/operators/locally_connected_op_impl.h @@ -20,6 +20,7 @@ template bool LocallyConnectedOp::RunOnDeviceWithOrderNCHW() { const auto& X = Input(INPUT); const auto& filter = Input(FILTER); + auto* Y = Output(0); const int image_ndim = X.dim() - 2; CAFFE_ENFORCE_EQ(X.dim() + image_ndim, filter.dim()); lc_op_util::ShapeParams shape; @@ -40,8 +41,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNCHW() { 0, "The number of output channels is not divisible by group."); - auto output_sizes = ConvPoolOpBase::GetOutputSize(X, shape.M); - auto* Y = Output(0, output_sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, shape.M); shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); const std::vector output_image_dims = GetDims(*Y); @@ -109,6 +109,7 @@ template bool LocallyConnectedOp::RunOnDeviceWithOrderNHWC() { const auto& X = Input(INPUT); const auto& filter = Input(FILTER); + auto* Y = Output(0); CAFFE_ENFORCE_EQ( kernel_.size(), 2, @@ -123,8 +124,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 1), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 2), kernel_w()); CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 3), shape.C); - auto sizes = ConvPoolOpBase::GetOutputSize(X, shape.M); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, shape.M); shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc index cfe9e125334..2b6aef179c8 100644 --- a/caffe2/operators/lp_pool_op.cc +++ b/caffe2/operators/lp_pool_op.cc @@ -13,8 +13,8 @@ struct LpPoolFunctor { template <> bool PoolOp::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype()); + auto* Y = Output(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); const auto p = OperatorBase::GetSingleArgument("p", 2.0); const auto inv_p = 1.0 / p; @@ -59,11 +59,11 @@ bool PoolOp::RunOnDeviceWithOrderNCHW() { template <> bool PoolOp::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); + auto* Y = Output(0); int height = X.dim32(1); int width = X.dim32(2); int channels = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); const auto p = OperatorBase::GetSingleArgument("p", 2.0); const auto inv_p = 1.0 / p; diff --git a/caffe2/operators/lp_pool_op.cu b/caffe2/operators/lp_pool_op.cu index 8380be1b923..1547b002f3c 100644 --- a/caffe2/operators/lp_pool_op.cu +++ b/caffe2/operators/lp_pool_op.cu @@ -215,9 +215,8 @@ __global__ void LpPoolBackwardNHWC( template <> bool PoolOp::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype()); - + auto* Y = Output(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); int output_size = Y->size(); LpPoolForwardNCHW <<::RunOnDeviceWithOrderNCHW() { template <> bool PoolOp::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(3)); - auto* Y = Output(0, sizes, at::dtype()); - + auto* Y = Output(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(3)); int output_size = Y->size(); LpPoolForwardNHWC << bool MaxPoolWithIndexOp::DoRunWithType() { auto& X = Input(0); + auto* Y = Output(0); auto* mask = Output(1); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype()); - + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); int output_size = Y->size(); mask->Resize(output_size); diff --git a/caffe2/operators/pad_op.cc b/caffe2/operators/pad_op.cc index 3813fc29c69..2dca106e59d 100644 --- a/caffe2/operators/pad_op.cc +++ b/caffe2/operators/pad_op.cc @@ -22,11 +22,11 @@ using std::max; template <> bool PadImageOp::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); + auto* Y = Output(0); int channels = X.dim32(1); int height = X.dim32(2); int width = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); const float* Xdata = X.data(); float* Ydata = Y->template mutable_data(); @@ -160,11 +160,11 @@ bool PadImageOp::RunOnDeviceWithOrderNCHW() { template <> bool PadImageOp::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); + auto* Y = Output(0); int height = X.dim32(1); int width = X.dim32(2); int channels = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); const float* Xdata = X.data(); float* Ydata = Y->template mutable_data(); diff --git a/caffe2/operators/pad_op_gpu.cu b/caffe2/operators/pad_op_gpu.cu index a9fcb879079..b36a6b25a31 100644 --- a/caffe2/operators/pad_op_gpu.cu +++ b/caffe2/operators/pad_op_gpu.cu @@ -251,13 +251,12 @@ __global__ void PadImageGradientEdgeNHWC( template <> bool PadImageOp::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); + auto* Y = Output(0); const int num = X.dim32(0); const int channels = X.dim32(1); const int height = X.dim32(2); const int width = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype()); - + ConvPoolOpBase::SetOutputSize(X, Y, channels); const int output_size = Y->size(); const int padded_height = Y->dim32(2); const int padded_width = Y->dim32(3); @@ -328,13 +327,12 @@ bool PadImageOp::RunOnDeviceWithOrderNCHW() { template<> bool PadImageOp::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); + auto* Y = Output(0); const int num = X.dim32(0); const int height = X.dim32(1); const int width = X.dim32(2); const int channels = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype()); - + ConvPoolOpBase::SetOutputSize(X, Y, channels); const int output_size = Y->size(); const int padded_height = Y->dim32(1); const int padded_width = Y->dim32(2); @@ -405,7 +403,7 @@ bool PadImageOp::RunOnDeviceWithOrderNHWC() { template<> bool PadImageGradientOp::RunOnDeviceWithOrderNCHW() { auto& dY = Input(0); - + auto* dX = Output(0, { dY.dim32(0), dY.dim32(1), dY.dim32(2) - pad_t() - pad_b(), @@ -485,7 +483,7 @@ bool PadImageGradientOp::RunOnDeviceWithOrderNCHW() { template<> bool PadImageGradientOp::RunOnDeviceWithOrderNHWC() { auto& dY = Input(0); - + auto* dX = Output(0, { dY.dim32(0), dY.dim32(1) - pad_t() - pad_b(), dY.dim32(2) - pad_l() - pad_r(), diff --git a/caffe2/operators/pool_op.h b/caffe2/operators/pool_op.h index 004ec6a1ed5..909fe120caf 100644 --- a/caffe2/operators/pool_op.h +++ b/caffe2/operators/pool_op.h @@ -36,10 +36,10 @@ class PoolOp final : public ConvPoolOpBase { bool RunOnDeviceWithOrderNCHW() override { const auto& X = Input(0); + auto* Y = Output(0); const int N = X.dim32(0); const int C = X.dim32(1); - auto sizes = ConvPoolOpBase::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, C); const T* X_data = X.template data(); T* Y_data = Y->template mutable_data(); if (global_pooling_) { @@ -65,11 +65,11 @@ class PoolOp final : public ConvPoolOpBase { bool RunOnDeviceWithOrderNHWC() override { const auto& X = Input(0); + auto* Y = Output(0); const int ndim = X.ndim(); const int N = X.dim32(0); const int C = X.dim32(ndim - 1); - auto sizes = ConvPoolOpBase::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, C); const T* X_data = X.template data(); T* Y_data = Y->template mutable_data(); if (global_pooling_) { diff --git a/caffe2/operators/pool_op_cudnn.cc b/caffe2/operators/pool_op_cudnn.cc index 95bf2d55679..1ed723ca4ed 100644 --- a/caffe2/operators/pool_op_cudnn.cc +++ b/caffe2/operators/pool_op_cudnn.cc @@ -99,11 +99,11 @@ class CuDNNPoolOp final : public ConvPoolOpBase { template bool DoRunWithType() { const auto& X = Input(0); + auto* Y = Output(0); const int ndim = X.ndim(); const int N = X.dim32(0); const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1); - auto sizes = ConvPoolOpBase::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, C); const T* X_data = X.template data(); T* Y_data = Y->template mutable_data(); diff --git a/caffe2/operators/quantized/int8_average_pool_op.h b/caffe2/operators/quantized/int8_average_pool_op.h index e81829e49ef..1d542ce2301 100644 --- a/caffe2/operators/quantized/int8_average_pool_op.h +++ b/caffe2/operators/quantized/int8_average_pool_op.h @@ -44,8 +44,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase { CHECK_EQ(X.t.dim(), 4); const int channels = X.t.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X.t, channels); - ReinitializeTensor(&(Y->t), sizes, at::dtype().device(CPU)); + ConvPoolOpBase::SetOutputSize(X.t, &(Y->t), channels); initQNNPACK(); diff --git a/caffe2/operators/quantized/int8_conv_op.h b/caffe2/operators/quantized/int8_conv_op.h index 5f007f5654b..51afa7a38e7 100644 --- a/caffe2/operators/quantized/int8_conv_op.h +++ b/caffe2/operators/quantized/int8_conv_op.h @@ -43,8 +43,7 @@ class Int8ConvOp final : public ConvPoolOpBase { this->template GetSingleArgument("Y_zero_point", 0); double Y_scale = this->template GetSingleArgument("Y_scale", 1); - auto sizes = ConvPoolOpBase::GetOutputSize(X.t, W.t.dim32(0)); - ReinitializeTensor(&(Y->t), sizes, at::dtype().device(CPU)); + ConvPoolOpBase::SetOutputSize(X.t, &(Y->t), W.t.dim32(0)); Y->scale = Y_scale; Y->zero_point = Y_offset; diff --git a/caffe2/operators/quantized/int8_max_pool_op.h b/caffe2/operators/quantized/int8_max_pool_op.h index c1ab54da498..ce65bb49cc0 100644 --- a/caffe2/operators/quantized/int8_max_pool_op.h +++ b/caffe2/operators/quantized/int8_max_pool_op.h @@ -42,8 +42,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase { CHECK_EQ(X.t.dim(), 4); const int channels = X.t.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X.t, channels); - ReinitializeTensor(&(Y->t), sizes, at::dtype().device(CPU)); + ConvPoolOpBase::SetOutputSize(X.t, &(Y->t), channels); initQNNPACK(); diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc index b339e522403..b3713377bee 100644 --- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc +++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc @@ -102,8 +102,8 @@ bool ConvDNNLowPAcc16Op::GetQuantizationParameters_() { const Tensor& X = InputTensorCPU_(INPUT); int N = X.dim32(0); - auto sizes = this->GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype()); + Tensor* Y = OutputTensorCPU_(0); + this->SetOutputSize(X, Y, filter.dim32(0)); const int output_image_size = this->GetDimsSize(*Y); if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) { @@ -228,6 +228,7 @@ bool ConvDNNLowPAcc16Op::RunOnDeviceWithOrderNCHW() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); @@ -245,8 +246,7 @@ bool ConvDNNLowPAcc16Op::RunOnDeviceWithOrderNCHW() { 0, "The number of output channels is not divisible by group."); - auto sizes = this->GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype()); + this->SetOutputSize(X, Y, filter.dim32(0)); const vector input_dims = GetDims(X); const vector output_dims = GetDims(*Y); @@ -618,14 +618,14 @@ bool ConvDNNLowPAcc16Op::RunOnDeviceWithOrderNHWC() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int N = X.dim32(0), C = X.dim32(X.ndim() - 1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_); - auto sizes = this->GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype()); + this->SetOutputSize(X, Y, filter.dim32(0)); // The dimension of each kernel const int kernel_dim = this->KernelDim_(); // The output image size is the spatial size of the output. diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc index 366f235ba83..f4de3992155 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op.cc +++ b/caffe2/quantization/server/conv_dnnlowp_op.cc @@ -559,6 +559,7 @@ bool ConvDNNLowPOp::RunOnDeviceWithOrderNCHW() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); const int M = filter.dim32(0); @@ -576,8 +577,7 @@ bool ConvDNNLowPOp::RunOnDeviceWithOrderNCHW() { 0, "The number of output channels is not divisible by group."); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); const vector input_dims = GetDims(X); const vector output_dims = GetDims(*Y); @@ -1417,6 +1417,7 @@ bool ConvDNNLowPOp::RunOnDeviceWithOrderNHWC() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int C = X.dim32(X.dim() - 1); const int G = group_; CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); @@ -1433,8 +1434,7 @@ bool ConvDNNLowPOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ( M % G, 0, "The number of output channels is not divisible by group."); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); // The col buffer is stored in HWC order as well - kernel_dim, and the height // and width. diff --git a/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h b/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h index c82a2879f23..ef2df608144 100644 --- a/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h +++ b/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h @@ -61,12 +61,6 @@ class ConvPoolDNNLowPOpBase : public ConvPoolOpBase { return &Outputs()[idx]->template GetMutable()->t; } - Tensor* OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) { - auto* t = &Outputs()[idx]->template GetMutable()->t; - ReinitializeTensor(t, dims, options.device(CPU)); - return t; - } - T* GetQuantizedOutputData_() { return OutputTensorCPU_(0)->template mutable_data(); } diff --git a/caffe2/quantization/server/dnnlowp_op.h b/caffe2/quantization/server/dnnlowp_op.h index 715567abd35..d9414569a7f 100644 --- a/caffe2/quantization/server/dnnlowp_op.h +++ b/caffe2/quantization/server/dnnlowp_op.h @@ -115,16 +115,6 @@ class DNNLowPOp : public Operator { } } - Tensor* OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) { - if (dequantize_output_) { - return Output(idx, dims, options.device(CPU)); - } else { - auto* t = &Outputs()[idx]->template GetMutable()->t; - ReinitializeTensor(t, dims, options.device(CPU)); - return t; - } - } - T* GetQuantizedOutputData_() { if (dequantize_output_) { out_temp_.resize(Output(0)->numel()); diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc index 6b887f2350a..bbf60261139 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op.cc +++ b/caffe2/quantization/server/pool_dnnlowp_op.cc @@ -100,8 +100,8 @@ class AveragePoolDnnLowPOp final GetOutputQuantizationParams_(); auto& X = InputTensorCPU_(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype()); + auto* Y = OutputTensorCPU_(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); T* Ydata = GetQuantizedOutputData_(); @@ -238,9 +238,9 @@ class AveragePoolDnnLowPOp final GetOutputQuantizationParams_(); auto& X = InputTensorCPU_(0); + auto* Y = OutputTensorCPU_(0); int channels = X.dim32(X.ndim() - 1); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); T* Ydata = GetQuantizedOutputData_(); @@ -397,8 +397,8 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase { const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); auto& X = InputTensorCPU_(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype()); + auto* Y = OutputTensorCPU_(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); T* Ydata = GetQuantizedOutputData_(); @@ -543,9 +543,9 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase { const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); auto& X = InputTensorCPU_(0); + auto* Y = OutputTensorCPU_(0); int channels = X.dim32(X.ndim() - 1); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); T* Ydata = GetQuantizedOutputData_(); diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc index 33b1a983a26..7b1c662c6b9 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc @@ -442,6 +442,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase { bool RunOnDeviceWithOrderNCHW() override { const Tensor& X = Input(0); auto& filter = Input(1); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); @@ -451,8 +452,8 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase { CAFFE_ENFORCE_EQ(C, this->group_); CAFFE_ENFORCE_EQ(M, this->group_); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); + Y->mutable_data(); DepthwiseArgs args; args.batch = X.dim32(0); diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index dab207b02ff..67dd1804531 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -147,8 +147,10 @@ NNPACKConvOp::getActivationType() const { bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { /* Global variable with a unique ID of the pre-transformed kernel blob */ volatile static uint32_t precomputed_transform_id = 0; + auto& X = Input(0); auto& filter = Input(1); + auto* Y = Output(0); CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4"); const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); CAFFE_ENFORCE(filter.ndim() == 4, ""); @@ -158,8 +160,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, ""); CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), ""); CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), ""); - auto sizes = ConvPoolOpBase::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = Output(0, sizes, at::dtype()); + ConvPoolOpBase::SetOutputSize(X, Y, filter.dim32(0)); const int oH = Y->dim32(2), oW = Y->dim32(3); const float* biasData = NULL;