#pragma once

#ifdef _OPENMP
#include <omp.h>
#endif

#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/math/utils.h"

namespace caffe2 {

namespace math {
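
// Im2ColNCHW unrolls a single NCHW image into a column buffer of shape
// (channels * kernel_h * kernel_w) x (output_h * output_w) so that a
// convolution can be computed as one GEMM against the filter matrix.
// Positions that fall into the padding region are written as zero_point
// (0 by default; quantized callers may pass a nonzero value).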
template <typename T>
static void Im2ColNCHW(
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const T* data_im,
    T* data_col,
    CPUContext* /*context*/,
    const T& zero_point = 0) {
  const int output_h =
      (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
      1;
  const int output_w =
      (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
      1;

  // Fast path for zero padding and no dilation
  // From Torch, THNN_(unfolded_copy)
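  // With no padding or dilation, each (channel, kh, kw) row of the column
  // buffer is a strided crop of the source image, so when stride_w == 1 a
  // whole output row of output_w elements can be copied with one memcpy.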
  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
      pad_t == 0 && pad_b == 0) {
    for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
      const auto nip = k / (kernel_h * kernel_w);
      const auto rest = k % (kernel_h * kernel_w);
      const auto kh = rest / kernel_w;
      const auto kw = rest % kernel_w;
      auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
          kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
      const auto* src = data_im + nip * (height * width);
      for (const auto y : c10::irange(output_h)) {
        const auto iy = y * stride_h + kh;
        const auto ix = kw;
        if (stride_w == 1) {
          memcpy(
              dst + (y * output_w),
              src + (iy * width + ix),
              sizeof(T) * output_w);
        } else {
          for (const auto x : c10::irange(output_w)) {
            memcpy(
                dst + (y * output_w + x),
                src + (iy * width + ix + x * stride_w),
                sizeof(T));
          }
        }
      }
    }
    return;
  }

  // Fast path for equal padding
  if (pad_l == pad_r && pad_t == pad_b) {
    // From Intel, https://github.com/BVLC/caffe/pull/3536
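    // The nested loops below write data_col sequentially in
    // (channel, kernel_row, kernel_col, output_row, output_col) order,
    // emitting zero_point whenever the sampled input row or column falls
    // outside the image.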
    const int pad_h = pad_t;
    const int pad_w = pad_l;
    const int channel_size = height * width;
    for (int channel = channels; channel--; data_im += channel_size) {
      for (const auto kernel_row : c10::irange(kernel_h)) {
        for (const auto kernel_col : c10::irange(kernel_w)) {
          int input_row = -pad_h + kernel_row * dilation_h;
          for (int output_rows = output_h; output_rows; output_rows--) {
            if (!utils::IsAGeZeroAndALtB(input_row, height)) {
              for (int output_cols = output_w; output_cols; output_cols--) {
                *(data_col++) = zero_point;
              }
            } else {
              int input_col = -pad_w + kernel_col * dilation_w;
              for (int output_col = output_w; output_col; output_col--) {
                if (utils::IsAGeZeroAndALtB(input_col, width)) {
                  *(data_col++) = data_im[input_row * width + input_col];
                } else {
                  *(data_col++) = zero_point;
                }
                input_col += stride_w;
              }
            }
            input_row += stride_h;
          }
        }
      }
    }
    return;
  }

  // Baseline
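  // Generic path: decompose each column-buffer row index c into the source
  // channel c_im and the kernel offsets (h_offset, w_offset), then gather one
  // element (or zero_point) per output position.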
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

  int channels_col = channels * kernel_h * kernel_w;
  for (const auto c : c10::irange(channels_col)) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (const auto h : c10::irange(height_col)) {
      for (const auto w : c10::irange(width_col)) {
        int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
        int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_col[(c * height_col + h) * width_col + w] =
              data_im[(c_im * height + h_pad) * width + w_pad];
        else
          data_col[(c * height_col + h) * width_col + w] = zero_point;
      }
    }
  }
}
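
// Illustrative call sketch (hypothetical `img` buffer and `context`): for one
// 3 x 32 x 32 NCHW image with a 3x3 kernel, stride 1, padding 1 and no
// dilation, output_h == output_w == 32, so data_col needs
// (3 * 3 * 3) * (32 * 32) elements:
//
//   std::vector<float> col(3 * 3 * 3 * 32 * 32);
//   Im2ColNCHW<float>(
//       /*channels=*/3, /*height=*/32, /*width=*/32,
//       /*kernel_h=*/3, /*kernel_w=*/3, /*dilation_h=*/1, /*dilation_w=*/1,
//       /*pad_t=*/1, /*pad_l=*/1, /*pad_b=*/1, /*pad_r=*/1,
//       /*stride_h=*/1, /*stride_w=*/1,
//       img.data(), col.data(), &context);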
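
// Im2ColNdNCHW is the N-dimensional generalization of Im2ColNCHW.
// img_shape is {C, d_1, ..., d_N} and col_shape is
// {C * prod(kernel_shape), out_1, ..., out_N}; padded positions are written
// as zero_point.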
template <typename T>
static void Im2ColNdNCHW(
    const int N,
    const int /* img_size*/,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const T* X_data,
    T* Y_data,
    CPUContext* /* context */,
    const T& zero_point = 0) {
  const int outer_size = col_shape[0];
  const int inner_size = col_size / outer_size;
  const int kernel_size = std::accumulate(
      kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
  std::vector<int> d_offset(N, 0);
  std::vector<int> d_iter(N, 0);
  for (const auto i : c10::irange(outer_size)) {
    // Loop over spatial axes in reverse order to compute a per-axis offset.
    int offset = i;
    for (int d_i = N - 1; d_i >= 0; --d_i) {
      d_offset[d_i] = offset % kernel_shape[d_i];
      offset /= kernel_shape[d_i];
    }
    for (const auto j : c10::irange(inner_size)) {
      // Loop over spatial axes in forward order to compute the indices in the
      // image and column, and whether the index lies in the padding.
      const int col_index = i * inner_size + j;
      int img_index = i / kernel_size;
      bool is_padding = false;
      for (const auto d_i : c10::irange(N)) {
        const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
            d_offset[d_i] * dilation[d_i];
        is_padding |= d_img < 0 || d_img >= img_shape[d_i + 1];
        img_index = img_index * img_shape[d_i + 1] + d_img;
      }
      Y_data[col_index] = is_padding ? zero_point : X_data[img_index];
      utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
    }
  }
}

/**
 * The layout of the result is N H W G R S C/G.
 * Note that groups are pulled out to an outer dimension so that we can use
 * GEMMs efficiently.
 */
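// Within each output pixel, G indexes the group, R and S index the kernel
// height and width positions, and the innermost block of C/G channel values
// is copied contiguously with a single memcpy.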
template <typename T>
static void Im2ColNHWC(
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const T* data_im,
    T* data_col,
    CPUContext* /*context*/,
    const int groups,
    const T& zero_point) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

#ifdef _OPENMP
#pragma omp parallel for if (!omp_in_parallel())
#endif
  for (int h = 0; h < height_col; ++h) {
    int h_pad = -pad_t + h * stride_h;
    T* data_col_temp =
        data_col + h * width_col * kernel_h * kernel_w * channels;
    int w_pad = -pad_l;
    for (C10_UNUSED const auto w : c10::irange(width_col)) {
      int r = 0;
      for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
        int s = 0;
        for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
            for (const auto g : c10::irange(groups)) {
              memcpy(
                  data_col_temp +
                      ((g * kernel_h + r) * kernel_w + s) * (channels / groups),
                  data_im + (ih * width + iw) * channels +
                      g * (channels / groups),
                  sizeof(T) * (channels / groups));
            }
          } else {
            // This should be simply padded with zero.
            for (const auto g : c10::irange(groups)) {
              for (int i = 0; i < channels / groups; ++i) {
                data_col_temp
                    [(((g * kernel_h + r) * kernel_w) + s) *
                         (channels / groups) +
                     i] = zero_point;
              }
            }
          }
        } // for each iw
      } // for each ih
      data_col_temp += kernel_h * kernel_w * channels;
      w_pad += stride_w;
    } // for each output pixel
  } // for each image row
}
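
// Illustrative call sketch (hypothetical `img` buffer and `context`): for one
// NHWC image with channels = 8, height = width = 32, a 3x3 kernel, stride 1,
// padding 1, no dilation and groups = 2, the output is 32 x 32, so data_col
// needs 32 * 32 * 3 * 3 * 8 elements laid out (per image) as H W G R S C/G:
//
//   std::vector<float> col(32 * 32 * 3 * 3 * 8);
//   Im2ColNHWC<float>(
//       /*channels=*/8, /*height=*/32, /*width=*/32,
//       /*kernel_h=*/3, /*kernel_w=*/3, /*dilation_h=*/1, /*dilation_w=*/1,
//       /*pad_t=*/1, /*pad_l=*/1, /*pad_b=*/1, /*pad_r=*/1,
//       /*stride_h=*/1, /*stride_w=*/1,
//       img.data(), col.data(), &context, /*groups=*/2, /*zero_point=*/0.f);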

/**
 * The layout of the result is N T H W G Q R S C/G.
 * Note that groups are pulled out to an outer dimension so that we can use
 * GEMMs efficiently.
 */
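// Here Q indexes the temporal (frame) kernel position while R and S index the
// kernel height and width positions; as in Im2ColNHWC, each block of C/G
// channel values is copied contiguously per group.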
template <typename T>
static void Im2Col3DNHWC(
    const int channels,
    const int num_frames,
    const int height,
    const int width,
    const int kernel_t,
    const int kernel_h,
    const int kernel_w,
    const int dilation_t,
    const int dilation_h,
    const int dilation_w,
    const int pad_p, // previous frame
    const int pad_t, // top
    const int pad_l, // left
    const int pad_n, // next frame
    const int pad_b, // bottom
    const int pad_r, // right
    const int stride_t,
    const int stride_h,
    const int stride_w,
    const T* data_im,
    T* data_col,
    CPUContext* /*context*/,
    const int groups,
    const T& zero_point) {
  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int frame_col = (num_frames + pad_p + pad_n - dkernel_t) / stride_t + 1;
  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

#ifdef _OPENMP
#pragma omp parallel for if (!omp_in_parallel())
#endif
  for (int t = 0; t < frame_col; ++t) {
    int t_pad = -pad_p + t * stride_t;
    for (const auto h : c10::irange(height_col)) {
      int h_pad = -pad_t + h * stride_h;
      T* data_col_temp = data_col +
          (t * height_col + h) * width_col * kernel_t * kernel_h * kernel_w *
              channels;
      for (const auto w : c10::irange(width_col)) {
        int w_pad = -pad_l + w * stride_w;
        int q = 0;
        for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
          int r = 0;
          for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
            int s = 0;
            for (int iw = w_pad; iw < w_pad + dkernel_w;
                 iw += dilation_w, ++s) {
              if (it >= 0 && it < num_frames && ih >= 0 && ih < height &&
                  iw >= 0 && iw < width) {
                for (const auto g : c10::irange(groups)) {
                  memcpy(
                      data_col_temp +
                          (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
                              (channels / groups),
                      data_im + ((it * height + ih) * width + iw) * channels +
                          g * (channels / groups),
                      sizeof(T) * (channels / groups));
                }
              } else {
                // This should be simply padded with zero.
                for (const auto g : c10::irange(groups)) {
                  for (int i = 0; i < channels / groups; ++i) {
                    data_col_temp
                        [((((g * kernel_t + q) * kernel_h + r) * kernel_w) +
                          s) *
                             (channels / groups) +
                         i] = zero_point;
                  }
                }
              }
            } // for each iw
          } // for each ih
        } // for each it
        data_col_temp += kernel_t * kernel_h * kernel_w * channels;
      } // for each output pixel
    } // for each image row
  } // for each frame
}

} // namespace math

} // namespace caffe2