add aligned option to RoIAlign

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/23706 Reviewed By: ppwwyyxx Differential Revision: D16615823 fbshipit-source-id: fd9152af8bc979cb04044413e66af349b032a99d
2025-12-06 12:20:52 +01:00 · 2019-08-07 21:18:51 -07:00 · 2019-08-07 21:18:51 -07:00 · ad64789a1e
commit ad64789a1e
parent 15d3f0242b
14 changed files with 178 additions and 101 deletions
--- a/caffe2/operators/quantized/int8_roi_align_op.h
+++ b/caffe2/operators/quantized/int8_roi_align_op.h
@ -146,7 +146,8 @@ void ROIAlignForward(
    const float y_scale,
    const int32_t x_offset,
    const int32_t y_offset,
-    StorageOrder order) {
+    StorageOrder order /* unused */,
+    bool continuous_coordinate) {
  DCHECK(roi_cols == 4 || roi_cols == 5);

  int n_rois = nthreads / channels / pooled_width / pooled_height;
@ -163,14 +164,23 @@ void ROIAlignForward(
    }

    // Do not using rounding; this implementation detail is critical
-    float roi_start_w = offset_bottom_rois[0] * spatial_scale;
-    float roi_start_h = offset_bottom_rois[1] * spatial_scale;
-    float roi_end_w = offset_bottom_rois[2] * spatial_scale;
-    float roi_end_h = offset_bottom_rois[3] * spatial_scale;
+    float roi_offset = continuous_coordinate ? 0.5 : 0;
+    float roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
+    float roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    float roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
+    float roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;

-    // Force malformed ROIs to be 1x1
-    float roi_width = std::max(roi_end_w - roi_start_w, (float)1.);
-    float roi_height = std::max(roi_end_h - roi_start_h, (float)1.);
+    float roi_width = roi_end_w - roi_start_w;
+    float roi_height = roi_end_h - roi_start_h;
+    if (continuous_coordinate) {
+      CAFFE_ENFORCE(
+          roi_width > 0 && roi_height > 0,
+          "ROIs in ROIAlign do not have positive size!");
+    } else { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = std::max(roi_width, (float)1.);
+      roi_height = std::max(roi_height, (float)1.);
+    }
    float bin_size_h =
        static_cast<float>(roi_height) / static_cast<float>(pooled_height);
    float bin_size_w =
@ -268,7 +278,8 @@ class Int8RoIAlignOp final : public Operator<CPUContext> {
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)),
        sampling_ratio_(
-            this->template GetSingleArgument<int>("sampling_ratio", -1)) {
+            this->template GetSingleArgument<int>("sampling_ratio", -1)),
+        aligned_(this->template GetSingleArgument<bool>("aligned", false)) {
    DCHECK_GT(spatial_scale_, 0);
    DCHECK_GT(pooled_height_, 0);
    DCHECK_GT(pooled_width_, 0);
@ -325,7 +336,8 @@ class Int8RoIAlignOp final : public Operator<CPUContext> {
        Y_scale,
        X.zero_point,
        Y_offset,
-        order_);
+        order_,
+        aligned_);

    return true;
  }
@ -336,6 +348,7 @@ class Int8RoIAlignOp final : public Operator<CPUContext> {
  int pooled_height_;
  int pooled_width_;
  int sampling_ratio_;
+  bool aligned_;
 };

 } // namespace int8
--- a/caffe2/operators/roi_align_gradient_op.cc
+++ b/caffe2/operators/roi_align_gradient_op.cc
@ -88,7 +88,8 @@ void ROIAlignBackwardFeature(
    const int sampling_ratio,
    T* bottom_diff,
    const T* bottom_rois,
-    int rois_cols) {
+    int rois_cols,
+    bool continuous_coordinate) {
  DCHECK(rois_cols == 4 || rois_cols == 5);

  for (int index = 0; index < nthreads; index++) {
@ -106,18 +107,23 @@ void ROIAlignBackwardFeature(
    }

    // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
-    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
-    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
-    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
-    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;

-    // Force malformed ROIs to be 1x1
-    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
-    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (continuous_coordinate) {
+      CAFFE_ENFORCE(
+          roi_width > 0 && roi_height > 0,
+          "ROIs in ROIAlign do not have positive size!");
+    } else { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -220,7 +226,8 @@ bool RoIAlignGradientOp<float, CPUContext>::RunOnDevice() {
        sampling_ratio_,
        dX->template mutable_data<float>(),
        R.data<float>(),
-        R.dim32(1));
+        R.dim32(1),
+        aligned_);
  }
  return true;
 }
--- a/caffe2/operators/roi_align_gradient_op.cu
+++ b/caffe2/operators/roi_align_gradient_op.cu
@ -93,7 +93,8 @@ __global__ void RoIAlignBackwardFeature(
    const int pooled_width,
    const int sampling_ratio,
    T* bottom_diff,
-    const T* bottom_rois) {
+    const T* bottom_rois,
+    bool continuous_coordinate) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
@ -105,18 +106,19 @@ __global__ void RoIAlignBackwardFeature(
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
-    // T roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
-    // T roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
-    // T roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
-    // T roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale - roi_offset;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale - roi_offset;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale - roi_offset;

-    // Force malformed ROIs to be 1x1
-    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, (T)1.);
-    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, (T)1.);
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!continuous_coordinate) { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = c10::cuda::compat::max(roi_width, (T)1.);
+      roi_height = c10::cuda::compat::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -220,7 +222,8 @@ bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
            pooled_width_,
            sampling_ratio_,
            dX->template mutable_data<float>(),
-            R.data<float>());
+            R.data<float>(),
+            aligned_);
  }
  return true;
 }
--- a/caffe2/operators/roi_align_gradient_op.h
+++ b/caffe2/operators/roi_align_gradient_op.h
@ -20,7 +20,8 @@ class RoIAlignGradientOp final : public Operator<Context> {
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)),
        sampling_ratio_(
-            this->template GetSingleArgument<int>("sampling_ratio", -1)) {
+            this->template GetSingleArgument<int>("sampling_ratio", -1)),
+        aligned_(this->template GetSingleArgument<bool>("aligned", false)) {
    DCHECK_GT(spatial_scale_, 0);
    DCHECK_GT(pooled_height_, 0);
    DCHECK_GT(pooled_width_, 0);
@ -37,6 +38,7 @@ class RoIAlignGradientOp final : public Operator<Context> {
  int pooled_height_;
  int pooled_width_;
  int sampling_ratio_;
+  bool aligned_;
 };

 } // namespace caffe2
--- a/caffe2/operators/roi_align_op.cc
+++ b/caffe2/operators/roi_align_op.cc
@ -128,7 +128,8 @@ void ROIAlignForward(
    const T* bottom_rois,
    int roi_cols,
    T* top_data,
-    StorageOrder order) {
+    StorageOrder order,
+    bool continuous_coordinate) {
  DCHECK(roi_cols == 4 || roi_cols == 5);

  int n_rois = nthreads / channels / pooled_width / pooled_height;
@ -148,18 +149,23 @@ void ROIAlignForward(
    }

    // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
-    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
-    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
-    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
-    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;

-    // Force malformed ROIs to be 1x1
-    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
-    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (continuous_coordinate) {
+      CAFFE_ENFORCE(
+          roi_width > 0 && roi_height > 0,
+          "ROIs in ROIAlign do not have positive size!");
+    } else { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -293,7 +299,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
    auto* Y = Output(
        0,
        {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_},
-        at::dtype<float>());  // RoI pooled data
+        at::dtype<float>()); // RoI pooled data
    int output_size = Y->numel();
    ROIAlignForward<float>(
        output_size,
@ -308,12 +314,13 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
        R.data<float>(),
        R.dim32(1),
        Y->template mutable_data<float>(),
-        order_);
+        order_,
+        aligned_);
  } else if (order_ == StorageOrder::NHWC) {
    auto* Y = Output(
        0,
        {R.dim32(0), pooled_height_, pooled_width_, X.dim32(3)},
-        at::dtype<float>());  // RoI pooled data
+        at::dtype<float>()); // RoI pooled data
    int output_size = Y->numel();
    ROIAlignForward<float>(
        output_size,
@ -328,7 +335,8 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
        R.data<float>(),
        R.dim32(1),
        Y->template mutable_data<float>(),
-        order_);
+        order_,
+        aligned_);
  }

  return true;
@ -376,6 +384,7 @@ Region of Interest (RoI) align operation as used in Mask R-CNN.

 using RoIAlignOpFloatCPU = caffe2::RoIAlignOp<float, caffe2::CPUContext>;

+// clang-format off
 C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    RoIAlign,
    "_caffe2::RoIAlign("
@ -385,6 +394,8 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
      "float spatial_scale, "
      "int pooled_h, "
      "int pooled_w, "
-      "int sampling_ratio"
+      "int sampling_ratio, "
+      "bool aligned"
    ") -> Tensor",
    RoIAlignOpFloatCPU);
+// clang-format on
--- a/caffe2/operators/roi_align_op.cu
+++ b/caffe2/operators/roi_align_op.cu
@ -77,7 +77,8 @@ __global__ void RoIAlignForward(
    const int sampling_ratio,
    const T* bottom_rois,
    int roi_cols,
-    T* top_data) {
+    T* top_data,
+    bool continuous_coordinate) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
@ -94,18 +95,19 @@ __global__ void RoIAlignForward(
    }

    // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
-    // T roi_start_w = roundf(offset_bottom_rois[0] * spatial_scale);
-    // T roi_start_h = roundf(offset_bottom_rois[1] * spatial_scale);
-    // T roi_end_w = roundf(offset_bottom_rois[2] * spatial_scale);
-    // T roi_end_h = roundf(offset_bottom_rois[3] * spatial_scale);
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;

-    // Force malformed ROIs to be 1x1
-    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, (T)1.);
-    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, (T)1.);
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!continuous_coordinate) { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = c10::cuda::compat::max(roi_width, (T)1.);
+      roi_height = c10::cuda::compat::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -150,17 +152,21 @@ template <>
 bool RoIAlignOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs
-   // RoI pooled data
+                      // RoI pooled data

  if (R.numel() == 0) {
    // Handle empty rois
-    Output(0, {0, X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>());
+    Output(
+        0, {0, X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>());
    return true;
  }

  assert(sampling_ratio_ >= 0);

-  auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>());
+  auto* Y = Output(
+      0,
+      {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_},
+      at::dtype<float>());
  int output_size = Y->numel();
  RoIAlignForward<float>
      <<<CAFFE_GET_BLOCKS(output_size),
@ -178,7 +184,8 @@ bool RoIAlignOp<float, CUDAContext>::RunOnDevice() {
          sampling_ratio_,
          R.data<float>(),
          R.dim32(1),
-          Y->mutable_data<float>());
+          Y->mutable_data<float>(),
+          aligned_);
  return true;
 }

--- a/caffe2/operators/roi_align_op.h
+++ b/caffe2/operators/roi_align_op.h
@ -4,9 +4,9 @@
 #define ROI_ALIGN_OP_H_

 #include "caffe2/core/context.h"
+#include "caffe2/core/export_caffe2_op_to_c10.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
-#include "caffe2/core/export_caffe2_op_to_c10.h"

 C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(RoIAlign)

@ -25,7 +25,8 @@ class RoIAlignOp final : public Operator<Context> {
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)),
        sampling_ratio_(
-            this->template GetSingleArgument<int>("sampling_ratio", -1)) {
+            this->template GetSingleArgument<int>("sampling_ratio", -1)),
+        aligned_(this->template GetSingleArgument<bool>("aligned", false)) {
    DCHECK_GT(spatial_scale_, 0);
    DCHECK_GT(pooled_height_, 0);
    DCHECK_GT(pooled_width_, 0);
@ -44,6 +45,7 @@ class RoIAlignOp final : public Operator<Context> {
  int pooled_height_;
  int pooled_width_;
  int sampling_ratio_;
+  bool aligned_;
 };

 } // namespace caffe2
--- a/caffe2/operators/roi_align_rotated_gradient_op.cu
+++ b/caffe2/operators/roi_align_rotated_gradient_op.cu
@ -91,7 +91,8 @@ __global__ void RoIAlignRotatedBackward(
    const int pooled_width,
    const int sampling_ratio,
    T* bottom_diff,
-    const T* bottom_rois) {
+    const T* bottom_rois,
+    bool continuous_coordinate) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
@ -103,15 +104,18 @@ __global__ void RoIAlignRotatedBackward(
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round
-    T roi_center_w = offset_bottom_rois[1] * spatial_scale;
-    T roi_center_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_center_w = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_center_h = offset_bottom_rois[2] * spatial_scale - roi_offset;
    T roi_width = offset_bottom_rois[3] * spatial_scale;
    T roi_height = offset_bottom_rois[4] * spatial_scale;
    T theta = offset_bottom_rois[5] * M_PI / 180.0;

-    // Force malformed ROIs to be 1x1
-    roi_width = c10::cuda::compat::max(roi_width, (T)1.);
-    roi_height = c10::cuda::compat::max(roi_height, (T)1.);
+    if (!continuous_coordinate) { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = c10::cuda::compat::max(roi_width, (T)1.);
+      roi_height = c10::cuda::compat::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -225,7 +229,8 @@ bool RoIAlignRotatedGradientOp<float, CUDAContext>::RunOnDevice() {
            pooled_width_,
            sampling_ratio_,
            dX->mutable_data<float>(),
-            R.data<float>());
+            R.data<float>(),
+            aligned_);
  }
  return true;
 }
--- a/caffe2/operators/roi_align_rotated_gradient_op.h
+++ b/caffe2/operators/roi_align_rotated_gradient_op.h
@ -20,7 +20,8 @@ class RoIAlignRotatedGradientOp final : public Operator<Context> {
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)),
        sampling_ratio_(
-            this->template GetSingleArgument<int>("sampling_ratio", -1)) {
+            this->template GetSingleArgument<int>("sampling_ratio", -1)),
+        aligned_(this->template GetSingleArgument<bool>("aligned", false)) {
    DCHECK_GT(spatial_scale_, 0);
    DCHECK_GT(pooled_height_, 0);
    DCHECK_GT(pooled_width_, 0);
@ -37,6 +38,7 @@ class RoIAlignRotatedGradientOp final : public Operator<Context> {
  int pooled_height_;
  int pooled_width_;
  int sampling_ratio_;
+  bool aligned_;
 };

 } // namespace caffe2
--- a/caffe2/operators/roi_align_rotated_op.cc
+++ b/caffe2/operators/roi_align_rotated_op.cc
@ -140,7 +140,8 @@ void ROIAlignRotatedForward(
    const T* bottom_rois,
    int roi_cols,
    T* top_data,
-    StorageOrder order) {
+    StorageOrder order,
+    bool continuous_coordinate) {
  DCHECK(roi_cols == 5 || roi_cols == 6);

  int n_rois = nthreads / channels / pooled_width / pooled_height;
@ -159,15 +160,22 @@ void ROIAlignRotatedForward(
    }

    // Do not round
-    T roi_center_w = offset_bottom_rois[0] * spatial_scale;
-    T roi_center_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_center_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
+    T roi_center_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
    T roi_width = offset_bottom_rois[2] * spatial_scale;
    T roi_height = offset_bottom_rois[3] * spatial_scale;
    T theta = offset_bottom_rois[4] * M_PI / 180.0;

-    // Force malformed ROIs to be 1x1
-    roi_width = std::max(roi_width, (T)1.);
-    roi_height = std::max(roi_height, (T)1.);
+    if (continuous_coordinate) {
+      CAFFE_ENFORCE(
+          roi_width > 0 && roi_height > 0,
+          "ROIs in ROIAlign do not have positive size!");
+    } else { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -309,7 +317,7 @@ bool RoIAlignRotatedOp<float, CPUContext>::RunOnDevice() {
    auto* Y = Output(
        0,
        {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_},
-        at::dtype<float>());  // RoI pooled data
+        at::dtype<float>()); // RoI pooled data

    size_t output_size = Y->numel();
    ROIAlignRotatedForward<float>(
@ -325,12 +333,13 @@ bool RoIAlignRotatedOp<float, CPUContext>::RunOnDevice() {
        R.data<float>(),
        R.dim32(1),
        Y->mutable_data<float>(),
-        order_);
+        order_,
+        aligned_);
  } else if (order_ == StorageOrder::NHWC) {
    auto* Y = Output(
        0,
        {R.dim32(0), pooled_height_, pooled_width_, X.dim32(3)},
-        at::dtype<float>());   // RoI pooled data
+        at::dtype<float>()); // RoI pooled data
    size_t output_size = Y->numel();
    ROIAlignRotatedForward<float>(
        output_size,
@ -345,7 +354,8 @@ bool RoIAlignRotatedOp<float, CPUContext>::RunOnDevice() {
        R.data<float>(),
        R.dim32(1),
        Y->mutable_data<float>(),
-        order_);
+        order_,
+        aligned_);
  }

  return true;
--- a/caffe2/operators/roi_align_rotated_op.cu
+++ b/caffe2/operators/roi_align_rotated_op.cu
@ -81,7 +81,8 @@ __global__ void RoIAlignRotatedForward(
    const int pooled_width,
    const int sampling_ratio,
    const T* bottom_rois,
-    T* top_data) {
+    T* top_data,
+    bool continuous_coordinate) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
@ -93,15 +94,18 @@ __global__ void RoIAlignRotatedForward(
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round
-    T roi_center_w = offset_bottom_rois[1] * spatial_scale;
-    T roi_center_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_center_w = offset_bottom_rois[1] * spatial_scale - roi_offset;
+    T roi_center_h = offset_bottom_rois[2] * spatial_scale - roi_offset;
    T roi_width = offset_bottom_rois[3] * spatial_scale;
    T roi_height = offset_bottom_rois[4] * spatial_scale;
    T theta = offset_bottom_rois[5] * M_PI / 180.0;

-    // Force malformed ROIs to be 1x1
-    roi_width = c10::cuda::compat::max(roi_width, (T)1.);
-    roi_height = c10::cuda::compat::max(roi_height, (T)1.);
+    if (!continuous_coordinate) { // backward compatibility
+      // Force malformed ROIs to be 1x1
+      roi_width = c10::cuda::compat::max(roi_width, (T)1.);
+      roi_height = c10::cuda::compat::max(roi_height, (T)1.);
+    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

@ -162,7 +166,10 @@ bool RoIAlignRotatedOp<float, CUDAContext>::RunOnDevice() {

  if (R.numel() == 0) {
    // Handle empty rois
-    Output(0, {0, X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>()); // RoI pooled data
+    Output(
+        0,
+        {0, X.dim32(1), pooled_height_, pooled_width_},
+        at::dtype<float>()); // RoI pooled data
    return true;
  }

@ -171,7 +178,10 @@ bool RoIAlignRotatedOp<float, CUDAContext>::RunOnDevice() {

  assert(sampling_ratio_ >= 0);

-  auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>()); // RoI pooled data
+  auto* Y = Output(
+      0,
+      {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_},
+      at::dtype<float>()); // RoI pooled data

  int output_size = Y->numel();
  RoIAlignRotatedForward<float>
@ -189,7 +199,8 @@ bool RoIAlignRotatedOp<float, CUDAContext>::RunOnDevice() {
          pooled_width_,
          sampling_ratio_,
          R.data<float>(),
-          Y->mutable_data<float>());
+          Y->mutable_data<float>(),
+          aligned_);
  return true;
 }

--- a/caffe2/operators/roi_align_rotated_op.h
+++ b/caffe2/operators/roi_align_rotated_op.h
@ -22,7 +22,8 @@ class RoIAlignRotatedOp final : public Operator<Context> {
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)),
        sampling_ratio_(
-            this->template GetSingleArgument<int>("sampling_ratio", -1)) {
+            this->template GetSingleArgument<int>("sampling_ratio", -1)),
+        aligned_(this->template GetSingleArgument<bool>("aligned", false)) {
    DCHECK_GT(spatial_scale_, 0);
    DCHECK_GT(pooled_height_, 0);
    DCHECK_GT(pooled_width_, 0);
@ -41,6 +42,7 @@ class RoIAlignRotatedOp final : public Operator<Context> {
  int pooled_height_;
  int pooled_width_;
  int sampling_ratio_;
+  bool aligned_;
 };

 } // namespace caffe2
--- a/caffe2/python/operator_test/torch_integration_test.py
+++ b/caffe2/python/operator_test/torch_integration_test.py
@ -446,6 +446,7 @@ class TorchIntegration(hu.HypothesisTestCase):
            pooled_h=3,
            pooled_w=3,
            sampling_ratio=0,
+            aligned=False,
        )
        torch.testing.assert_allclose(roi_feature_ref, roi_feature.cpu())

--- a/test/onnx/test_pytorch_onnx_caffe2.py
+++ b/test/onnx/test_pytorch_onnx_caffe2.py
@ -1551,6 +1551,7 @@ class TestCaffe2Backend_opset9(unittest.TestCase):
                    pooled_h=3,
                    pooled_w=3,
                    sampling_ratio=0,
+                    aligned=False,
                )
                return output

@ -1571,7 +1572,7 @@ class TestCaffe2Backend_opset9(unittest.TestCase):
            def forward(self, feature, rois):
                roi_feature = torch.ops._caffe2.RoIAlign(
                    feature, rois, order="NCHW", spatial_scale=1.0,
-                    pooled_h=3, pooled_w=3, sampling_ratio=3,
+                    pooled_h=3, pooled_w=3, sampling_ratio=3, aligned=False,
                )
                return roi_feature