Make Android segmentation network run on both iOS and Android with tiling

Summary: Add tiling support to GLAdd, GLPool, and GLResizeNearest.

Differential Revision: D5733208

fbshipit-source-id: b73113326b96d421787d4695ccf7d2d919ee2ed8
2025-12-06 12:20:52 +01:00 · 2017-09-04 17:18:53 -07:00 · 2017-09-04 17:18:53 -07:00 · dd5400e452
commit dd5400e452
parent 2d9728d594
19 changed files with 714 additions and 868 deletions
--- a/caffe2/contrib/opengl/core/GLContext.h
+++ b/caffe2/contrib/opengl/core/GLContext.h
@ -26,9 +26,7 @@ class GLContext {

  static bool GL_EXT_texture_border_clamp_defined();

-  inline bool halfFloatTextureSupported() {
-    return half_float_supported;
-  }
+  inline bool halfFloatTextureSupported() { return half_float_supported; } // exposes the half_float_supported flag to callers

  void setTextureAllocator(
      std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
--- a/caffe2/contrib/opengl/core/GLFilter.cc
+++ b/caffe2/contrib/opengl/core/GLFilter.cc
@ -76,13 +76,9 @@ std::string GLFilter::process_replacements(std::string shader,
  // Add some #defines for convenience
  std::string version_tag = "#version 300 es";
  if (GLContext::getGLContext()->halfFloatTextureSupported()) {
-    shader.insert(
-        shader.find(version_tag) + version_tag.size(),
-        half_float_texture_utils);
+    shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
  } else {
-    shader.insert(
-        shader.find(version_tag) + version_tag.size(),
-        half_float_compat_texture_utils);
+    shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
  }
  shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
  return shader;
--- a/caffe2/contrib/opengl/core/GLImage.h
+++ b/caffe2/contrib/opengl/core/GLImage.h
@ -18,6 +18,8 @@ class GLImage {

  const int tile_x;
  const int tile_y;
+  const int texture_width;
+  const int texture_height;
  const int slices;

  const std::vector<const GLTexture*> textures;
@ -50,6 +52,8 @@ class GLImage {
        data_size(sizeof(T)),
        tile_x(_tile_x),
        tile_y(_tile_y),
+        texture_width(_width * _tile_x),
+        texture_height(_height * _tile_y),
        slices(channels_to_slices(_channels, _tile_x, _tile_y)),
        textures(allocate_textures(slices, texture_loader)) {
    CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
@ -68,13 +72,23 @@ class GLImage {
        data_size(sizeof(T)),
        tile_x(_tile_x),
        tile_y(_tile_y),
+        texture_width(_width * _tile_x),
+        texture_height(_height * _tile_y),
        slices(channels_to_slices(_channels, _tile_x, _tile_y)),
        textures(allocate_textures(slices, texture_loader)) {
    CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
  }

  GLImage()
-      : width(0), height(0), channels(0), data_size(sizeof(T)), tile_x(0), tile_y(0), slices(0){};
+      : width(0), // default image: every dimension, tile count and texture extent is zero
+        height(0),
+        channels(0),
+        data_size(sizeof(T)),
+        tile_x(0),
+        tile_y(0),
+        texture_width(0),
+        texture_height(0),
+        slices(0){}; // `textures` is not in the initializer list, so it is default-constructed

  virtual ~GLImage() {
    gl_log(GL_VERBOSE, "deleting GLImage\n");
--- a/caffe2/contrib/opengl/core/GLPlainTexture.cc
+++ b/caffe2/contrib/opengl/core/GLPlainTexture.cc
@ -6,37 +6,18 @@
 #include "caffe2/core/logging.h"
 #include "caffe2/core/timer.h"

-#define half_float_supported \
-  (GLContext::getGLContext()->halfFloatTextureSupported())
+#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported()) // calls into the GL context on every expansion

-#define FIXED_TYPE(_t)                                  \
-  (((_t).type != GL_HALF_FLOAT || half_float_supported) \
-       ? (_t)                                           \
-       : GLTexture::FP16_COMPAT)
+#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT) // keeps _t unless it is GL_HALF_FLOAT and half-float textures are unsupported; then yields FP16_COMPAT

 GLPlainTexture::GLPlainTexture(
-    const Type& type,
-    const void* input,
-    GLsizei width,
-    GLsizei height,
-    bool use_padding,
-    GLint filter,
-    GLint wrap)
+    const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
    : GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
  //  caffe2::Timer timer;
  //  timer.Start();
  glGenTextures(1, &_textureId);
  glBindTexture(GL_TEXTURE_2D, _textureId);
-  glTexImage2D(
-      GL_TEXTURE_2D,
-      0,
-      _type.internalFormat,
-      _stride,
-      _height,
-      0,
-      _type.format,
-      _type.type,
-      input);
+  glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);

  gl_log(
      GL_VERBOSE,
@ -64,13 +45,7 @@ GLPlainTexture::GLPlainTexture(
 }

 GLPlainTexture::GLPlainTexture(
-    const Type& type,
-    const GLuint textureID,
-    GLsizei width,
-    GLsizei height,
-    bool use_padding,
-    GLint filter,
-    GLint wrap)
+    const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
    : GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
  _textureId = textureID;
  isOwner = false;
--- a/caffe2/contrib/opengl/core/GLTexture.cc
+++ b/caffe2/contrib/opengl/core/GLTexture.cc
@ -31,9 +31,7 @@ void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz

 const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
 const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
-const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI,
-                                                GL_RG_INTEGER,
-                                                GL_UNSIGNED_INT};
+const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};

 void GLTexture::map_read(std::function<void(const void* buffer,
                                            size_t width,
--- a/caffe2/contrib/opengl/core/GLTexture.h
+++ b/caffe2/contrib/opengl/core/GLTexture.h
@ -13,14 +13,14 @@ class GLTexture {

    int dataSize() const { // returns 4 / 2 / 1 for GL_UNSIGNED_INT / GL_HALF_FLOAT / GL_UNSIGNED_BYTE, i.e. the byte width of one such value
      switch (type) {
-        case GL_UNSIGNED_INT:
-          return 4;
-        case GL_HALF_FLOAT:
-          return 2;
-        case GL_UNSIGNED_BYTE:
-          return 1;
-        default:
-          throw std::runtime_error("Unknown Texture Type");
+      case GL_UNSIGNED_INT:
+        return 4;
+      case GL_HALF_FLOAT:
+        return 2;
+      case GL_UNSIGNED_BYTE:
+        return 1;
+      default:
+        throw std::runtime_error("Unknown Texture Type"); // any other GL type is rejected instead of guessing a size
      }
    }

--- a/caffe2/contrib/opengl/core/rewrite_net.cc
+++ b/caffe2/contrib/opengl/core/rewrite_net.cc
@ -251,7 +251,7 @@ void dumpDefForOpenGL(const NetDef& d) {
 //  }
 //}

-NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling) {
+NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
  CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
  NetDef net;
  net.CopyFrom(predictNet);
@ -303,7 +303,9 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
    CAFFE_THROW("OpenGL operator missing");
  }

-  net = runOpenGLFusion(net, openGLOps);
+  if (runFusion) {
+    net = runOpenGLFusion(net, openGLOps);
+  }

  if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
    // For end-to-end testing
@ -320,7 +322,9 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
      copy_op->add_output(net.external_output(0));
    }
  } else {
-    needCopyOps = true;
+    if (!useTextureInput) {
+      needCopyOps = true;
+    }
  }

  // copy ops are needed when the input is not a texture
@ -335,10 +339,12 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
 bool tryConvertToOpenGL(const NetDef& initNet,
                        const NetDef& predictNet,
                        NetDef* glPredictNet,
-                        bool useTextureInput) {
+                        bool useTextureInput,
+                        bool useTiling,
+                        bool runFusion) {
  try {
    // Throws if unsupported operators are found.
-    *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput);
+    *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
    dumpDefForOpenGL(*glPredictNet);
    // Throws if unsupported parameters are found.
    Workspace ws;
--- a/caffe2/contrib/opengl/core/rewrite_net.h
+++ b/caffe2/contrib/opengl/core/rewrite_net.h
@ -8,11 +8,14 @@ namespace caffe2 {
 bool tryConvertToOpenGL(const NetDef& initNet,
                        const NetDef& predictNet,
                        NetDef* glPredictNet,
-                        bool useTextureInput = false);
+                        bool useTextureInput = false, // passed through to rewritePredictNetForOpenGL
+                        bool useTiling       = false, // passed through to rewritePredictNetForOpenGL
+                        bool runFusion       = true); // passed through to rewritePredictNetForOpenGL; the defaults keep existing call sites compiling unchanged

 // Exposed for testing
 NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
                                  bool useTextureInput = false,
-                                  bool useTiling = false);
+                                  bool useTiling       = false,
+                                  bool runFusion       = true); // false skips the runOpenGLFusion pass over the rewritten net
 void dumpDefForOpenGL(const NetDef& net);
 } // namespace caffe2
--- a/caffe2/contrib/opengl/operators/GLAdd.cc
+++ b/caffe2/contrib/opengl/operators/GLAdd.cc
@ -75,9 +75,9 @@ void GLAdd::add(const GLImageVector<T>& input_images0,

      run(input_attachments,
          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
-          output_image->width,
-          output_image->height);
+          [&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
+          output_image->texture_width,
+          output_image->texture_height);
    }
  }
 }
@ -91,8 +91,7 @@ class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
                           "OpenGLAdd does not support broadcast");

-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
-                           "OpenGLMul does not support axis");
+    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
  }

  bool RunOnDevice() override {
@ -105,18 +104,25 @@ class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
    const int input_channels = input0.channels();
    const int input_width = input0.width();
    const int input_height = input0.height();
+    const int input_tile_x   = input0.tile_x();
+    const int input_tile_y   = input0.tile_y();
+
    CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
    CAFFE_ENFORCE_EQ(input1.width(), input_width);
    CAFFE_ENFORCE_EQ(input1.height(), input_height);
+    CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
+    CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);

    const int output_channels = input_channels;
    const int output_width = input_width;
    const int output_height = input_height;
+    const int output_tile_x   = input_tile_x;
+    const int output_tile_y   = input_tile_y;

    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);

    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
+        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);

    if (!_add) {
      _add.reset(new GLAdd());
--- a/caffe2/contrib/opengl/operators/GLConvolution.cc
+++ b/caffe2/contrib/opengl/operators/GLConvolution.cc
--- a/caffe2/contrib/opengl/operators/GLPRelu.cc
+++ b/caffe2/contrib/opengl/operators/GLPRelu.cc
@ -32,19 +32,18 @@ class GLPRelu : public GLFilter {
          int _output_tile_y,
          int _output_tile_width,
          int _output_tile_height)
-      : GLFilter(
-            "GLPRelu",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(inputData)}),
-            std::vector<binding*>({BINDING(scale_block)}),
-            {/* no attributes */},
-            {{"USE_RELU", caffe2::to_string(PRelu)},
-             {"OUTPUT_TILES", caffe2::to_string(_output_tile_x * _output_tile_y)},
-             {"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
-             {"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
-             {"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
-             {"TILED_CONVOLUTION", caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
+      : GLFilter("GLPRelu",
+                 vertex_shader,
+                 fragment_shader,
+                 std::vector<binding*>({BINDING(inputData)}),
+                 std::vector<binding*>({BINDING(scale_block)}),
+                 {/* no attributes */},
+                 {{"USE_RELU", caffe2::to_string(PRelu)},
+                  {"OUTPUT_TILES", caffe2::to_string(_output_tile_x * _output_tile_y)},
+                  {"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
+                  {"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
+                  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
+                  {"TILED_PRELU", caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
        scale(_scale),
        scale_size(_scale_size),
        channels(_channels),
@ -67,7 +66,7 @@ class GLPRelu : public GLFilter {
                  {"OUTPUT_TILE_X", caffe2::to_string(1)},
                  {"OUTPUT_TILE_WIDTH", caffe2::to_string(1)},
                  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(1)},
-                  {"TILED_CONVOLUTION", caffe2::to_string(0)}}),
+                  {"TILED_PRELU", caffe2::to_string(0)}}),
        scale(nullptr),
        scale_block(nullptr),
        scale_size(0),
@ -88,75 +87,72 @@ class GLPRelu : public GLFilter {
 // MARK: GLSL

 const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
-
+#define TILED_PRELU                 $(TILED_PRELU)
 #define USE_RELU                    $(USE_RELU)
+
+// tiling
 #define OUTPUT_TILES                $(OUTPUT_TILES)
 #define OUTPUT_TILE_X               $(OUTPUT_TILE_X)
 #define OUTPUT_TILE_WIDTH           $(OUTPUT_TILE_WIDTH)
 #define OUTPUT_TILE_HEIGHT          $(OUTPUT_TILE_HEIGHT)
-#define TILED_CONVOLUTION           $(TILED_CONVOLUTION)

+// common
 precision mediump float;
 precision highp int;

 TEXTURE_INPUT(inputData);
 TEXTURE_OUTPUT(0, outputData);

-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
 in highp vec2 v_texCoord;

-#if !USE_RELU
-#if TILED_CONVOLUTION == 1
-layout (std140) uniform scale_block {
-  highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
-};
+#if USE_RELU

-#else
-layout (std140) uniform scale_block {
-  highp uvec4 scale;
-};
-#endif
-#endif
-
-#if !USE_RELU
-
-#if TILED_CONVOLUTION
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-  
-  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
-  
-  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-  
-  // output.data     = value > 0 ? value : value * weight;
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
-  value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
-  outputData = TEXTURE_STORE(value);
-}
-
-#else
-
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
-  outputData = TEXTURE_STORE(value);
-}
-#endif // TILED_CONVOLUTION
-
-#else // Relu
+// Relu
 void main() {
  ivec2 inputSize = textureSize(inputData, 0);
  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
  outputData = TEXTURE_STORE(max(value, vec4(0.0)));
 }
-#endif
+
+#else
+
+#if TILED_PRELU
+const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
+
+layout (std140) uniform scale_block {
+  highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
+};
+
+void main() {
+  ivec2 inputSize = textureSize(inputData, 0);
+  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
+
+  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
+  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
+
+  // outputData = value > 0 ? value : value * weight;
+  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
+  vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
+  value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
+  outputData = TEXTURE_STORE(value);
+}
+#else
+layout (std140) uniform scale_block {
+  highp uvec4 scale;
+};
+void main() {
+  ivec2 inputSize = textureSize(inputData, 0);
+  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
+
+  // outputData = value > 0 ? value : value * weight;
+  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
+  value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
+  outputData = TEXTURE_STORE(value);
+}
+#endif // TILED_PRELU
+
+#endif // USE_RELU

 )GLSL";

@ -190,8 +186,8 @@ void GLPRelu::prelu(const GLImageVector<T>& input_images,
      run(input_attachments,
          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
          [&]() {},
-          output_image->width * output_image->tile_x,
-          output_image->height * output_image->tile_y);
+          output_image->texture_width,
+          output_image->texture_height);
    }
  }
 }
--- a/caffe2/contrib/opengl/operators/GLPadImage.cc
+++ b/caffe2/contrib/opengl/operators/GLPadImage.cc
@ -139,8 +139,8 @@ class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator
      padImage_.reset(new GLPadImage());
      LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
                << output_channels << ": " << output_height << " X " << output_width;
-      LOG(INFO) << "Padmode: " << mode_ << "pad_l = " << pad_l() << ", pad_r = " << pad_r()
-                << ", pad_t = " << pad_t() << ", pad_b = " << pad_b();
+      LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
+                << ", pad_b = " << pad_b();
    }

    padImage_->pad(input, *output, pad_l(), pad_t());
--- a/caffe2/contrib/opengl/operators/GLPool.cc
+++ b/caffe2/contrib/opengl/operators/GLPool.cc
@ -21,6 +21,8 @@ class GLPool : public GLFilter {
    point kernel_size;
    point input_padding;
    point input_stride;
+    point input_tile_size;
+    point output_tile_size;
  };

  binding* inputData;
@ -29,25 +31,29 @@ class GLPool : public GLFilter {

  const descriptor geometry;

-  GLPool(const descriptor& _geometry, PoolType poolType)
-      : GLFilter(
-            "GLPool",
-            vertex_shader,
-            fragment_shader,
-            {
-                BINDING(inputData), BINDING(kernelSize), BINDING(outputSize),
-            },
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
-             {"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
-             {"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
-             {"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
-             {"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
-             {"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
-             {"TEXTURE_BORDER_CLAMP",
-              caffe2::to_string(GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined())},
-             {"MAX_POOL", caffe2::to_string(poolType == MaxPool)}}),
+  GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling) // _tiling selects the TILED_POOLING shader variant
+      : GLFilter("GLPool",
+                 vertex_shader,
+                 fragment_shader,
+                 {
+                     BINDING(inputData), BINDING(kernelSize), BINDING(outputSize),
+                 },
+                 {/* no uniform blocks */},
+                 {/* no attributes */},
+                 {{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
+                  {"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
+                  {"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
+                  {"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
+                  {"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
+                  {"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
+                  {"INPUT_TILE_WIDTH", caffe2::to_string(_geometry.input_tile_size.x)}, // .x is the tile width, .y the tile height
+                  {"INPUT_TILE_HEIGHT", caffe2::to_string(_geometry.input_tile_size.y)},
+                  {"OUTPUT_TILE_WIDTH", caffe2::to_string(_geometry.output_tile_size.x)}, // callers must fill output_tile_size as {width, height}
+                  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(_geometry.output_tile_size.y)},
+                  {"TILED_POOLING", caffe2::to_string(_tiling)}, // bool is stringified, so the shader sees 0 or 1
+                  {"TEXTURE_BORDER_CLAMP",
+                   caffe2::to_string(GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined())},
+                  {"MAX_POOL", caffe2::to_string(poolType == MaxPool)}}),
        geometry(_geometry) {}
  ~GLPool() {}

@ -63,11 +69,11 @@ class GLPool : public GLFilter {
        run({{input_image->textures[is], inputData}},
            {output_image->textures[is]},
            [&]() {
-              glUniform2i(outputSize->location, output_image->width, output_image->height);
+              glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
              glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
            },
-            output_image->width,
-            output_image->height);
+            output_image->texture_width,
+            output_image->texture_height);
      }
    }
  }
@ -78,10 +84,16 @@ class GLPool : public GLFilter {

 // MARK: GLSL
 const char* GLPool::fragment_shader = R"GLSL(#version 300 es
-
+#define TILED_POOLING           $(TILED_POOLING)
 #define TEXTURE_BORDER_CLAMP    $(TEXTURE_BORDER_CLAMP)
 #define MAX_POOL                $(MAX_POOL)

+// tiling
+#define INPUT_TILE_WIDTH            $(INPUT_TILE_WIDTH)
+#define INPUT_TILE_HEIGHT           $(INPUT_TILE_HEIGHT)
+#define OUTPUT_TILE_WIDTH           $(OUTPUT_TILE_WIDTH)
+#define OUTPUT_TILE_HEIGHT          $(OUTPUT_TILE_HEIGHT)
+
 precision mediump float;
 precision mediump int;

@ -90,24 +102,84 @@ in highp vec2 v_texCoord;
 const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
 const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
 const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
-const int channels = 4;

 uniform ivec2 kernelSize;
 uniform ivec2 outputSize;

 TEXTURE_INPUT(inputData);
-
 TEXTURE_OUTPUT(0, outputData);

-const bool no_bounds = bool(TEXTURE_BORDER_CLAMP) || all(equal(input_padding, ivec2(0)));
-
+const bool no_bounds = (TILED_POOLING == 0) && (bool(TEXTURE_BORDER_CLAMP) || all(equal(input_padding, ivec2(0))));
 #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))

+// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
+const float MIN_FLOAT = -exp2(14.0);
+
+#if TILED_POOLING
+
+const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
+const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
+
+// tiled pooling
 #if MAX_POOL

-// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
+#define POOL { \
+  pool = vec4(MIN_FLOAT); \
+  for (int y = 0; y < kernelSize.y; y++) { \
+    for (int x = 0; x < kernelSize.x; x++) { \
+      ivec2 idx = tileCoord + ivec2(x, y); \
+      if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputTileSize)) { \
+        vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
+        pool = max(pool, data); \
+      } \
+    } \
+  } \
+}

-const float MIN_FLOAT = -exp2(14.0);
+#else
+
+#define POOL { \
+  int count = 0; \
+  for (int y = 0; y < kernelSize.y; y++) { \
+    for (int x = 0; x < kernelSize.x; x++) { \
+      ivec2 idx = tileCoord + ivec2(x, y); \
+      if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputTileSize)) { \
+        vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
+        pool += data;\
+        count += 1; \
+      } \
+    } \
+  } \
+  pool = pool / float(count); \
+}
+
+#endif // MAX_POOL
+
+void main() {
+  ivec2 inputSize = textureSize(inputData, 0);
+  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
+
+  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
+  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
+  tileCoord = input_stride * tileCoord - input_padding;
+
+  ivec2 inputTileOffset = tile * inputTileSize;
+
+#if MAX_POOL
+  vec4 pool = vec4(0);
+#else
+  highp vec4 pool = vec4(0);
+#endif
+
+  POOL;
+
+  outputData = TEXTURE_STORE(pool);
+}
+
+#else
+
+// no tiling
+#if MAX_POOL

 #define POOL { \
  pool = vec4(MIN_FLOAT); \
@ -125,22 +197,21 @@ const float MIN_FLOAT = -exp2(14.0);
 #else

 #define POOL { \
+  int count = 0; \
  for (int y = 0; y < kernelSize.y; y++) { \
    for (int x = 0; x < kernelSize.x; x++) { \
      ivec2 idx = texelCoord + ivec2(x, y); \
      if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputSize)) { \
        vec4 data = TEXTURE_LOAD(inputData, idx); \
-        pool += data;\
+        pool += data; \
+        count += 1; \
      } \
    } \
  } \
-  ivec2 start = texelCoord; \
-  ivec2 end = min(start + kernel_size, inputSize); \
-  start = max(ivec2(0), start); \
-  pool = pool / float((end.x - start.x) * (end.y - start.y)); \
+  pool = pool / float(count); \
 }

-#endif
+#endif // MAX_POOL

 void main() {
  ivec2 inputSize = textureSize(inputData, 0);
@ -155,6 +226,8 @@ void main() {

  outputData = TEXTURE_STORE(pool);
 }
+#endif // TILED_POOLING
+
 )GLSL";

 namespace caffe2 {
@ -199,18 +272,25 @@ class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16

    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);

-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
+    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
+    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;

-    GLPool::descriptor geometry{
-        input_channels, {kernel_w(), kernel_h()}, {pad_l(), pad_t()}, {stride_w(), stride_h()}};
+    GLImageVector<T>* output = ImageAllocator<T>::newImage(
+        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
+
+    GLPool::descriptor geometry{input_channels,
+                                {kernel_w(), kernel_h()},
+                                {pad_l(), pad_t()},
+                                {stride_w(), stride_h()},
+                                {input_width, input_height},    // input_tile_size: per-tile {width, height}
+                                {output_width, output_height}}; // output_tile_size: {width, height}; .x feeds OUTPUT_TILE_WIDTH, so width must come first

    if (!glPool_) {
-      LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
-                << output_channels << ": " << output_height << " X " << output_width
-                << " Kernel: " << kernel_w() << "X" << kernel_h();
+      LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
+                << output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
+                << " Tiling: " << input_tile_x << "X" << input_tile_y;

-      glPool_.reset(new GLPool(geometry, poolType));
+      glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
    }

    glPool_->pool(input, *output);
--- a/caffe2/contrib/opengl/operators/GLResize.cc
+++ b/caffe2/contrib/opengl/operators/GLResize.cc
@ -11,89 +11,48 @@

 class GLResizeNearest : public GLFilter {
 public:
-  static constexpr int MaxBatchSize = 4;
+   binding* inputData; // binding the input slice texture is attached to in resize()
+   binding* outputSize; // ivec2 uniform; resize() sets it to the output texture extent
+   binding* scale_reverse; // vec2 uniform; resize() sets it to (width_scale_rev, height_scale_rev)

-  binding* inputData[MaxBatchSize];
-  binding* inputSize;
-  binding* outputSize;
-  binding* scale_reverse;
+   GLResizeNearest() // single-input filter: the batch-size parameter and its shader replacement were removed
+       : GLFilter("GLResizeNearest",
+                  vertex_shader,
+                  fragment_shader,
+                  std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
+                  {/* no uniform blocks*/},
+                  {/* no attributes */},
+                  {/* replacements */}) {}

-  const int batch_size;
+   template <typename T>
+   void resize(const GLImageVector<T>& input_images,
+               const GLImageVector<T>& output_images,
+               float width_scale_rev,
+               float height_scale_rev);

-  const std::vector<binding*> input_bindings(int batch_size) {
-    std::vector<binding*> bindings(
-        {BINDING(inputSize), BINDING(outputSize), BINDING(scale_reverse)});
-    for (int i = 0; i < batch_size; i++) {
-      bindings.push_back(inputData[i] = new binding{"inputData[" + caffe2::to_string(i) + "]"});
-    }
-    return bindings;
-  }
-
-  GLResizeNearest(int _batch_size = 1)
-      : GLFilter("GLResizeNearest",
-                 vertex_shader,
-                 fragment_shader,
-                 input_bindings(_batch_size),
-                 {/* no uniform blocks*/},
-                 {/* no attributes */},
-                 {{"BATCH_SIZE", caffe2::to_string(_batch_size)}}),
-        batch_size(_batch_size) {}
-
-  template <typename T>
-  void resize(const GLImageVector<T>& input_images,
-              const GLImageVector<T>& output_images,
-              float width_scale_rev,
-              float height_scale_rev);
-
-  static const char* fragment_shader;
+   static const char* fragment_shader; // GLSL source, defined out of line below the class
 };

 // MARK: GLSL

 const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es

-#define BATCH_SIZE    $(BATCH_SIZE)
-
 precision mediump float;
 precision mediump int;

 in highp vec2 v_texCoord;

-uniform ivec2 inputSize;
 uniform ivec2 outputSize;
 uniform highp vec2 scale_reverse;

-TEXTURE_INPUT(inputData[BATCH_SIZE]);
-
-TEXTURE_OUTPUT(0, outputData0);
-#if BATCH_SIZE > 1
-TEXTURE_OUTPUT(1, outputData1);
-#if BATCH_SIZE > 2
-TEXTURE_OUTPUT(2, outputData2);
-#if BATCH_SIZE > 3
-TEXTURE_OUTPUT(3, outputData3);
-#endif
-#endif
-#endif
+TEXTURE_INPUT(inputData);
+TEXTURE_OUTPUT(0, outputData);

 void main() {
  // it clamps to the edge by default
  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
-
-  vec4 v0 = TEXTURE_LOAD(inputData[0], texelCoord);
-  outputData0 = TEXTURE_STORE(v0);
-#if BATCH_SIZE > 1
-  vec4 v1 = TEXTURE_LOAD(inputData[1], texelCoord);
-  outputData1 = TEXTURE_STORE(v1);
-#if BATCH_SIZE > 2
-  vec4 v2 = TEXTURE_LOAD(inputData[2], texelCoord);
-  outputData2 = TEXTURE_STORE(v2);
-#if BATCH_SIZE > 3
-  vec4 v3 = TEXTURE_LOAD(inputData[3], texelCoord);
-  outputData3 = TEXTURE_STORE(v3);
-#endif
-#endif
-#endif
+  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
+  outputData = TEXTURE_STORE(value);
 }
 )GLSL";

@ -108,21 +67,17 @@ void GLResizeNearest::resize(const GLImageVector<T>& input_images,
    int input_slices = input_image->slices;
    int output_slices = output_image->slices;

-    for (int is = 0; is < input_slices; is += batch_size) {
-      std::vector<texture_attachment> input_attachments;
-      for (int ib = 0; ib < batch_size; ib++) {
-        input_attachments.push_back({input_image->textures[is + ib], inputData[ib]});
-      }
+    for (int is = 0; is < input_slices; is++) {
+      std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});

      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + batch_size},
+          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
          [&]() {
-            glUniform2i(inputSize->location, input_image->width, input_image->height);
-            glUniform2i(outputSize->location, output_image->width, output_image->height);
+            glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
            glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
          },
-          output_image->width,
-          output_image->height);
+          output_image->texture_width,
+          output_image->texture_height);
    }
  }
 }
@ -153,14 +108,15 @@ class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<
    const int output_height = input_height * height_scale_;
    const int output_channels = input_channels;

+    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
+    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
+
    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
+        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);

    if (!resizeNearest_) {
-      int batch_size = OperatorBase::GetSingleArgument<int>("batch_size", 1);
-      resizeNearest_.reset(new GLResizeNearest(batch_size));
-      LOG(INFO) << "batch_size = " << batch_size;
+      resizeNearest_.reset(new GLResizeNearest());
    }
    resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
    Outputs()[0]->Reset(output);
--- a/caffe2/contrib/opengl/operators/GLStylizer.cc
+++ b/caffe2/contrib/opengl/operators/GLStylizer.cc
@ -18,28 +18,23 @@ class GLStylizer : public GLFilter {
  bool deprocess;

 public:
-  GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
-      : GLFilter(
-            _deprocess ? "GLDeStylizer" : "GLStylizer",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(inputData),
-                                   BINDING(mean),
-                                   BINDING(noise_std),
-                                   BINDING(outputSize)}),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"DEPROCESS", caffe2::to_string(_deprocess)},
-             {"RGBAINPUT", caffe2::to_string(input_format)}}),
-        deprocess(_deprocess) {}
+   GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
+       : GLFilter(_deprocess ? "GLDeStylizer" : "GLStylizer",
+                  vertex_shader,
+                  fragment_shader,
+                  std::vector<binding*>({BINDING(inputData), BINDING(mean), BINDING(noise_std), BINDING(outputSize)}),
+                  {/* no uniform blocks */},
+                  {/* no attributes */},
+                  {{"DEPROCESS", caffe2::to_string(_deprocess)}, {"RGBAINPUT", caffe2::to_string(input_format)}}),
+         deprocess(_deprocess) {}

-  template <typename T1, typename T2>
-  void stylize(const GLImage<T1>* input_image,
-               const GLImage<T2>* output_image,
-               const float mean_values[3],
-               float noise_std_value);
+   template <typename T1, typename T2>
+   void stylize(const GLImage<T1>* input_image,
+                const GLImage<T2>* output_image,
+                const float mean_values[3],
+                float noise_std_value);

-  static const char* fragment_shader;
+   static const char* fragment_shader;
 };

 // MARK: GLSL
@ -116,8 +111,7 @@ void GLStylizer::stylize(const GLImage<T1>* input_image,
  run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
      {output_image->textures[0]},
      [&]() {
-        glUniform2i(
-            outputSize->location, output_image->width, output_image->height);
+        glUniform2i(outputSize->location, output_image->width, output_image->height);
        glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
        if (!deprocess) {
          glUniform1f(noise_std->location, noise_std_value);
--- a/caffe2/contrib/opengl/operators/GLSub.cc
+++ b/caffe2/contrib/opengl/operators/GLSub.cc
@ -4,10 +4,10 @@
 #include "../core/GLImage.h"
 #include "../core/ImageAllocator.h"

-#include <iostream>
-#include <vector>
 #include "caffe2/core/operator.h"
 #include "caffe2/core/timer.h"
+#include <iostream>
+#include <vector>

 class GLSub : public GLFilter {
 public:
@ -15,22 +15,18 @@ class GLSub : public GLFilter {
  binding* outputSize;

  GLSub()
-      : GLFilter(
-            "GLSub",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(outputSize),
-                                   BINDING(inputData[0]),
-                                   BINDING(inputData[1])}),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {/* no replacements */}) {}
+      : GLFilter("GLSub",
+                 vertex_shader,
+                 fragment_shader,
+                 std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
+                 {/* no uniform blocks */},
+                 {/* no attributes */},
+                 {/* no replacements */}) {}

  template <typename T>
-  void sub(
-      const GLImageVector<T>& input_image0,
-      const GLImageVector<T>& input_image1,
-      const GLImageVector<T>& output_image);
+  void sub(const GLImageVector<T>& input_image0,
+           const GLImageVector<T>& input_image1,
+           const GLImageVector<T>& output_image);

  static const char* fragment_shader;
 };
@ -59,10 +55,9 @@ void main() {
 )GLSL";

 template <typename T>
-void GLSub::sub(
-    const GLImageVector<T>& input_images0,
-    const GLImageVector<T>& input_images1,
-    const GLImageVector<T>& output_images) {
+void GLSub::sub(const GLImageVector<T>& input_images0,
+                const GLImageVector<T>& input_images1,
+                const GLImageVector<T>& output_images) {
  const int num_images = input_images0.size();
  for (int i = 0; i < num_images; i++) {
    GLImage<T>* input_image0 = input_images0[i];
@ -77,14 +72,8 @@ void GLSub::sub(
      input_attachments.push_back({input_image1->textures[is], inputData[1]});

      run(input_attachments,
-          {output_image->textures.begin() + is,
-           output_image->textures.begin() + is + 1},
-          [&]() {
-            glUniform2i(
-                outputSize->location,
-                output_image->width,
-                output_image->height);
-          },
+          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
+          [&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
          output_image->width,
          output_image->height);
    }
@ -97,20 +86,14 @@ class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
 public:
  OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(
-        OperatorBase::HasArgument("broadcast") == false,
-        "OpenGLSub does not support broadcast");
+    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");

-    OPERATOR_NEEDS_FEATURE(
-        OperatorBase::HasArgument("axis") == false,
-        "OpenGLSub does not support axis");
+    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
  }

  bool RunOnDevice() override {
-    const GLImageVector<T>& input0 =
-        Inputs()[0]->template Get<GLImageVector<T>>();
-    const GLImageVector<T>& input1 =
-        Inputs()[1]->template Get<GLImageVector<T>>();
+    const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
+    const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();

    CAFFE_ENFORCE_EQ(input0.size(), input1.size());

--- a/caffe2/contrib/opengl/test/TestGLConvolution.cc
+++ b/caffe2/contrib/opengl/test/TestGLConvolution.cc
@ -57,8 +57,7 @@ double BenchOp(const std::string& typ,
  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument(
-      "convolution_transform_strategy", std::string("PRECOMPUTE")));
+  def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));

  AddNoiseInput(std::vector<caffe2::TIndex>{1, inputC, inH, inW}, "X", ws);
  if (transposed) {
@ -281,11 +280,11 @@ void TestGLConvolution() {
  // std::vector<int> sizes({208, 312, 416, 720, 1080});
  // std::vector<int> channels({16, 4});
  //
-  std::vector<int> sizes({14, 26, 52, 104});
+  std::vector<int> sizes({14, 26, 52, 104, 208});
  // std::vector<int> channels({24, 16, 4});

  //  std::vector<int> sizes({14});
-  std::vector<int> channels({64, 128, 256, 512});
+  std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});

  std::vector<int> kernels({3});

@ -321,19 +320,20 @@ void TestGLConvolution() {
          const double flops = double(input_channel) * output_channel * kernel * kernel *
                               (kernel == 1 ? space : space - 2) *
                               (kernel == 1 ? space : space - 2) * 2;
-          gl_log(GL_LOG,
-                 "Conv: X: %ix%i  \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
-                 "%.2f\tratio: "
-                 "%.2f\n",
-                 space,
-                 space,
-                 input_channel,
-                 output_channel,
-                 kernel,
-                 kernel,
-                 flops / gpuIterTime / 1E6,
-                 flops / cpuIterTime / 1E6,
-                 cpuIterTime / gpuIterTime);
+          // gl_log(GL_LOG,
+          printf(
+              "Conv: X: %ix%i  \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
+              "%.2f\tratio: "
+              "%.2f\n",
+              space,
+              space,
+              input_channel,
+              output_channel,
+              kernel,
+              kernel,
+              flops / gpuIterTime / 1E6,
+              flops / cpuIterTime / 1E6,
+              cpuIterTime / gpuIterTime);
        }
      }
    }
--- a/caffe2/contrib/opengl/test/opengl_test.cc
+++ b/caffe2/contrib/opengl/test/opengl_test.cc
@ -282,7 +282,7 @@ void testOpenGLConv(int N,
    } else {
      float* data = t->mutable_data<float>();
      for (int i = 0; i < t->size(); i++) {
-        data[i] = -1;
+        data[i] = 1;
      }
    }
 #if 0
@ -658,7 +658,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
 }

-void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
+void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile_x = 1, int input_tile_y = 1) {
  LOG(INFO) << "OpenGL Add Test "
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
@ -682,6 +682,16 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
    op.set_type("CopyToOpenGL");
    op.add_input("X_cpu0");
    op.add_output("X_gl0");
+    {
+      auto& arg = *(op.add_arg());
+      arg.set_name("tile_x");
+      arg.set_i(input_tile_x);
+    }
+    {
+      auto& arg = *(op.add_arg());
+      arg.set_name("tile_y");
+      arg.set_i(input_tile_y);
+    }
  }

  {
@ -689,6 +699,16 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
    op.set_type("CopyToOpenGL");
    op.add_input("X_cpu1");
    op.add_output("X_gl1");
+    {
+      auto& arg = *(op.add_arg());
+      arg.set_name("tile_x");
+      arg.set_i(input_tile_x);
+    }
+    {
+      auto& arg = *(op.add_arg());
+      arg.set_name("tile_y");
+      arg.set_i(input_tile_y);
+    }
  }

  {
@ -733,15 +753,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
    t0->Resize(N, C, H, W);
    CPUContext ctx0;
    // Too noisy.
-    math::RandGaussian<float, CPUContext>(
-        t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
+    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);

    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
    t1->Resize(N, C, H, W);
    CPUContext ctx1;
    // Too noisy.
-    math::RandGaussian<float, CPUContext>(
-        t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
+    math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
  }

  NetDef netdef;
@ -916,8 +934,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
    t->Resize(N, C, H, W);
    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(
-        t->size(), 0, 2, t->mutable_data<float>(), &ctx);
+    math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
  }

  NetDef netdef;
@ -1535,8 +1552,15 @@ void testOpenGLPadImage(
  }
 }

-void testOpenGLResize(
-    int N, int C, int H, int W, int width_scale, int height_scale, int batch_size, float error) {
+void testOpenGLResize(int N,
+                      int C,
+                      int H,
+                      int W,
+                      int width_scale,
+                      int height_scale,
+                      float error,
+                      int input_tile_x = 1,
+                      int input_tile_y = 1) {
  LOG(INFO) << "OpenGLResize Test";
  {
    Workspace ws;
@ -1553,6 +1577,16 @@ void testOpenGLResize(
      op.set_type("CopyToOpenGL");
      op.add_input("X_cpu");
      op.add_output("X_gl");
+      {
+        auto& arg = *(op.add_arg());
+        arg.set_name("tile_x");
+        arg.set_i(input_tile_x);
+      }
+      {
+        auto& arg = *(op.add_arg());
+        arg.set_name("tile_y");
+        arg.set_i(input_tile_y);
+      }
    }

    {
@ -1569,11 +1603,6 @@ void testOpenGLResize(
        arg.set_name("height_scale");
        arg.set_f(height_scale);
      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("batch_size");
-        arg.set_i(batch_size);
-      }
      {
        auto& arg = *(op.add_arg());
        arg.set_name("is_last");
@ -2125,7 +2154,9 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
                       std::string input_order,
                       std::string engine, // "CPU", "OPENGL", or "MPSCNN"
                       bool run_individual,
-                       bool use_texture_input) {
+                       bool use_texture_input,
+                       bool use_tiling,
+                       bool run_fusion) {
  std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());

  // caffe2::dumpDefForOpenGL(init_net);
@ -2138,7 +2169,7 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
  if (engine == "CPU") {
    net_def.CopyFrom(predict_net);
  } else if (engine == "OPENGL") {
-    if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input)) {
+    if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input, use_tiling, run_fusion)) {
      CAFFE_THROW("Failed to convert to openGL. Benchmark failed to run");
      return -1;
    }
@ -2197,19 +2228,18 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
    }
    if (input_type == "float") {
      ImageAllocator<float16_t> allocator;
-      GLImageVector<float16_t>* output_image = allocator.newImage(
-          1,
-          width,
-          height,
-          channel,
-          tile_x,
-          tile_y,
+      GLImageVector<float16_t>* output_image = allocator.newImage(1,
+                                                                  width,
+                                                                  height,
+                                                                  channel,
+                                                                  tile_x,
+                                                                  tile_y,
 #if CAFFE2_IOS
-          true
+                                                                  true
 #else
-          false
+                                                                  false
 #endif
-      );
+                                                                  );
      blob->Reset(output_image);
      for (auto& texture : (*output_image)[0]->textures) {
        texture->map_load([&](void* buffer,
@ -2221,19 +2251,18 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
      }
    } else {
      ImageAllocator<uint8_t> allocator;
-      GLImageVector<uint8_t>* output_image = allocator.newImage(
-          1,
-          width,
-          height,
-          channel,
-          tile_x,
-          tile_y,
+      GLImageVector<uint8_t>* output_image = allocator.newImage(1,
+                                                                width,
+                                                                height,
+                                                                channel,
+                                                                tile_x,
+                                                                tile_y,
 #if CAFFE2_IOS
-          true
+                                                                true
 #else
-          false
+                                                                false
 #endif
-      );
+                                                                );
      blob->Reset(output_image);
      for (auto& texture : (*output_image)[0]->textures) {
        texture->map_load([&](void* buffer,
@ -2288,7 +2317,8 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
        }
        glFinish();

-        LOG(INFO) << net_def.op(k).type() << ": " << (double)timer.MilliSeconds() / main_runs;
+        LOG(INFO) << "Operator #" << k << " " << net_def.op(k).type() << ": "
+                  << (double)timer.MilliSeconds() / main_runs;
      }
    }
  }
@ -2501,6 +2531,10 @@ void testOpenGL() {

      testOpenGLPRelu(1, channel, 13, 4, channel, tile_x, tile_y, 0.1);
      testOpenGLRelu(1, channel, 4, 17, tile_x, tile_y, 0.1);
+      testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, MaxPool, 0.01, true, 1, 1, tile_x, tile_y, true);
+      testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, AveragePool, 0.01, true, 1, 1, tile_x, tile_y, true);
+      testOpenGLAdd(1, channel, 14, 8, 0.1, tile_x, tile_y);
+      testOpenGLResize(1, channel, 16, 16, 2, 2, 0.1, tile_x, tile_y);
      // clang-format on
    }
  }
@ -2760,14 +2794,14 @@ void testOpenGL() {
    testOpenGLInstanceNormPRelu(1, 6, 640, 360, 0.2);

    LOG(INFO) << "Test OpenGL ResizeNearest";
-    testOpenGLResize(1, 4, 16, 16, 1, 1, 1, 0.1);
-    testOpenGLResize(1, 4, 16, 16, 2, 2, 1, 0.1);
-    testOpenGLResize(1, 4, 16, 16, 3, 3, 1, 0.1);
-    testOpenGLResize(1, 4, 16, 16, 4, 4, 1, 0.1);
-    testOpenGLResize(1, 16, 25, 25, 3, 3, 2, 0.1);
-    testOpenGLResize(1, 16, 25, 25, 3, 3, 4, 0.1);
-    testOpenGLResize(1, 12, 25, 25, 3, 3, 3, 0.1);
-    testOpenGLResize(1, 4, 720, 1280, 3, 3, 1, 0.1);
+    testOpenGLResize(1, 4, 16, 16, 1, 1, 0.1);
+    testOpenGLResize(1, 4, 16, 16, 2, 2, 0.1);
+    testOpenGLResize(1, 4, 16, 16, 3, 3, 0.1);
+    testOpenGLResize(1, 4, 16, 16, 4, 4, 0.1);
+    testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
+    testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
+    testOpenGLResize(1, 12, 25, 25, 3, 3, 0.1);
+    testOpenGLResize(1, 4, 720, 1280, 3, 3, 0.1);

    // debug style transfer
    // conv
@ -2848,8 +2882,8 @@ void testOpenGL() {
    testOpenGLInstanceNormPRelu(3, 4, 16, 16, 0.2);
    testOpenGLInstanceNormPRelu(15, 4, 16, 16, 0.2);

-    testOpenGLResize(3, 4, 16, 16, 1, 1, 1, 0.1);
-    testOpenGLResize(16, 4, 16, 16, 1, 1, 1, 0.1);
+    testOpenGLResize(3, 4, 16, 16, 1, 1, 0.1);
+    testOpenGLResize(16, 4, 16, 16, 1, 1, 0.1);

    testOpenGLPadImage(3, 3, 4, 4, 0, 1, 0, 1, 0.01);
    testOpenGLPadImage(23, 3, 4, 4, 0, 1, 0, 1, 0.01);
--- a/caffe2/contrib/opengl/test/opengl_test.h
+++ b/caffe2/contrib/opengl/test/opengl_test.h
@ -32,6 +32,8 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
                       std::string input_type,
                       std::string input_order,
                       std::string engine,
-                       bool run_individual = false,
-                       bool use_texture_input = false);
+                       bool run_individual    = false,
+                       bool use_texture_input = false,
+                       bool use_tiling        = false,
+                       bool run_fusion        = true);
 } // namespace caffe2