Make Android segmentation network run on both iOS and Android with tiling

Summary: Add tiling support to GLAdd, GLPool, and GLResizeNearest

Differential Revision: D5733208

fbshipit-source-id: b73113326b96d421787d4695ccf7d2d919ee2ed8
This commit is contained in:
Hao Lu 2017-09-04 17:18:53 -07:00 committed by Facebook Github Bot
parent 2d9728d594
commit dd5400e452
19 changed files with 714 additions and 868 deletions

View File

@ -26,9 +26,7 @@ class GLContext {
static bool GL_EXT_texture_border_clamp_defined();
inline bool halfFloatTextureSupported() {
return half_float_supported;
}
inline bool halfFloatTextureSupported() { return half_float_supported; }
void setTextureAllocator(
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {

View File

@ -76,13 +76,9 @@ std::string GLFilter::process_replacements(std::string shader,
// Add some #defines for convenience
std::string version_tag = "#version 300 es";
if (GLContext::getGLContext()->halfFloatTextureSupported()) {
shader.insert(
shader.find(version_tag) + version_tag.size(),
half_float_texture_utils);
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
} else {
shader.insert(
shader.find(version_tag) + version_tag.size(),
half_float_compat_texture_utils);
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
}
shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
return shader;

View File

@ -18,6 +18,8 @@ class GLImage {
const int tile_x;
const int tile_y;
const int texture_width;
const int texture_height;
const int slices;
const std::vector<const GLTexture*> textures;
@ -50,6 +52,8 @@ class GLImage {
data_size(sizeof(T)),
tile_x(_tile_x),
tile_y(_tile_y),
texture_width(_width * _tile_x),
texture_height(_height * _tile_y),
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
textures(allocate_textures(slices, texture_loader)) {
CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
@ -68,13 +72,23 @@ class GLImage {
data_size(sizeof(T)),
tile_x(_tile_x),
tile_y(_tile_y),
texture_width(_width * _tile_x),
texture_height(_height * _tile_y),
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
textures(allocate_textures(slices, texture_loader)) {
CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
}
GLImage()
: width(0), height(0), channels(0), data_size(sizeof(T)), tile_x(0), tile_y(0), slices(0){};
: width(0),
height(0),
channels(0),
data_size(sizeof(T)),
tile_x(0),
tile_y(0),
texture_width(0),
texture_height(0),
slices(0){};
virtual ~GLImage() {
gl_log(GL_VERBOSE, "deleting GLImage\n");

View File

@ -6,37 +6,18 @@
#include "caffe2/core/logging.h"
#include "caffe2/core/timer.h"
#define half_float_supported \
(GLContext::getGLContext()->halfFloatTextureSupported())
#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported())
#define FIXED_TYPE(_t) \
(((_t).type != GL_HALF_FLOAT || half_float_supported) \
? (_t) \
: GLTexture::FP16_COMPAT)
#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT)
GLPlainTexture::GLPlainTexture(
const Type& type,
const void* input,
GLsizei width,
GLsizei height,
bool use_padding,
GLint filter,
GLint wrap)
const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
// caffe2::Timer timer;
// timer.Start();
glGenTextures(1, &_textureId);
glBindTexture(GL_TEXTURE_2D, _textureId);
glTexImage2D(
GL_TEXTURE_2D,
0,
_type.internalFormat,
_stride,
_height,
0,
_type.format,
_type.type,
input);
glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);
gl_log(
GL_VERBOSE,
@ -64,13 +45,7 @@ GLPlainTexture::GLPlainTexture(
}
GLPlainTexture::GLPlainTexture(
const Type& type,
const GLuint textureID,
GLsizei width,
GLsizei height,
bool use_padding,
GLint filter,
GLint wrap)
const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
_textureId = textureID;
isOwner = false;

View File

@ -31,9 +31,7 @@ void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz
const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI,
GL_RG_INTEGER,
GL_UNSIGNED_INT};
const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};
void GLTexture::map_read(std::function<void(const void* buffer,
size_t width,

View File

@ -13,14 +13,14 @@ class GLTexture {
int dataSize() const {
switch (type) {
case GL_UNSIGNED_INT:
return 4;
case GL_HALF_FLOAT:
return 2;
case GL_UNSIGNED_BYTE:
return 1;
default:
throw std::runtime_error("Unknown Texture Type");
case GL_UNSIGNED_INT:
return 4;
case GL_HALF_FLOAT:
return 2;
case GL_UNSIGNED_BYTE:
return 1;
default:
throw std::runtime_error("Unknown Texture Type");
}
}

View File

@ -251,7 +251,7 @@ void dumpDefForOpenGL(const NetDef& d) {
// }
//}
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling) {
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
NetDef net;
net.CopyFrom(predictNet);
@ -303,7 +303,9 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
CAFFE_THROW("OpenGL operator missing");
}
net = runOpenGLFusion(net, openGLOps);
if (runFusion) {
net = runOpenGLFusion(net, openGLOps);
}
if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
// For end-to-end testing
@ -320,7 +322,9 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
copy_op->add_output(net.external_output(0));
}
} else {
needCopyOps = true;
if (!useTextureInput) {
needCopyOps = true;
}
}
// copy ops are needed when the input is not a texture
@ -335,10 +339,12 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
bool tryConvertToOpenGL(const NetDef& initNet,
const NetDef& predictNet,
NetDef* glPredictNet,
bool useTextureInput) {
bool useTextureInput,
bool useTiling,
bool runFusion) {
try {
// Throws if unsupported operators are found.
*glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput);
*glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
dumpDefForOpenGL(*glPredictNet);
// Throws if unsupported parameters are found.
Workspace ws;

View File

@ -8,11 +8,14 @@ namespace caffe2 {
bool tryConvertToOpenGL(const NetDef& initNet,
const NetDef& predictNet,
NetDef* glPredictNet,
bool useTextureInput = false);
bool useTextureInput = false,
bool useTiling = false,
bool runFusion = true);
// Exposed for testing
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
bool useTextureInput = false,
bool useTiling = false);
bool useTiling = false,
bool runFusion = true);
void dumpDefForOpenGL(const NetDef& net);
} // namespace caffe2

View File

@ -75,9 +75,9 @@ void GLAdd::add(const GLImageVector<T>& input_images0,
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
output_image->width,
output_image->height);
[&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
output_image->texture_width,
output_image->texture_height);
}
}
}
@ -91,8 +91,7 @@ class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
"OpenGLAdd does not support broadcast");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
"OpenGLMul does not support axis");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
}
bool RunOnDevice() override {
@ -105,18 +104,25 @@ class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
const int input_channels = input0.channels();
const int input_width = input0.width();
const int input_height = input0.height();
const int input_tile_x = input0.tile_x();
const int input_tile_y = input0.tile_y();
CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
CAFFE_ENFORCE_EQ(input1.width(), input_width);
CAFFE_ENFORCE_EQ(input1.height(), input_height);
CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);
const int output_channels = input_channels;
const int output_width = input_width;
const int output_height = input_height;
const int output_tile_x = input_tile_x;
const int output_tile_y = input_tile_y;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
if (!_add) {
_add.reset(new GLAdd());

File diff suppressed because it is too large Load Diff

View File

@ -32,19 +32,18 @@ class GLPRelu : public GLFilter {
int _output_tile_y,
int _output_tile_width,
int _output_tile_height)
: GLFilter(
"GLPRelu",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData)}),
std::vector<binding*>({BINDING(scale_block)}),
{/* no attributes */},
{{"USE_RELU", caffe2::to_string(PRelu)},
{"OUTPUT_TILES", caffe2::to_string(_output_tile_x * _output_tile_y)},
{"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
{"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
{"TILED_CONVOLUTION", caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
: GLFilter("GLPRelu",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData)}),
std::vector<binding*>({BINDING(scale_block)}),
{/* no attributes */},
{{"USE_RELU", caffe2::to_string(PRelu)},
{"OUTPUT_TILES", caffe2::to_string(_output_tile_x * _output_tile_y)},
{"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
{"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
{"TILED_PRELU", caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
scale(_scale),
scale_size(_scale_size),
channels(_channels),
@ -67,7 +66,7 @@ class GLPRelu : public GLFilter {
{"OUTPUT_TILE_X", caffe2::to_string(1)},
{"OUTPUT_TILE_WIDTH", caffe2::to_string(1)},
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(1)},
{"TILED_CONVOLUTION", caffe2::to_string(0)}}),
{"TILED_PRELU", caffe2::to_string(0)}}),
scale(nullptr),
scale_block(nullptr),
scale_size(0),
@ -88,75 +87,72 @@ class GLPRelu : public GLFilter {
// MARK: GLSL
const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
#define TILED_PRELU $(TILED_PRELU)
#define USE_RELU $(USE_RELU)
// tiling
#define OUTPUT_TILES $(OUTPUT_TILES)
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
#define TILED_CONVOLUTION $(TILED_CONVOLUTION)
// common
precision mediump float;
precision highp int;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
in highp vec2 v_texCoord;
#if !USE_RELU
#if TILED_CONVOLUTION == 1
layout (std140) uniform scale_block {
highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
};
#if USE_RELU
#else
layout (std140) uniform scale_block {
highp uvec4 scale;
};
#endif
#endif
#if !USE_RELU
#if TILED_CONVOLUTION
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
// output.data = value > 0 ? value : value * weight;
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
outputData = TEXTURE_STORE(value);
}
#else
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
outputData = TEXTURE_STORE(value);
}
#endif // TILED_CONVOLUTION
#else // Relu
// Relu
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(max(value, vec4(0.0)));
}
#endif
#else
#if TILED_PRELU
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
layout (std140) uniform scale_block {
highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
};
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
// outputData = value > 0 ? value : value * weight;
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
outputData = TEXTURE_STORE(value);
}
#else
layout (std140) uniform scale_block {
highp uvec4 scale;
};
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
// outputData = value > 0 ? value : value * weight;
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
outputData = TEXTURE_STORE(value);
}
#endif // TILED_PRELU
#endif // USE_RELU
)GLSL";
@ -190,8 +186,8 @@ void GLPRelu::prelu(const GLImageVector<T>& input_images,
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {},
output_image->width * output_image->tile_x,
output_image->height * output_image->tile_y);
output_image->texture_width,
output_image->texture_height);
}
}
}

View File

@ -139,8 +139,8 @@ class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator
padImage_.reset(new GLPadImage());
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
<< output_channels << ": " << output_height << " X " << output_width;
LOG(INFO) << "Padmode: " << mode_ << "pad_l = " << pad_l() << ", pad_r = " << pad_r()
<< ", pad_t = " << pad_t() << ", pad_b = " << pad_b();
LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
<< ", pad_b = " << pad_b();
}
padImage_->pad(input, *output, pad_l(), pad_t());

View File

@ -21,6 +21,8 @@ class GLPool : public GLFilter {
point kernel_size;
point input_padding;
point input_stride;
point input_tile_size;
point output_tile_size;
};
binding* inputData;
@ -29,25 +31,29 @@ class GLPool : public GLFilter {
const descriptor geometry;
GLPool(const descriptor& _geometry, PoolType poolType)
: GLFilter(
"GLPool",
vertex_shader,
fragment_shader,
{
BINDING(inputData), BINDING(kernelSize), BINDING(outputSize),
},
{/* no uniform blocks */},
{/* no attributes */},
{{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
{"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
{"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
{"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
{"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
{"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
{"TEXTURE_BORDER_CLAMP",
caffe2::to_string(GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined())},
{"MAX_POOL", caffe2::to_string(poolType == MaxPool)}}),
GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
: GLFilter("GLPool",
vertex_shader,
fragment_shader,
{
BINDING(inputData), BINDING(kernelSize), BINDING(outputSize),
},
{/* no uniform blocks */},
{/* no attributes */},
{{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
{"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
{"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
{"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
{"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
{"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
{"INPUT_TILE_WIDTH", caffe2::to_string(_geometry.input_tile_size.x)},
{"INPUT_TILE_HEIGHT", caffe2::to_string(_geometry.input_tile_size.y)},
{"OUTPUT_TILE_WIDTH", caffe2::to_string(_geometry.output_tile_size.x)},
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(_geometry.output_tile_size.y)},
{"TILED_POOLING", caffe2::to_string(_tiling)},
{"TEXTURE_BORDER_CLAMP",
caffe2::to_string(GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined())},
{"MAX_POOL", caffe2::to_string(poolType == MaxPool)}}),
geometry(_geometry) {}
~GLPool() {}
@ -63,11 +69,11 @@ class GLPool : public GLFilter {
run({{input_image->textures[is], inputData}},
{output_image->textures[is]},
[&]() {
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
},
output_image->width,
output_image->height);
output_image->texture_width,
output_image->texture_height);
}
}
}
@ -78,10 +84,16 @@ class GLPool : public GLFilter {
// MARK: GLSL
const char* GLPool::fragment_shader = R"GLSL(#version 300 es
#define TILED_POOLING $(TILED_POOLING)
#define TEXTURE_BORDER_CLAMP $(TEXTURE_BORDER_CLAMP)
#define MAX_POOL $(MAX_POOL)
// tiling
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
precision mediump float;
precision mediump int;
@ -90,24 +102,84 @@ in highp vec2 v_texCoord;
const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
const int channels = 4;
uniform ivec2 kernelSize;
uniform ivec2 outputSize;
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
const bool no_bounds = bool(TEXTURE_BORDER_CLAMP) || all(equal(input_padding, ivec2(0)));
const bool no_bounds = (TILED_POOLING == 0) && (bool(TEXTURE_BORDER_CLAMP) || all(equal(input_padding, ivec2(0))));
#define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
const float MIN_FLOAT = -exp2(14.0);
#if TILED_POOLING
const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
// tiled pooling
#if MAX_POOL
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
#define POOL { \
pool = vec4(MIN_FLOAT); \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = tileCoord + ivec2(x, y); \
if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputTileSize)) { \
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
pool = max(pool, data); \
} \
} \
} \
}
const float MIN_FLOAT = -exp2(14.0);
#else
#define POOL { \
int count = 0; \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = tileCoord + ivec2(x, y); \
if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputTileSize)) { \
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
pool += data;\
count += 1; \
} \
} \
} \
pool = pool / float(count); \
}
#endif // MAX_POOL
void main() {
ivec2 inputSize = textureSize(inputData, 0);
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
tileCoord = input_stride * tileCoord - input_padding;
ivec2 inputTileOffset = tile * inputTileSize;
#if MAX_POOL
vec4 pool = vec4(0);
#else
highp vec4 pool = vec4(0);
#endif
POOL;
outputData = TEXTURE_STORE(pool);
}
#else
// no tiling
#if MAX_POOL
#define POOL { \
pool = vec4(MIN_FLOAT); \
@ -125,22 +197,21 @@ const float MIN_FLOAT = -exp2(14.0);
#else
#define POOL { \
int count = 0; \
for (int y = 0; y < kernelSize.y; y++) { \
for (int x = 0; x < kernelSize.x; x++) { \
ivec2 idx = texelCoord + ivec2(x, y); \
if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputSize)) { \
vec4 data = TEXTURE_LOAD(inputData, idx); \
pool += data;\
pool += data; \
count += 1; \
} \
} \
} \
ivec2 start = texelCoord; \
ivec2 end = min(start + kernel_size, inputSize); \
start = max(ivec2(0), start); \
pool = pool / float((end.x - start.x) * (end.y - start.y)); \
pool = pool / float(count); \
}
#endif
#endif // MAX_POOL
void main() {
ivec2 inputSize = textureSize(inputData, 0);
@ -155,6 +226,8 @@ void main() {
outputData = TEXTURE_STORE(pool);
}
#endif // TILED_POOLING
)GLSL";
namespace caffe2 {
@ -199,18 +272,25 @@ class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
GLPool::descriptor geometry{
input_channels, {kernel_w(), kernel_h()}, {pad_l(), pad_t()}, {stride_w(), stride_h()}};
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
// Pooling geometry handed to GLPool. Every point is written as {x, y}, i.e.
// {horizontal, vertical} / {width, height}. The last two entries are the
// per-tile input and output sizes; GLPool substitutes their .x into the
// *_TILE_WIDTH and their .y into the *_TILE_HEIGHT shader defines, so the
// output tile must be {output_width, output_height} (it was previously
// transposed, which breaks tiled pooling for non-square outputs).
GLPool::descriptor geometry{input_channels,
                            {kernel_w(), kernel_h()},
                            {pad_l(), pad_t()},
                            {stride_w(), stride_h()},
                            {input_width, input_height},
                            {output_width, output_height}};
if (!glPool_) {
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
<< output_channels << ": " << output_height << " X " << output_width
<< " Kernel: " << kernel_w() << "X" << kernel_h();
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
<< output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
<< " Tiling: " << input_tile_x << "X" << input_tile_y;
glPool_.reset(new GLPool(geometry, poolType));
glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
}
glPool_->pool(input, *output);

View File

@ -11,89 +11,48 @@
class GLResizeNearest : public GLFilter {
public:
static constexpr int MaxBatchSize = 4;
binding* inputData;
binding* outputSize;
binding* scale_reverse;
binding* inputData[MaxBatchSize];
binding* inputSize;
binding* outputSize;
binding* scale_reverse;
GLResizeNearest()
: GLFilter("GLResizeNearest",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
{/* no uniform blocks*/},
{/* no attributes */},
{/* replacements */}) {}
const int batch_size;
template <typename T>
void resize(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
float width_scale_rev,
float height_scale_rev);
const std::vector<binding*> input_bindings(int batch_size) {
std::vector<binding*> bindings(
{BINDING(inputSize), BINDING(outputSize), BINDING(scale_reverse)});
for (int i = 0; i < batch_size; i++) {
bindings.push_back(inputData[i] = new binding{"inputData[" + caffe2::to_string(i) + "]"});
}
return bindings;
}
GLResizeNearest(int _batch_size = 1)
: GLFilter("GLResizeNearest",
vertex_shader,
fragment_shader,
input_bindings(_batch_size),
{/* no uniform blocks*/},
{/* no attributes */},
{{"BATCH_SIZE", caffe2::to_string(_batch_size)}}),
batch_size(_batch_size) {}
template <typename T>
void resize(const GLImageVector<T>& input_images,
const GLImageVector<T>& output_images,
float width_scale_rev,
float height_scale_rev);
static const char* fragment_shader;
static const char* fragment_shader;
};
// MARK: GLSL
const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es
#define BATCH_SIZE $(BATCH_SIZE)
precision mediump float;
precision mediump int;
in highp vec2 v_texCoord;
uniform ivec2 inputSize;
uniform ivec2 outputSize;
uniform highp vec2 scale_reverse;
TEXTURE_INPUT(inputData[BATCH_SIZE]);
TEXTURE_OUTPUT(0, outputData0);
#if BATCH_SIZE > 1
TEXTURE_OUTPUT(1, outputData1);
#if BATCH_SIZE > 2
TEXTURE_OUTPUT(2, outputData2);
#if BATCH_SIZE > 3
TEXTURE_OUTPUT(3, outputData3);
#endif
#endif
#endif
TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);
void main() {
// it clamps to the edge by default
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
vec4 v0 = TEXTURE_LOAD(inputData[0], texelCoord);
outputData0 = TEXTURE_STORE(v0);
#if BATCH_SIZE > 1
vec4 v1 = TEXTURE_LOAD(inputData[1], texelCoord);
outputData1 = TEXTURE_STORE(v1);
#if BATCH_SIZE > 2
vec4 v2 = TEXTURE_LOAD(inputData[2], texelCoord);
outputData2 = TEXTURE_STORE(v2);
#if BATCH_SIZE > 3
vec4 v3 = TEXTURE_LOAD(inputData[3], texelCoord);
outputData3 = TEXTURE_STORE(v3);
#endif
#endif
#endif
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
outputData = TEXTURE_STORE(value);
}
)GLSL";
@ -108,21 +67,17 @@ void GLResizeNearest::resize(const GLImageVector<T>& input_images,
int input_slices = input_image->slices;
int output_slices = output_image->slices;
for (int is = 0; is < input_slices; is += batch_size) {
std::vector<texture_attachment> input_attachments;
for (int ib = 0; ib < batch_size; ib++) {
input_attachments.push_back({input_image->textures[is + ib], inputData[ib]});
}
for (int is = 0; is < input_slices; is++) {
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
run(input_attachments,
{output_image->textures.begin() + is, output_image->textures.begin() + is + batch_size},
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(inputSize->location, input_image->width, input_image->height);
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
},
output_image->width,
output_image->height);
output_image->texture_width,
output_image->texture_height);
}
}
}
@ -153,14 +108,15 @@ class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<
const int output_height = input_height * height_scale_;
const int output_channels = input_channels;
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
GLImageVector<T>* output = ImageAllocator<T>::newImage(
num_images, output_width, output_height, output_channels, is_last);
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
if (!resizeNearest_) {
int batch_size = OperatorBase::GetSingleArgument<int>("batch_size", 1);
resizeNearest_.reset(new GLResizeNearest(batch_size));
LOG(INFO) << "batch_size = " << batch_size;
resizeNearest_.reset(new GLResizeNearest());
}
resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
Outputs()[0]->Reset(output);

View File

@ -18,28 +18,23 @@ class GLStylizer : public GLFilter {
bool deprocess;
public:
GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
: GLFilter(
_deprocess ? "GLDeStylizer" : "GLStylizer",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData),
BINDING(mean),
BINDING(noise_std),
BINDING(outputSize)}),
{/* no uniform blocks */},
{/* no attributes */},
{{"DEPROCESS", caffe2::to_string(_deprocess)},
{"RGBAINPUT", caffe2::to_string(input_format)}}),
deprocess(_deprocess) {}
GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
: GLFilter(_deprocess ? "GLDeStylizer" : "GLStylizer",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(inputData), BINDING(mean), BINDING(noise_std), BINDING(outputSize)}),
{/* no uniform blocks */},
{/* no attributes */},
{{"DEPROCESS", caffe2::to_string(_deprocess)}, {"RGBAINPUT", caffe2::to_string(input_format)}}),
deprocess(_deprocess) {}
template <typename T1, typename T2>
void stylize(const GLImage<T1>* input_image,
const GLImage<T2>* output_image,
const float mean_values[3],
float noise_std_value);
template <typename T1, typename T2>
void stylize(const GLImage<T1>* input_image,
const GLImage<T2>* output_image,
const float mean_values[3],
float noise_std_value);
static const char* fragment_shader;
static const char* fragment_shader;
};
// MARK: GLSL
@ -116,8 +111,7 @@ void GLStylizer::stylize(const GLImage<T1>* input_image,
run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
{output_image->textures[0]},
[&]() {
glUniform2i(
outputSize->location, output_image->width, output_image->height);
glUniform2i(outputSize->location, output_image->width, output_image->height);
glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
if (!deprocess) {
glUniform1f(noise_std->location, noise_std_value);

View File

@ -4,10 +4,10 @@
#include "../core/GLImage.h"
#include "../core/ImageAllocator.h"
#include <iostream>
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <iostream>
#include <vector>
class GLSub : public GLFilter {
public:
@ -15,22 +15,18 @@ class GLSub : public GLFilter {
binding* outputSize;
GLSub()
: GLFilter(
"GLSub",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize),
BINDING(inputData[0]),
BINDING(inputData[1])}),
{/* no uniform blocks */},
{/* no attributes */},
{/* no replacements */}) {}
: GLFilter("GLSub",
vertex_shader,
fragment_shader,
std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
{/* no uniform blocks */},
{/* no attributes */},
{/* no replacements */}) {}
template <typename T>
void sub(
const GLImageVector<T>& input_image0,
const GLImageVector<T>& input_image1,
const GLImageVector<T>& output_image);
void sub(const GLImageVector<T>& input_image0,
const GLImageVector<T>& input_image1,
const GLImageVector<T>& output_image);
static const char* fragment_shader;
};
@ -59,10 +55,9 @@ void main() {
)GLSL";
template <typename T>
void GLSub::sub(
const GLImageVector<T>& input_images0,
const GLImageVector<T>& input_images1,
const GLImageVector<T>& output_images) {
void GLSub::sub(const GLImageVector<T>& input_images0,
const GLImageVector<T>& input_images1,
const GLImageVector<T>& output_images) {
const int num_images = input_images0.size();
for (int i = 0; i < num_images; i++) {
GLImage<T>* input_image0 = input_images0[i];
@ -77,14 +72,8 @@ void GLSub::sub(
input_attachments.push_back({input_image1->textures[is], inputData[1]});
run(input_attachments,
{output_image->textures.begin() + is,
output_image->textures.begin() + is + 1},
[&]() {
glUniform2i(
outputSize->location,
output_image->width,
output_image->height);
},
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
output_image->width,
output_image->height);
}
@ -97,20 +86,14 @@ class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
public:
OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
OperatorBase::HasArgument("broadcast") == false,
"OpenGLSub does not support broadcast");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");
OPERATOR_NEEDS_FEATURE(
OperatorBase::HasArgument("axis") == false,
"OpenGLSub does not support axis");
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
}
bool RunOnDevice() override {
const GLImageVector<T>& input0 =
Inputs()[0]->template Get<GLImageVector<T>>();
const GLImageVector<T>& input1 =
Inputs()[1]->template Get<GLImageVector<T>>();
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
CAFFE_ENFORCE_EQ(input0.size(), input1.size());

View File

@ -57,8 +57,7 @@ double BenchOp(const std::string& typ,
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"convolution_transform_strategy", std::string("PRECOMPUTE")));
def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
AddNoiseInput(std::vector<caffe2::TIndex>{1, inputC, inH, inW}, "X", ws);
if (transposed) {
@ -281,11 +280,11 @@ void TestGLConvolution() {
// std::vector<int> sizes({208, 312, 416, 720, 1080});
// std::vector<int> channels({16, 4});
//
std::vector<int> sizes({14, 26, 52, 104});
std::vector<int> sizes({14, 26, 52, 104, 208});
// std::vector<int> channels({24, 16, 4});
// std::vector<int> sizes({14});
std::vector<int> channels({64, 128, 256, 512});
std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
std::vector<int> kernels({3});
@ -321,19 +320,20 @@ void TestGLConvolution() {
const double flops = double(input_channel) * output_channel * kernel * kernel *
(kernel == 1 ? space : space - 2) *
(kernel == 1 ? space : space - 2) * 2;
gl_log(GL_LOG,
"Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
"%.2f\tratio: "
"%.2f\n",
space,
space,
input_channel,
output_channel,
kernel,
kernel,
flops / gpuIterTime / 1E6,
flops / cpuIterTime / 1E6,
cpuIterTime / gpuIterTime);
// gl_log(GL_LOG,
printf(
"Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
"%.2f\tratio: "
"%.2f\n",
space,
space,
input_channel,
output_channel,
kernel,
kernel,
flops / gpuIterTime / 1E6,
flops / cpuIterTime / 1E6,
cpuIterTime / gpuIterTime);
}
}
}

View File

@ -282,7 +282,7 @@ void testOpenGLConv(int N,
} else {
float* data = t->mutable_data<float>();
for (int i = 0; i < t->size(); i++) {
data[i] = -1;
data[i] = 1;
}
}
#if 0
@ -658,7 +658,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
}
void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile_x = 1, int input_tile_y = 1) {
LOG(INFO) << "OpenGL Add Test "
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
@ -682,6 +682,16 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
op.set_type("CopyToOpenGL");
op.add_input("X_cpu0");
op.add_output("X_gl0");
{
auto& arg = *(op.add_arg());
arg.set_name("tile_x");
arg.set_i(input_tile_x);
}
{
auto& arg = *(op.add_arg());
arg.set_name("tile_y");
arg.set_i(input_tile_y);
}
}
{
@ -689,6 +699,16 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
op.set_type("CopyToOpenGL");
op.add_input("X_cpu1");
op.add_output("X_gl1");
{
auto& arg = *(op.add_arg());
arg.set_name("tile_x");
arg.set_i(input_tile_x);
}
{
auto& arg = *(op.add_arg());
arg.set_name("tile_y");
arg.set_i(input_tile_y);
}
}
{
@ -733,15 +753,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
t0->Resize(N, C, H, W);
CPUContext ctx0;
// Too noisy.
math::RandGaussian<float, CPUContext>(
t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
t1->Resize(N, C, H, W);
CPUContext ctx1;
// Too noisy.
math::RandGaussian<float, CPUContext>(
t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
}
NetDef netdef;
@ -916,8 +934,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 2, t->mutable_data<float>(), &ctx);
math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
}
NetDef netdef;
@ -1535,8 +1552,15 @@ void testOpenGLPadImage(
}
}
void testOpenGLResize(
int N, int C, int H, int W, int width_scale, int height_scale, int batch_size, float error) {
void testOpenGLResize(int N,
int C,
int H,
int W,
int width_scale,
int height_scale,
float error,
int input_tile_x = 1,
int input_tile_y = 1) {
LOG(INFO) << "OpenGLResize Test";
{
Workspace ws;
@ -1553,6 +1577,16 @@ void testOpenGLResize(
op.set_type("CopyToOpenGL");
op.add_input("X_cpu");
op.add_output("X_gl");
{
auto& arg = *(op.add_arg());
arg.set_name("tile_x");
arg.set_i(input_tile_x);
}
{
auto& arg = *(op.add_arg());
arg.set_name("tile_y");
arg.set_i(input_tile_y);
}
}
{
@ -1569,11 +1603,6 @@ void testOpenGLResize(
arg.set_name("height_scale");
arg.set_f(height_scale);
}
{
auto& arg = *(op.add_arg());
arg.set_name("batch_size");
arg.set_i(batch_size);
}
{
auto& arg = *(op.add_arg());
arg.set_name("is_last");
@ -2125,7 +2154,9 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
std::string input_order,
std::string engine, // "CPU", "OPENGL", or "MPSCNN"
bool run_individual,
bool use_texture_input) {
bool use_texture_input,
bool use_tiling,
bool run_fusion) {
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
// caffe2::dumpDefForOpenGL(init_net);
@ -2138,7 +2169,7 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
if (engine == "CPU") {
net_def.CopyFrom(predict_net);
} else if (engine == "OPENGL") {
if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input)) {
if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input, use_tiling, run_fusion)) {
CAFFE_THROW("Failed to convert to openGL. Benchmark failed to run");
return -1;
}
@ -2197,19 +2228,18 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
}
if (input_type == "float") {
ImageAllocator<float16_t> allocator;
GLImageVector<float16_t>* output_image = allocator.newImage(
1,
width,
height,
channel,
tile_x,
tile_y,
GLImageVector<float16_t>* output_image = allocator.newImage(1,
width,
height,
channel,
tile_x,
tile_y,
#if CAFFE2_IOS
true
true
#else
false
false
#endif
);
);
blob->Reset(output_image);
for (auto& texture : (*output_image)[0]->textures) {
texture->map_load([&](void* buffer,
@ -2221,19 +2251,18 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
}
} else {
ImageAllocator<uint8_t> allocator;
GLImageVector<uint8_t>* output_image = allocator.newImage(
1,
width,
height,
channel,
tile_x,
tile_y,
GLImageVector<uint8_t>* output_image = allocator.newImage(1,
width,
height,
channel,
tile_x,
tile_y,
#if CAFFE2_IOS
true
true
#else
false
false
#endif
);
);
blob->Reset(output_image);
for (auto& texture : (*output_image)[0]->textures) {
texture->map_load([&](void* buffer,
@ -2288,7 +2317,8 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
}
glFinish();
LOG(INFO) << net_def.op(k).type() << ": " << (double)timer.MilliSeconds() / main_runs;
LOG(INFO) << "Operator #" << k << " " << net_def.op(k).type() << ": "
<< (double)timer.MilliSeconds() / main_runs;
}
}
}
@ -2501,6 +2531,10 @@ void testOpenGL() {
testOpenGLPRelu(1, channel, 13, 4, channel, tile_x, tile_y, 0.1);
testOpenGLRelu(1, channel, 4, 17, tile_x, tile_y, 0.1);
testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, MaxPool, 0.01, true, 1, 1, tile_x, tile_y, true);
testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, AveragePool, 0.01, true, 1, 1, tile_x, tile_y, true);
testOpenGLAdd(1, channel, 14, 8, 0.1, tile_x, tile_y);
testOpenGLResize(1, channel, 16, 16, 2, 2, 0.1, tile_x, tile_y);
// clang-format on
}
}
@ -2760,14 +2794,14 @@ void testOpenGL() {
testOpenGLInstanceNormPRelu(1, 6, 640, 360, 0.2);
LOG(INFO) << "Test OpenGL ResizeNearest";
testOpenGLResize(1, 4, 16, 16, 1, 1, 1, 0.1);
testOpenGLResize(1, 4, 16, 16, 2, 2, 1, 0.1);
testOpenGLResize(1, 4, 16, 16, 3, 3, 1, 0.1);
testOpenGLResize(1, 4, 16, 16, 4, 4, 1, 0.1);
testOpenGLResize(1, 16, 25, 25, 3, 3, 2, 0.1);
testOpenGLResize(1, 16, 25, 25, 3, 3, 4, 0.1);
testOpenGLResize(1, 12, 25, 25, 3, 3, 3, 0.1);
testOpenGLResize(1, 4, 720, 1280, 3, 3, 1, 0.1);
testOpenGLResize(1, 4, 16, 16, 1, 1, 0.1);
testOpenGLResize(1, 4, 16, 16, 2, 2, 0.1);
testOpenGLResize(1, 4, 16, 16, 3, 3, 0.1);
testOpenGLResize(1, 4, 16, 16, 4, 4, 0.1);
testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
testOpenGLResize(1, 12, 25, 25, 3, 3, 0.1);
testOpenGLResize(1, 4, 720, 1280, 3, 3, 0.1);
// debug style transfer
// conv
@ -2848,8 +2882,8 @@ void testOpenGL() {
testOpenGLInstanceNormPRelu(3, 4, 16, 16, 0.2);
testOpenGLInstanceNormPRelu(15, 4, 16, 16, 0.2);
testOpenGLResize(3, 4, 16, 16, 1, 1, 1, 0.1);
testOpenGLResize(16, 4, 16, 16, 1, 1, 1, 0.1);
testOpenGLResize(3, 4, 16, 16, 1, 1, 0.1);
testOpenGLResize(16, 4, 16, 16, 1, 1, 0.1);
testOpenGLPadImage(3, 3, 4, 4, 0, 1, 0, 1, 0.01);
testOpenGLPadImage(23, 3, 4, 4, 0, 1, 0, 1, 0.01);

View File

@ -32,6 +32,8 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
std::string input_type,
std::string input_order,
std::string engine,
bool run_individual = false,
bool use_texture_input = false);
bool run_individual = false,
bool use_texture_input = false,
bool use_tiling = false,
bool run_fusion = true);
} // namespace caffe2