mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Make android segmentation network run on both iOS and android with tiling
Summary: Add tiling support to GLAdd, GLPool, and GLResizeNearest.
Differential Revision: D5733208
fbshipit-source-id: b73113326b96d421787d4695ccf7d2d919ee2ed8
This commit is contained in:
parent
2d9728d594
commit
dd5400e452
|
|
@ -26,9 +26,7 @@ class GLContext {
|
|||
|
||||
static bool GL_EXT_texture_border_clamp_defined();
|
||||
|
||||
inline bool halfFloatTextureSupported() {
|
||||
return half_float_supported;
|
||||
}
|
||||
inline bool halfFloatTextureSupported() { return half_float_supported; }
|
||||
|
||||
void setTextureAllocator(
|
||||
std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
|
||||
|
|
|
|||
|
|
@ -76,13 +76,9 @@ std::string GLFilter::process_replacements(std::string shader,
|
|||
// Add some #defines for convenience
|
||||
std::string version_tag = "#version 300 es";
|
||||
if (GLContext::getGLContext()->halfFloatTextureSupported()) {
|
||||
shader.insert(
|
||||
shader.find(version_tag) + version_tag.size(),
|
||||
half_float_texture_utils);
|
||||
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
|
||||
} else {
|
||||
shader.insert(
|
||||
shader.find(version_tag) + version_tag.size(),
|
||||
half_float_compat_texture_utils);
|
||||
shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
|
||||
}
|
||||
shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
|
||||
return shader;
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@ class GLImage {
|
|||
|
||||
const int tile_x;
|
||||
const int tile_y;
|
||||
const int texture_width;
|
||||
const int texture_height;
|
||||
const int slices;
|
||||
|
||||
const std::vector<const GLTexture*> textures;
|
||||
|
|
@ -50,6 +52,8 @@ class GLImage {
|
|||
data_size(sizeof(T)),
|
||||
tile_x(_tile_x),
|
||||
tile_y(_tile_y),
|
||||
texture_width(_width * _tile_x),
|
||||
texture_height(_height * _tile_y),
|
||||
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
|
||||
textures(allocate_textures(slices, texture_loader)) {
|
||||
CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
|
||||
|
|
@ -68,13 +72,23 @@ class GLImage {
|
|||
data_size(sizeof(T)),
|
||||
tile_x(_tile_x),
|
||||
tile_y(_tile_y),
|
||||
texture_width(_width * _tile_x),
|
||||
texture_height(_height * _tile_y),
|
||||
slices(channels_to_slices(_channels, _tile_x, _tile_y)),
|
||||
textures(allocate_textures(slices, texture_loader)) {
|
||||
CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
|
||||
}
|
||||
|
||||
GLImage()
|
||||
: width(0), height(0), channels(0), data_size(sizeof(T)), tile_x(0), tile_y(0), slices(0){};
|
||||
: width(0),
|
||||
height(0),
|
||||
channels(0),
|
||||
data_size(sizeof(T)),
|
||||
tile_x(0),
|
||||
tile_y(0),
|
||||
texture_width(0),
|
||||
texture_height(0),
|
||||
slices(0){};
|
||||
|
||||
virtual ~GLImage() {
|
||||
gl_log(GL_VERBOSE, "deleting GLImage\n");
|
||||
|
|
|
|||
|
|
@ -6,37 +6,18 @@
|
|||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
||||
#define half_float_supported \
|
||||
(GLContext::getGLContext()->halfFloatTextureSupported())
|
||||
#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported())
|
||||
|
||||
#define FIXED_TYPE(_t) \
|
||||
(((_t).type != GL_HALF_FLOAT || half_float_supported) \
|
||||
? (_t) \
|
||||
: GLTexture::FP16_COMPAT)
|
||||
#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT)
|
||||
|
||||
GLPlainTexture::GLPlainTexture(
|
||||
const Type& type,
|
||||
const void* input,
|
||||
GLsizei width,
|
||||
GLsizei height,
|
||||
bool use_padding,
|
||||
GLint filter,
|
||||
GLint wrap)
|
||||
const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
|
||||
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
|
||||
// caffe2::Timer timer;
|
||||
// timer.Start();
|
||||
glGenTextures(1, &_textureId);
|
||||
glBindTexture(GL_TEXTURE_2D, _textureId);
|
||||
glTexImage2D(
|
||||
GL_TEXTURE_2D,
|
||||
0,
|
||||
_type.internalFormat,
|
||||
_stride,
|
||||
_height,
|
||||
0,
|
||||
_type.format,
|
||||
_type.type,
|
||||
input);
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);
|
||||
|
||||
gl_log(
|
||||
GL_VERBOSE,
|
||||
|
|
@ -64,13 +45,7 @@ GLPlainTexture::GLPlainTexture(
|
|||
}
|
||||
|
||||
GLPlainTexture::GLPlainTexture(
|
||||
const Type& type,
|
||||
const GLuint textureID,
|
||||
GLsizei width,
|
||||
GLsizei height,
|
||||
bool use_padding,
|
||||
GLint filter,
|
||||
GLint wrap)
|
||||
const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
|
||||
: GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
|
||||
_textureId = textureID;
|
||||
isOwner = false;
|
||||
|
|
|
|||
|
|
@ -31,9 +31,7 @@ void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz
|
|||
|
||||
const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
|
||||
const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
|
||||
const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI,
|
||||
GL_RG_INTEGER,
|
||||
GL_UNSIGNED_INT};
|
||||
const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};
|
||||
|
||||
void GLTexture::map_read(std::function<void(const void* buffer,
|
||||
size_t width,
|
||||
|
|
|
|||
|
|
@ -13,14 +13,14 @@ class GLTexture {
|
|||
|
||||
int dataSize() const {
|
||||
switch (type) {
|
||||
case GL_UNSIGNED_INT:
|
||||
return 4;
|
||||
case GL_HALF_FLOAT:
|
||||
return 2;
|
||||
case GL_UNSIGNED_BYTE:
|
||||
return 1;
|
||||
default:
|
||||
throw std::runtime_error("Unknown Texture Type");
|
||||
case GL_UNSIGNED_INT:
|
||||
return 4;
|
||||
case GL_HALF_FLOAT:
|
||||
return 2;
|
||||
case GL_UNSIGNED_BYTE:
|
||||
return 1;
|
||||
default:
|
||||
throw std::runtime_error("Unknown Texture Type");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -251,7 +251,7 @@ void dumpDefForOpenGL(const NetDef& d) {
|
|||
// }
|
||||
//}
|
||||
|
||||
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling) {
|
||||
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
|
||||
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
|
||||
NetDef net;
|
||||
net.CopyFrom(predictNet);
|
||||
|
|
@ -303,7 +303,9 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
|
|||
CAFFE_THROW("OpenGL operator missing");
|
||||
}
|
||||
|
||||
net = runOpenGLFusion(net, openGLOps);
|
||||
if (runFusion) {
|
||||
net = runOpenGLFusion(net, openGLOps);
|
||||
}
|
||||
|
||||
if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
|
||||
// For end-to-end testing
|
||||
|
|
@ -320,7 +322,9 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
|
|||
copy_op->add_output(net.external_output(0));
|
||||
}
|
||||
} else {
|
||||
needCopyOps = true;
|
||||
if (!useTextureInput) {
|
||||
needCopyOps = true;
|
||||
}
|
||||
}
|
||||
|
||||
// copy ops are needed when the input is not a texture
|
||||
|
|
@ -335,10 +339,12 @@ NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput
|
|||
bool tryConvertToOpenGL(const NetDef& initNet,
|
||||
const NetDef& predictNet,
|
||||
NetDef* glPredictNet,
|
||||
bool useTextureInput) {
|
||||
bool useTextureInput,
|
||||
bool useTiling,
|
||||
bool runFusion) {
|
||||
try {
|
||||
// Throws if unsupported operators are found.
|
||||
*glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput);
|
||||
*glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
|
||||
dumpDefForOpenGL(*glPredictNet);
|
||||
// Throws if unsupported parameters are found.
|
||||
Workspace ws;
|
||||
|
|
|
|||
|
|
@ -8,11 +8,14 @@ namespace caffe2 {
|
|||
bool tryConvertToOpenGL(const NetDef& initNet,
|
||||
const NetDef& predictNet,
|
||||
NetDef* glPredictNet,
|
||||
bool useTextureInput = false);
|
||||
bool useTextureInput = false,
|
||||
bool useTiling = false,
|
||||
bool runFusion = true);
|
||||
|
||||
// Exposed for testing
|
||||
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
|
||||
bool useTextureInput = false,
|
||||
bool useTiling = false);
|
||||
bool useTiling = false,
|
||||
bool runFusion = true);
|
||||
void dumpDefForOpenGL(const NetDef& net);
|
||||
} // namespace caffe2
|
||||
|
|
|
|||
|
|
@ -75,9 +75,9 @@ void GLAdd::add(const GLImageVector<T>& input_images0,
|
|||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
[&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -91,8 +91,7 @@ class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
|||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
|
||||
"OpenGLAdd does not support broadcast");
|
||||
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
|
||||
"OpenGLMul does not support axis");
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
|
|
@ -105,18 +104,25 @@ class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
|||
const int input_channels = input0.channels();
|
||||
const int input_width = input0.width();
|
||||
const int input_height = input0.height();
|
||||
const int input_tile_x = input0.tile_x();
|
||||
const int input_tile_y = input0.tile_y();
|
||||
|
||||
CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
|
||||
CAFFE_ENFORCE_EQ(input1.width(), input_width);
|
||||
CAFFE_ENFORCE_EQ(input1.height(), input_height);
|
||||
CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
|
||||
CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);
|
||||
|
||||
const int output_channels = input_channels;
|
||||
const int output_width = input_width;
|
||||
const int output_height = input_height;
|
||||
const int output_tile_x = input_tile_x;
|
||||
const int output_tile_y = input_tile_y;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
|
||||
if (!_add) {
|
||||
_add.reset(new GLAdd());
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -32,19 +32,18 @@ class GLPRelu : public GLFilter {
|
|||
int _output_tile_y,
|
||||
int _output_tile_width,
|
||||
int _output_tile_height)
|
||||
: GLFilter(
|
||||
"GLPRelu",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData)}),
|
||||
std::vector<binding*>({BINDING(scale_block)}),
|
||||
{/* no attributes */},
|
||||
{{"USE_RELU", caffe2::to_string(PRelu)},
|
||||
{"OUTPUT_TILES", caffe2::to_string(_output_tile_x * _output_tile_y)},
|
||||
{"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
|
||||
{"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
|
||||
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
|
||||
{"TILED_CONVOLUTION", caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
|
||||
: GLFilter("GLPRelu",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData)}),
|
||||
std::vector<binding*>({BINDING(scale_block)}),
|
||||
{/* no attributes */},
|
||||
{{"USE_RELU", caffe2::to_string(PRelu)},
|
||||
{"OUTPUT_TILES", caffe2::to_string(_output_tile_x * _output_tile_y)},
|
||||
{"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
|
||||
{"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
|
||||
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
|
||||
{"TILED_PRELU", caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
|
||||
scale(_scale),
|
||||
scale_size(_scale_size),
|
||||
channels(_channels),
|
||||
|
|
@ -67,7 +66,7 @@ class GLPRelu : public GLFilter {
|
|||
{"OUTPUT_TILE_X", caffe2::to_string(1)},
|
||||
{"OUTPUT_TILE_WIDTH", caffe2::to_string(1)},
|
||||
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(1)},
|
||||
{"TILED_CONVOLUTION", caffe2::to_string(0)}}),
|
||||
{"TILED_PRELU", caffe2::to_string(0)}}),
|
||||
scale(nullptr),
|
||||
scale_block(nullptr),
|
||||
scale_size(0),
|
||||
|
|
@ -88,75 +87,72 @@ class GLPRelu : public GLFilter {
|
|||
// MARK: GLSL
|
||||
|
||||
const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define TILED_PRELU $(TILED_PRELU)
|
||||
#define USE_RELU $(USE_RELU)
|
||||
|
||||
// tiling
|
||||
#define OUTPUT_TILES $(OUTPUT_TILES)
|
||||
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
|
||||
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
|
||||
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
|
||||
#define TILED_CONVOLUTION $(TILED_CONVOLUTION)
|
||||
|
||||
// common
|
||||
precision mediump float;
|
||||
precision highp int;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
#if !USE_RELU
|
||||
#if TILED_CONVOLUTION == 1
|
||||
layout (std140) uniform scale_block {
|
||||
highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
|
||||
};
|
||||
#if USE_RELU
|
||||
|
||||
#else
|
||||
layout (std140) uniform scale_block {
|
||||
highp uvec4 scale;
|
||||
};
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !USE_RELU
|
||||
|
||||
#if TILED_CONVOLUTION
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
|
||||
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
|
||||
|
||||
// output.data = value > 0 ? value : value * weight;
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
|
||||
value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
#endif // TILED_CONVOLUTION
|
||||
|
||||
#else // Relu
|
||||
// Relu
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(max(value, vec4(0.0)));
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if TILED_PRELU
|
||||
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
|
||||
|
||||
layout (std140) uniform scale_block {
|
||||
highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
|
||||
};
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
|
||||
|
||||
// outputData = value > 0 ? value : value * weight;
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
|
||||
value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
#else
|
||||
layout (std140) uniform scale_block {
|
||||
highp uvec4 scale;
|
||||
};
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
|
||||
|
||||
// outputData = value > 0 ? value : value * weight;
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
#endif // TILED_PRELU
|
||||
|
||||
#endif // USE_RELU
|
||||
|
||||
)GLSL";
|
||||
|
||||
|
|
@ -190,8 +186,8 @@ void GLPRelu::prelu(const GLImageVector<T>& input_images,
|
|||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {},
|
||||
output_image->width * output_image->tile_x,
|
||||
output_image->height * output_image->tile_y);
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -139,8 +139,8 @@ class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator
|
|||
padImage_.reset(new GLPadImage());
|
||||
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
|
||||
<< output_channels << ": " << output_height << " X " << output_width;
|
||||
LOG(INFO) << "Padmode: " << mode_ << "pad_l = " << pad_l() << ", pad_r = " << pad_r()
|
||||
<< ", pad_t = " << pad_t() << ", pad_b = " << pad_b();
|
||||
LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
|
||||
<< ", pad_b = " << pad_b();
|
||||
}
|
||||
|
||||
padImage_->pad(input, *output, pad_l(), pad_t());
|
||||
|
|
|
|||
|
|
@ -21,6 +21,8 @@ class GLPool : public GLFilter {
|
|||
point kernel_size;
|
||||
point input_padding;
|
||||
point input_stride;
|
||||
point input_tile_size;
|
||||
point output_tile_size;
|
||||
};
|
||||
|
||||
binding* inputData;
|
||||
|
|
@ -29,25 +31,29 @@ class GLPool : public GLFilter {
|
|||
|
||||
const descriptor geometry;
|
||||
|
||||
GLPool(const descriptor& _geometry, PoolType poolType)
|
||||
: GLFilter(
|
||||
"GLPool",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
{
|
||||
BINDING(inputData), BINDING(kernelSize), BINDING(outputSize),
|
||||
},
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
|
||||
{"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
|
||||
{"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
|
||||
{"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
|
||||
{"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
|
||||
{"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
|
||||
{"TEXTURE_BORDER_CLAMP",
|
||||
caffe2::to_string(GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined())},
|
||||
{"MAX_POOL", caffe2::to_string(poolType == MaxPool)}}),
|
||||
GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
|
||||
: GLFilter("GLPool",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
{
|
||||
BINDING(inputData), BINDING(kernelSize), BINDING(outputSize),
|
||||
},
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
|
||||
{"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
|
||||
{"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
|
||||
{"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
|
||||
{"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
|
||||
{"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
|
||||
{"INPUT_TILE_WIDTH", caffe2::to_string(_geometry.input_tile_size.x)},
|
||||
{"INPUT_TILE_HEIGHT", caffe2::to_string(_geometry.input_tile_size.y)},
|
||||
{"OUTPUT_TILE_WIDTH", caffe2::to_string(_geometry.output_tile_size.x)},
|
||||
{"OUTPUT_TILE_HEIGHT", caffe2::to_string(_geometry.output_tile_size.y)},
|
||||
{"TILED_POOLING", caffe2::to_string(_tiling)},
|
||||
{"TEXTURE_BORDER_CLAMP",
|
||||
caffe2::to_string(GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined())},
|
||||
{"MAX_POOL", caffe2::to_string(poolType == MaxPool)}}),
|
||||
geometry(_geometry) {}
|
||||
~GLPool() {}
|
||||
|
||||
|
|
@ -63,11 +69,11 @@ class GLPool : public GLFilter {
|
|||
run({{input_image->textures[is], inputData}},
|
||||
{output_image->textures[is]},
|
||||
[&]() {
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
|
||||
glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -78,10 +84,16 @@ class GLPool : public GLFilter {
|
|||
|
||||
// MARK: GLSL
|
||||
const char* GLPool::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define TILED_POOLING $(TILED_POOLING)
|
||||
#define TEXTURE_BORDER_CLAMP $(TEXTURE_BORDER_CLAMP)
|
||||
#define MAX_POOL $(MAX_POOL)
|
||||
|
||||
// tiling
|
||||
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
|
||||
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
|
||||
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
|
||||
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
|
|
@ -90,24 +102,84 @@ in highp vec2 v_texCoord;
|
|||
const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
|
||||
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
|
||||
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
|
||||
const int channels = 4;
|
||||
|
||||
uniform ivec2 kernelSize;
|
||||
uniform ivec2 outputSize;
|
||||
|
||||
TEXTURE_INPUT(inputData);
|
||||
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
const bool no_bounds = bool(TEXTURE_BORDER_CLAMP) || all(equal(input_padding, ivec2(0)));
|
||||
|
||||
const bool no_bounds = (TILED_POOLING == 0) && (bool(TEXTURE_BORDER_CLAMP) || all(equal(input_padding, ivec2(0))));
|
||||
#define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
|
||||
|
||||
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
|
||||
const float MIN_FLOAT = -exp2(14.0);
|
||||
|
||||
#if TILED_POOLING
|
||||
|
||||
const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
|
||||
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
|
||||
|
||||
// tiled pooling
|
||||
#if MAX_POOL
|
||||
|
||||
// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
|
||||
#define POOL { \
|
||||
pool = vec4(MIN_FLOAT); \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = tileCoord + ivec2(x, y); \
|
||||
if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputTileSize)) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
|
||||
pool = max(pool, data); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
const float MIN_FLOAT = -exp2(14.0);
|
||||
#else
|
||||
|
||||
#define POOL { \
|
||||
int count = 0; \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = tileCoord + ivec2(x, y); \
|
||||
if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputTileSize)) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
|
||||
pool += data;\
|
||||
count += 1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
pool = pool / float(count); \
|
||||
}
|
||||
|
||||
#endif // MAX_POOL
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
|
||||
|
||||
ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
|
||||
ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
|
||||
tileCoord = input_stride * tileCoord - input_padding;
|
||||
|
||||
ivec2 inputTileOffset = tile * inputTileSize;
|
||||
|
||||
#if MAX_POOL
|
||||
vec4 pool = vec4(0);
|
||||
#else
|
||||
highp vec4 pool = vec4(0);
|
||||
#endif
|
||||
|
||||
POOL;
|
||||
|
||||
outputData = TEXTURE_STORE(pool);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// no tiling
|
||||
#if MAX_POOL
|
||||
|
||||
#define POOL { \
|
||||
pool = vec4(MIN_FLOAT); \
|
||||
|
|
@ -125,22 +197,21 @@ const float MIN_FLOAT = -exp2(14.0);
|
|||
#else
|
||||
|
||||
#define POOL { \
|
||||
int count = 0; \
|
||||
for (int y = 0; y < kernelSize.y; y++) { \
|
||||
for (int x = 0; x < kernelSize.x; x++) { \
|
||||
ivec2 idx = texelCoord + ivec2(x, y); \
|
||||
if (no_bounds || IN_BOUNDS(idx, ivec2(0), inputSize)) { \
|
||||
vec4 data = TEXTURE_LOAD(inputData, idx); \
|
||||
pool += data;\
|
||||
pool += data; \
|
||||
count += 1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
ivec2 start = texelCoord; \
|
||||
ivec2 end = min(start + kernel_size, inputSize); \
|
||||
start = max(ivec2(0), start); \
|
||||
pool = pool / float((end.x - start.x) * (end.y - start.y)); \
|
||||
pool = pool / float(count); \
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // MAX_POOL
|
||||
|
||||
void main() {
|
||||
ivec2 inputSize = textureSize(inputData, 0);
|
||||
|
|
@ -155,6 +226,8 @@ void main() {
|
|||
|
||||
outputData = TEXTURE_STORE(pool);
|
||||
}
|
||||
#endif // TILED_POOLING
|
||||
|
||||
)GLSL";
|
||||
|
||||
namespace caffe2 {
|
||||
|
|
@ -199,18 +272,25 @@ class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16
|
|||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
|
||||
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
|
||||
|
||||
GLPool::descriptor geometry{
|
||||
input_channels, {kernel_w(), kernel_h()}, {pad_l(), pad_t()}, {stride_w(), stride_h()}};
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
|
||||
// Pooling geometry handed to GLPool. Every point is (x, y) = (width, height):
// the GLPool constructor maps input_tile_size.x / output_tile_size.x to the
// INPUT_TILE_WIDTH / OUTPUT_TILE_WIDTH shader defines and .y to the *_HEIGHT
// ones, so the output tile must be given as (output_width, output_height).
GLPool::descriptor geometry{input_channels,
                            {kernel_w(), kernel_h()},
                            {pad_l(), pad_t()},
                            {stride_w(), stride_h()},
                            {input_width, input_height},
                            {output_width, output_height}};
|
||||
|
||||
if (!glPool_) {
|
||||
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
|
||||
<< output_channels << ": " << output_height << " X " << output_width
|
||||
<< " Kernel: " << kernel_w() << "X" << kernel_h();
|
||||
LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
|
||||
<< output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
|
||||
<< " Tiling: " << input_tile_x << "X" << input_tile_y;
|
||||
|
||||
glPool_.reset(new GLPool(geometry, poolType));
|
||||
glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
|
||||
}
|
||||
|
||||
glPool_->pool(input, *output);
|
||||
|
|
|
|||
|
|
@ -11,89 +11,48 @@
|
|||
|
||||
class GLResizeNearest : public GLFilter {
|
||||
public:
|
||||
static constexpr int MaxBatchSize = 4;
|
||||
binding* inputData;
|
||||
binding* outputSize;
|
||||
binding* scale_reverse;
|
||||
|
||||
binding* inputData[MaxBatchSize];
|
||||
binding* inputSize;
|
||||
binding* outputSize;
|
||||
binding* scale_reverse;
|
||||
GLResizeNearest()
|
||||
: GLFilter("GLResizeNearest",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
|
||||
{/* no uniform blocks*/},
|
||||
{/* no attributes */},
|
||||
{/* replacements */}) {}
|
||||
|
||||
const int batch_size;
|
||||
template <typename T>
|
||||
void resize(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
float width_scale_rev,
|
||||
float height_scale_rev);
|
||||
|
||||
const std::vector<binding*> input_bindings(int batch_size) {
|
||||
std::vector<binding*> bindings(
|
||||
{BINDING(inputSize), BINDING(outputSize), BINDING(scale_reverse)});
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
bindings.push_back(inputData[i] = new binding{"inputData[" + caffe2::to_string(i) + "]"});
|
||||
}
|
||||
return bindings;
|
||||
}
|
||||
|
||||
GLResizeNearest(int _batch_size = 1)
|
||||
: GLFilter("GLResizeNearest",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
input_bindings(_batch_size),
|
||||
{/* no uniform blocks*/},
|
||||
{/* no attributes */},
|
||||
{{"BATCH_SIZE", caffe2::to_string(_batch_size)}}),
|
||||
batch_size(_batch_size) {}
|
||||
|
||||
template <typename T>
|
||||
void resize(const GLImageVector<T>& input_images,
|
||||
const GLImageVector<T>& output_images,
|
||||
float width_scale_rev,
|
||||
float height_scale_rev);
|
||||
|
||||
static const char* fragment_shader;
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
||||
const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es
|
||||
|
||||
#define BATCH_SIZE $(BATCH_SIZE)
|
||||
|
||||
precision mediump float;
|
||||
precision mediump int;
|
||||
|
||||
in highp vec2 v_texCoord;
|
||||
|
||||
uniform ivec2 inputSize;
|
||||
uniform ivec2 outputSize;
|
||||
uniform highp vec2 scale_reverse;
|
||||
|
||||
TEXTURE_INPUT(inputData[BATCH_SIZE]);
|
||||
|
||||
TEXTURE_OUTPUT(0, outputData0);
|
||||
#if BATCH_SIZE > 1
|
||||
TEXTURE_OUTPUT(1, outputData1);
|
||||
#if BATCH_SIZE > 2
|
||||
TEXTURE_OUTPUT(2, outputData2);
|
||||
#if BATCH_SIZE > 3
|
||||
TEXTURE_OUTPUT(3, outputData3);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
TEXTURE_INPUT(inputData);
|
||||
TEXTURE_OUTPUT(0, outputData);
|
||||
|
||||
void main() {
|
||||
// it clamps to the edge by default
|
||||
ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
|
||||
|
||||
vec4 v0 = TEXTURE_LOAD(inputData[0], texelCoord);
|
||||
outputData0 = TEXTURE_STORE(v0);
|
||||
#if BATCH_SIZE > 1
|
||||
vec4 v1 = TEXTURE_LOAD(inputData[1], texelCoord);
|
||||
outputData1 = TEXTURE_STORE(v1);
|
||||
#if BATCH_SIZE > 2
|
||||
vec4 v2 = TEXTURE_LOAD(inputData[2], texelCoord);
|
||||
outputData2 = TEXTURE_STORE(v2);
|
||||
#if BATCH_SIZE > 3
|
||||
vec4 v3 = TEXTURE_LOAD(inputData[3], texelCoord);
|
||||
outputData3 = TEXTURE_STORE(v3);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
vec4 value = TEXTURE_LOAD(inputData, texelCoord);
|
||||
outputData = TEXTURE_STORE(value);
|
||||
}
|
||||
)GLSL";
|
||||
|
||||
|
|
@ -108,21 +67,17 @@ void GLResizeNearest::resize(const GLImageVector<T>& input_images,
|
|||
int input_slices = input_image->slices;
|
||||
int output_slices = output_image->slices;
|
||||
|
||||
for (int is = 0; is < input_slices; is += batch_size) {
|
||||
std::vector<texture_attachment> input_attachments;
|
||||
for (int ib = 0; ib < batch_size; ib++) {
|
||||
input_attachments.push_back({input_image->textures[is + ib], inputData[ib]});
|
||||
}
|
||||
for (int is = 0; is < input_slices; is++) {
|
||||
std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + batch_size},
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(inputSize->location, input_image->width, input_image->height);
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
|
||||
glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
|
||||
},
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
output_image->texture_width,
|
||||
output_image->texture_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -153,14 +108,15 @@ class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<
|
|||
const int output_height = input_height * height_scale_;
|
||||
const int output_channels = input_channels;
|
||||
|
||||
const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
|
||||
const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
|
||||
|
||||
int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
|
||||
GLImageVector<T>* output = ImageAllocator<T>::newImage(
|
||||
num_images, output_width, output_height, output_channels, is_last);
|
||||
num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
|
||||
|
||||
if (!resizeNearest_) {
|
||||
int batch_size = OperatorBase::GetSingleArgument<int>("batch_size", 1);
|
||||
resizeNearest_.reset(new GLResizeNearest(batch_size));
|
||||
LOG(INFO) << "batch_size = " << batch_size;
|
||||
resizeNearest_.reset(new GLResizeNearest());
|
||||
}
|
||||
resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
|
||||
Outputs()[0]->Reset(output);
|
||||
|
|
|
|||
|
|
@ -18,28 +18,23 @@ class GLStylizer : public GLFilter {
|
|||
bool deprocess;
|
||||
|
||||
public:
|
||||
GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
|
||||
: GLFilter(
|
||||
_deprocess ? "GLDeStylizer" : "GLStylizer",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData),
|
||||
BINDING(mean),
|
||||
BINDING(noise_std),
|
||||
BINDING(outputSize)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"DEPROCESS", caffe2::to_string(_deprocess)},
|
||||
{"RGBAINPUT", caffe2::to_string(input_format)}}),
|
||||
deprocess(_deprocess) {}
|
||||
GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
|
||||
: GLFilter(_deprocess ? "GLDeStylizer" : "GLStylizer",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(inputData), BINDING(mean), BINDING(noise_std), BINDING(outputSize)}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{{"DEPROCESS", caffe2::to_string(_deprocess)}, {"RGBAINPUT", caffe2::to_string(input_format)}}),
|
||||
deprocess(_deprocess) {}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void stylize(const GLImage<T1>* input_image,
|
||||
const GLImage<T2>* output_image,
|
||||
const float mean_values[3],
|
||||
float noise_std_value);
|
||||
template <typename T1, typename T2>
|
||||
void stylize(const GLImage<T1>* input_image,
|
||||
const GLImage<T2>* output_image,
|
||||
const float mean_values[3],
|
||||
float noise_std_value);
|
||||
|
||||
static const char* fragment_shader;
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
||||
// MARK: GLSL
|
||||
|
|
@ -116,8 +111,7 @@ void GLStylizer::stylize(const GLImage<T1>* input_image,
|
|||
run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
|
||||
{output_image->textures[0]},
|
||||
[&]() {
|
||||
glUniform2i(
|
||||
outputSize->location, output_image->width, output_image->height);
|
||||
glUniform2i(outputSize->location, output_image->width, output_image->height);
|
||||
glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
|
||||
if (!deprocess) {
|
||||
glUniform1f(noise_std->location, noise_std_value);
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@
|
|||
#include "../core/GLImage.h"
|
||||
#include "../core/ImageAllocator.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class GLSub : public GLFilter {
|
||||
public:
|
||||
|
|
@ -15,22 +15,18 @@ class GLSub : public GLFilter {
|
|||
binding* outputSize;
|
||||
|
||||
GLSub()
|
||||
: GLFilter(
|
||||
"GLSub",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize),
|
||||
BINDING(inputData[0]),
|
||||
BINDING(inputData[1])}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{/* no replacements */}) {}
|
||||
: GLFilter("GLSub",
|
||||
vertex_shader,
|
||||
fragment_shader,
|
||||
std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
|
||||
{/* no uniform blocks */},
|
||||
{/* no attributes */},
|
||||
{/* no replacements */}) {}
|
||||
|
||||
template <typename T>
|
||||
void sub(
|
||||
const GLImageVector<T>& input_image0,
|
||||
const GLImageVector<T>& input_image1,
|
||||
const GLImageVector<T>& output_image);
|
||||
void sub(const GLImageVector<T>& input_image0,
|
||||
const GLImageVector<T>& input_image1,
|
||||
const GLImageVector<T>& output_image);
|
||||
|
||||
static const char* fragment_shader;
|
||||
};
|
||||
|
|
@ -59,10 +55,9 @@ void main() {
|
|||
)GLSL";
|
||||
|
||||
template <typename T>
|
||||
void GLSub::sub(
|
||||
const GLImageVector<T>& input_images0,
|
||||
const GLImageVector<T>& input_images1,
|
||||
const GLImageVector<T>& output_images) {
|
||||
void GLSub::sub(const GLImageVector<T>& input_images0,
|
||||
const GLImageVector<T>& input_images1,
|
||||
const GLImageVector<T>& output_images) {
|
||||
const int num_images = input_images0.size();
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
GLImage<T>* input_image0 = input_images0[i];
|
||||
|
|
@ -77,14 +72,8 @@ void GLSub::sub(
|
|||
input_attachments.push_back({input_image1->textures[is], inputData[1]});
|
||||
|
||||
run(input_attachments,
|
||||
{output_image->textures.begin() + is,
|
||||
output_image->textures.begin() + is + 1},
|
||||
[&]() {
|
||||
glUniform2i(
|
||||
outputSize->location,
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
},
|
||||
{output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
|
||||
[&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
|
||||
output_image->width,
|
||||
output_image->height);
|
||||
}
|
||||
|
|
@ -97,20 +86,14 @@ class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
|
|||
public:
|
||||
OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
OperatorBase::HasArgument("broadcast") == false,
|
||||
"OpenGLSub does not support broadcast");
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");
|
||||
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
OperatorBase::HasArgument("axis") == false,
|
||||
"OpenGLSub does not support axis");
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const GLImageVector<T>& input0 =
|
||||
Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const GLImageVector<T>& input1 =
|
||||
Inputs()[1]->template Get<GLImageVector<T>>();
|
||||
const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
|
||||
const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(input0.size(), input1.size());
|
||||
|
||||
|
|
|
|||
|
|
@ -57,8 +57,7 @@ double BenchOp(const std::string& typ,
|
|||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument(
|
||||
"convolution_transform_strategy", std::string("PRECOMPUTE")));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
|
||||
|
||||
AddNoiseInput(std::vector<caffe2::TIndex>{1, inputC, inH, inW}, "X", ws);
|
||||
if (transposed) {
|
||||
|
|
@ -281,11 +280,11 @@ void TestGLConvolution() {
|
|||
// std::vector<int> sizes({208, 312, 416, 720, 1080});
|
||||
// std::vector<int> channels({16, 4});
|
||||
//
|
||||
std::vector<int> sizes({14, 26, 52, 104});
|
||||
std::vector<int> sizes({14, 26, 52, 104, 208});
|
||||
// std::vector<int> channels({24, 16, 4});
|
||||
|
||||
// std::vector<int> sizes({14});
|
||||
std::vector<int> channels({64, 128, 256, 512});
|
||||
std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
|
||||
|
||||
std::vector<int> kernels({3});
|
||||
|
||||
|
|
@ -321,19 +320,20 @@ void TestGLConvolution() {
|
|||
const double flops = double(input_channel) * output_channel * kernel * kernel *
|
||||
(kernel == 1 ? space : space - 2) *
|
||||
(kernel == 1 ? space : space - 2) * 2;
|
||||
gl_log(GL_LOG,
|
||||
"Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
|
||||
"%.2f\tratio: "
|
||||
"%.2f\n",
|
||||
space,
|
||||
space,
|
||||
input_channel,
|
||||
output_channel,
|
||||
kernel,
|
||||
kernel,
|
||||
flops / gpuIterTime / 1E6,
|
||||
flops / cpuIterTime / 1E6,
|
||||
cpuIterTime / gpuIterTime);
|
||||
// gl_log(GL_LOG,
|
||||
printf(
|
||||
"Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
|
||||
"%.2f\tratio: "
|
||||
"%.2f\n",
|
||||
space,
|
||||
space,
|
||||
input_channel,
|
||||
output_channel,
|
||||
kernel,
|
||||
kernel,
|
||||
flops / gpuIterTime / 1E6,
|
||||
flops / cpuIterTime / 1E6,
|
||||
cpuIterTime / gpuIterTime);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -282,7 +282,7 @@ void testOpenGLConv(int N,
|
|||
} else {
|
||||
float* data = t->mutable_data<float>();
|
||||
for (int i = 0; i < t->size(); i++) {
|
||||
data[i] = -1;
|
||||
data[i] = 1;
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
|
|
@ -658,7 +658,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
|
|||
checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
|
||||
}
|
||||
|
||||
void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
|
||||
void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile_x = 1, int input_tile_y = 1) {
|
||||
LOG(INFO) << "OpenGL Add Test "
|
||||
<< "C: " << C << ", H: " << H << ", W: " << W;
|
||||
Workspace ws;
|
||||
|
|
@ -682,6 +682,16 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
|
|||
op.set_type("CopyToOpenGL");
|
||||
op.add_input("X_cpu0");
|
||||
op.add_output("X_gl0");
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_x");
|
||||
arg.set_i(input_tile_x);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_y");
|
||||
arg.set_i(input_tile_y);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
|
|
@ -689,6 +699,16 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) {
|
|||
op.set_type("CopyToOpenGL");
|
||||
op.add_input("X_cpu1");
|
||||
op.add_output("X_gl1");
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_x");
|
||||
arg.set_i(input_tile_x);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_y");
|
||||
arg.set_i(input_tile_y);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
|
|
@ -733,15 +753,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
|
|||
t0->Resize(N, C, H, W);
|
||||
CPUContext ctx0;
|
||||
// Too noisy.
|
||||
math::RandGaussian<float, CPUContext>(
|
||||
t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
|
||||
math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
|
||||
|
||||
auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
|
||||
t1->Resize(N, C, H, W);
|
||||
CPUContext ctx1;
|
||||
// Too noisy.
|
||||
math::RandGaussian<float, CPUContext>(
|
||||
t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
|
||||
math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
|
|
@ -916,8 +934,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
|
|||
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(
|
||||
t->size(), 0, 2, t->mutable_data<float>(), &ctx);
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
|
|
@ -1535,8 +1552,15 @@ void testOpenGLPadImage(
|
|||
}
|
||||
}
|
||||
|
||||
void testOpenGLResize(
|
||||
int N, int C, int H, int W, int width_scale, int height_scale, int batch_size, float error) {
|
||||
void testOpenGLResize(int N,
|
||||
int C,
|
||||
int H,
|
||||
int W,
|
||||
int width_scale,
|
||||
int height_scale,
|
||||
float error,
|
||||
int input_tile_x = 1,
|
||||
int input_tile_y = 1) {
|
||||
LOG(INFO) << "OpenGLResize Test";
|
||||
{
|
||||
Workspace ws;
|
||||
|
|
@ -1553,6 +1577,16 @@ void testOpenGLResize(
|
|||
op.set_type("CopyToOpenGL");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_gl");
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_x");
|
||||
arg.set_i(input_tile_x);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("tile_y");
|
||||
arg.set_i(input_tile_y);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
|
|
@ -1569,11 +1603,6 @@ void testOpenGLResize(
|
|||
arg.set_name("height_scale");
|
||||
arg.set_f(height_scale);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("batch_size");
|
||||
arg.set_i(batch_size);
|
||||
}
|
||||
{
|
||||
auto& arg = *(op.add_arg());
|
||||
arg.set_name("is_last");
|
||||
|
|
@ -2125,7 +2154,9 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
|
|||
std::string input_order,
|
||||
std::string engine, // "CPU", "OPENGL", or "MPSCNN"
|
||||
bool run_individual,
|
||||
bool use_texture_input) {
|
||||
bool use_texture_input,
|
||||
bool use_tiling,
|
||||
bool run_fusion) {
|
||||
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
|
||||
// caffe2::dumpDefForOpenGL(init_net);
|
||||
|
|
@ -2138,7 +2169,7 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
|
|||
if (engine == "CPU") {
|
||||
net_def.CopyFrom(predict_net);
|
||||
} else if (engine == "OPENGL") {
|
||||
if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input)) {
|
||||
if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input, use_tiling, run_fusion)) {
|
||||
CAFFE_THROW("Failed to convert to openGL. Benchmark failed to run");
|
||||
return -1;
|
||||
}
|
||||
|
|
@ -2197,19 +2228,18 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
|
|||
}
|
||||
if (input_type == "float") {
|
||||
ImageAllocator<float16_t> allocator;
|
||||
GLImageVector<float16_t>* output_image = allocator.newImage(
|
||||
1,
|
||||
width,
|
||||
height,
|
||||
channel,
|
||||
tile_x,
|
||||
tile_y,
|
||||
GLImageVector<float16_t>* output_image = allocator.newImage(1,
|
||||
width,
|
||||
height,
|
||||
channel,
|
||||
tile_x,
|
||||
tile_y,
|
||||
#if CAFFE2_IOS
|
||||
true
|
||||
true
|
||||
#else
|
||||
false
|
||||
false
|
||||
#endif
|
||||
);
|
||||
);
|
||||
blob->Reset(output_image);
|
||||
for (auto& texture : (*output_image)[0]->textures) {
|
||||
texture->map_load([&](void* buffer,
|
||||
|
|
@ -2221,19 +2251,18 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
|
|||
}
|
||||
} else {
|
||||
ImageAllocator<uint8_t> allocator;
|
||||
GLImageVector<uint8_t>* output_image = allocator.newImage(
|
||||
1,
|
||||
width,
|
||||
height,
|
||||
channel,
|
||||
tile_x,
|
||||
tile_y,
|
||||
GLImageVector<uint8_t>* output_image = allocator.newImage(1,
|
||||
width,
|
||||
height,
|
||||
channel,
|
||||
tile_x,
|
||||
tile_y,
|
||||
#if CAFFE2_IOS
|
||||
true
|
||||
true
|
||||
#else
|
||||
false
|
||||
false
|
||||
#endif
|
||||
);
|
||||
);
|
||||
blob->Reset(output_image);
|
||||
for (auto& texture : (*output_image)[0]->textures) {
|
||||
texture->map_load([&](void* buffer,
|
||||
|
|
@ -2288,7 +2317,8 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
|
|||
}
|
||||
glFinish();
|
||||
|
||||
LOG(INFO) << net_def.op(k).type() << ": " << (double)timer.MilliSeconds() / main_runs;
|
||||
LOG(INFO) << "Operator #" << k << " " << net_def.op(k).type() << ": "
|
||||
<< (double)timer.MilliSeconds() / main_runs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -2501,6 +2531,10 @@ void testOpenGL() {
|
|||
|
||||
testOpenGLPRelu(1, channel, 13, 4, channel, tile_x, tile_y, 0.1);
|
||||
testOpenGLRelu(1, channel, 4, 17, tile_x, tile_y, 0.1);
|
||||
testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, MaxPool, 0.01, true, 1, 1, tile_x, tile_y, true);
|
||||
testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, AveragePool, 0.01, true, 1, 1, tile_x, tile_y, true);
|
||||
testOpenGLAdd(1, channel, 14, 8, 0.1, tile_x, tile_y);
|
||||
testOpenGLResize(1, channel, 16, 16, 2, 2, 0.1, tile_x, tile_y);
|
||||
// clang-format on
|
||||
}
|
||||
}
|
||||
|
|
@ -2760,14 +2794,14 @@ void testOpenGL() {
|
|||
testOpenGLInstanceNormPRelu(1, 6, 640, 360, 0.2);
|
||||
|
||||
LOG(INFO) << "Test OpenGL ResizeNearest";
|
||||
testOpenGLResize(1, 4, 16, 16, 1, 1, 1, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 2, 2, 1, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 3, 3, 1, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 4, 4, 1, 0.1);
|
||||
testOpenGLResize(1, 16, 25, 25, 3, 3, 2, 0.1);
|
||||
testOpenGLResize(1, 16, 25, 25, 3, 3, 4, 0.1);
|
||||
testOpenGLResize(1, 12, 25, 25, 3, 3, 3, 0.1);
|
||||
testOpenGLResize(1, 4, 720, 1280, 3, 3, 1, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 1, 1, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 2, 2, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 3, 3, 0.1);
|
||||
testOpenGLResize(1, 4, 16, 16, 4, 4, 0.1);
|
||||
testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
|
||||
testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
|
||||
testOpenGLResize(1, 12, 25, 25, 3, 3, 0.1);
|
||||
testOpenGLResize(1, 4, 720, 1280, 3, 3, 0.1);
|
||||
|
||||
// debug style transfer
|
||||
// conv
|
||||
|
|
@ -2848,8 +2882,8 @@ void testOpenGL() {
|
|||
testOpenGLInstanceNormPRelu(3, 4, 16, 16, 0.2);
|
||||
testOpenGLInstanceNormPRelu(15, 4, 16, 16, 0.2);
|
||||
|
||||
testOpenGLResize(3, 4, 16, 16, 1, 1, 1, 0.1);
|
||||
testOpenGLResize(16, 4, 16, 16, 1, 1, 1, 0.1);
|
||||
testOpenGLResize(3, 4, 16, 16, 1, 1, 0.1);
|
||||
testOpenGLResize(16, 4, 16, 16, 1, 1, 0.1);
|
||||
|
||||
testOpenGLPadImage(3, 3, 4, 4, 0, 1, 0, 1, 0.01);
|
||||
testOpenGLPadImage(23, 3, 4, 4, 0, 1, 0, 1, 0.01);
|
||||
|
|
|
|||
|
|
@ -32,6 +32,8 @@ int runModelBenchmarks(caffe2::NetDef& init_net,
|
|||
std::string input_type,
|
||||
std::string input_order,
|
||||
std::string engine,
|
||||
bool run_individual = false,
|
||||
bool use_texture_input = false);
|
||||
bool run_individual = false,
|
||||
bool use_texture_input = false,
|
||||
bool use_tiling = false,
|
||||
bool run_fusion = true);
|
||||
} // namespace caffe2
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user