Update XNNPACK Version (#139913)

Updating XNNPACK Version to 4ea82e595b36106653175dcb04b2aa532660d0d8

submodule update
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139913
Approved by: https://github.com/digantdesai, https://github.com/huydhn
Author: Max Ren (2024-11-18 18:16:31 +00:00), committed by PyTorch MergeBot
parent e429a3b72e
commit cca34be584
15 changed files with 3429 additions and 1611 deletions
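Beyond the submodule bump, the diffs below migrate the PyTorch ops that used XNNPACK's one-shot operator API (xnn_create_* / xnn_reshape_* / xnn_setup_* / xnn_run_operator), namely quantized add and mul, hardswish, and global average pooling, to the subgraph plus runtime API, and they add the new libmicrokernels-prod static library next to libXNNPACK in the various link steps. Every migrated op follows the same pattern: create a subgraph, define the external input and output tensor values, define a unary, binary, or reduce node between them, then create, set up, and invoke a runtime. The standalone sketch below is not part of the commit; it assumes a plain C++ program against the XNNPACK headers (hence the explicit xnn_initialize call and the nullptr thread pool) and uses an arbitrary 8-element buffer, but it walks the same subgraph flow for a float hardswish.

#include <xnnpack.h>
#include <array>
#include <cstdio>

int main() {
  // Error checks elided for brevity; every call below returns an xnn_status.
  // One-time library initialization (inside PyTorch this is handled by
  // at::native::xnnpack::available(); a standalone program must call it).
  if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success) return 1;

  // 1. Create a subgraph with two external values: input (id 0) and output (id 1).
  xnn_subgraph_t subgraph = nullptr;
  xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph);

  const std::array<size_t, 1> shape = {8};
  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;

  // 2. Define the external input and output tensor values.
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          /*data=*/nullptr, /*external_id=*/0,
                          XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id);
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          /*data=*/nullptr, /*external_id=*/1,
                          XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id);

  // 3. Define the node; hardswish takes no extra params (nullptr).
  xnn_define_unary(subgraph, xnn_unary_hardswish, /*params=*/nullptr,
                   input_id, output_id, /*flags=*/0);

  // 4. Build a runtime from the subgraph (nullptr threadpool = single-threaded).
  xnn_runtime_t runtime = nullptr;
  xnn_create_runtime_v2(subgraph, /*threadpool=*/nullptr, /*flags=*/0, &runtime);

  // 5. Bind external buffers by value id, then run.
  std::array<float, 8> in = {-3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f};
  std::array<float, 8> out = {};
  std::array<xnn_external_value, 2> externals = {
      xnn_external_value{input_id, in.data()},
      xnn_external_value{output_id, out.data()}};
  xnn_setup_runtime(runtime, externals.size(), externals.data());
  xnn_invoke_runtime(runtime);

  printf("hardswish(1.0) = %f\n", out[4]);  // ~0.666667

  xnn_delete_runtime(runtime);
  xnn_delete_subgraph(subgraph);
  return 0;
}

The quantized add and mul paths in the diff differ from this sketch only in that they define their tensors with xnn_define_quantized_tensor_value (via the new xnnp_define_q_tensor helper) and a binary node with xnn_define_binary, while global average pooling uses xnn_define_static_reduce with xnn_reduce_mean over the spatial dimensions.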


@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
 cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
 # build a FAT bianry
 cd ${ZIP_DIR}/install/lib
-target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
 for lib in ${target_libs[*]}
 do
     if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then


@@ -112,6 +112,7 @@ if(ANDROID_ABI)
   import_static_lib(libc10)
   import_static_lib(libnnpack)
   import_static_lib(libXNNPACK)
+  import_static_lib(libmicrokernels-prod)
   import_static_lib(libpytorch_qnnpack)
   import_static_lib(libpthreadpool)
   import_static_lib(libeigen_blas)
@@ -129,6 +130,7 @@ if(ANDROID_ABI)
     libc10
     libnnpack
     libXNNPACK
+    libmicrokernels-prod
     libpytorch_qnnpack
     libpthreadpool
     libeigen_blas
@@ -151,6 +153,7 @@ else()
   if(USE_XNNPACK)
     list(APPEND pytorch_jni_LIBS XNNPACK)
+    list(APPEND pytorch_jni_LIBS microkernels-prod)
   endif()
   if(USE_SYSTEM_PTHREADPOOL)


@@ -234,62 +234,27 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
 #ifdef USE_XNNPACK
 C10_ALWAYS_INLINE
-enum xnn_status xnnp_create_add_nd(
-    int8_t azp,
-    float ascale,
-    int8_t bzp,
-    float bscale,
-    int8_t czp,
-    float cscale,
-    int8_t output_min,
-    int8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* op) {
-  return xnn_create_add_nd_qs8(
-      azp,        /* int8_t input1_zero_point   */
-      ascale,     /* float input1_scale         */
-      bzp,        /* int8_t input2_zero_point   */
-      bscale,     /* float input2_scale         */
-      czp,        /* int8_t output_zero_point   */
-      cscale,     /* float output_scale         */
-      output_min, /* int8_t output_min          */
-      output_max, /* int8_t output_max          */
-      flags,      /* uint32_t flags             */
-      op);        /* xnn_operator_t* add_op_out */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_add_nd(
-    xnn_operator_t op,
-    const std::vector<size_t>& a_shape,
-    const std::vector<size_t>& b_shape,
-    pthreadpool_t pt_pool) {
-  return xnn_reshape_add_nd_qs8(
-      op,             /* xnn_operator_t add_op      */
-      a_shape.size(), /* size_t num_input1_dims     */
-      a_shape.data(), /* const size_t* input1_shape */
-      b_shape.size(), /* size_t num_input2_dims     */
-      b_shape.data(), /* const size_t* input2_shape */
-      pt_pool);       /* pthreadpool_t threadpool   */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_setup_add_nd(
-    xnn_operator_t op,
-    const int8_t* da,
-    const int8_t* db,
-    int8_t* dc,
-    pthreadpool_t pt_pool) {
-  return xnn_setup_add_nd_qs8(
-      op, /* xnn_operator_t add_op */
-      da, /* const int8_t* input1  */
-      db, /* const int8_t* input2  */
-      dc); /* int8_t* output       */
-}
+enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
+  Tensor contig_tensor = tensor.contiguous(format);
+  const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
+  const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
+  const float scale = static_cast<float>(contig_tensor.q_scale());
+
+  return xnn_define_quantized_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_qint8,
+      zero_point,
+      scale,
+      tensor.ndimension(),
+      tensor_shape.data(),
+      nullptr,
+      external_id,
+      flags,
+      &id);
+}
 
 template <typename scalar_t, bool ReLUFused = false>
 Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
-  using underlying_t = typename scalar_t::underlying;
   const string func_name = "xnnp_add()";
   TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor.");
   TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available")
@@ -299,12 +264,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   auto qa_mem_format = qa.suggest_memory_format();
   Tensor qa_contig = qa.contiguous(qa_mem_format);
   Tensor qb_contig = qb.contiguous(qa_mem_format);
-  const auto a_zero_point = qa_contig.q_zero_point();
-  const auto b_zero_point = qb_contig.q_zero_point();
-  const auto a_scale = qa_contig.q_scale();
-  const auto b_scale = qb_contig.q_scale();
 
   Tensor qy = at::native::empty_affine_quantized(
       at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
       qa.scalar_type(),
@@ -319,72 +278,108 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
     return qy;
   }
 
-  xnn_operator_t xnnp_op = nullptr;
-  xnnpack_operator xnnp_add_operator;
-
-  auto output_max = std::numeric_limits<underlying_t>::max();
-  auto output_min = std::numeric_limits<underlying_t>::min();
+  auto output_max = std::numeric_limits<float>::infinity();
+  auto output_min = -std::numeric_limits<float>::infinity();
   if (ReLUFused) {
-    /*
-     * FIXME: use activationLimits<T>()
-     * With <T>, MSVC runs into "error C3862: identifier activationLimits not found".
-     */
-    constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
-    constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
-    int64_t qvalue = static_cast<int64_t>(zero_point);
-    qvalue = std::max<int64_t>(qvalue, qmin);
-    output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
+    output_min = 0;
   }
 
-  // Create an operator
-  auto status = xnnp_create_add_nd(
-      a_zero_point,
-      a_scale,
-      b_zero_point,
-      b_scale,
-      static_cast<underlying_t>(zero_point),
-      static_cast<float>(scale),
-      output_min,
-      output_max,
-      0,
-      &xnnp_op);
-  xnnp_add_operator = xnnpack_operator(xnnp_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  auto status = xnn_create_subgraph(
+      /*external_value_ids=*/3,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn create operator failed(", status,")!");
+      func_name, ": xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
 
-  const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
-  const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
+  uint32_t input0_id = XNN_INVALID_VALUE_ID, input1_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
 
-  // Reshape the operator
-  status = xnnp_reshape_add_nd(
-      xnnp_add_operator.get(),
-      qa_shape,
-      qb_shape,
-      caffe2::pthreadpool_());
+  // Defining the quantized input 0
+  status = xnnp_define_q_tensor(
+      qa,
+      qa_mem_format,
+      input0_id,
+      subgraph_ptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 0 failed(", status,")!");
+
+  // Defining the quantized input 1
+  status = xnnp_define_q_tensor(
+      qb,
+      qa_mem_format,
+      input1_id,
+      subgraph_ptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 1 failed(", status,")!");
+
+  // Defining the quantized output
+  status = xnnp_define_q_tensor(
+      qy,
+      qa_mem_format,
+      output_id,
+      subgraph_ptr,
+      2,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define output failed(", status,")!");
+
+  const struct xnn_binary_params binary_params = {output_min, output_max};
+  status = xnn_define_binary(
+      subgraph_ptr,
+      xnn_binary_add,
+      &binary_params,
+      input0_id,
+      input1_id,
+      output_id,
+      0);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn reshape operator failed(", status,")!");
+      func_name, ": xnn define binary add failed(", status,")!");
 
-  // Setup the operator
-  status = xnnp_setup_add_nd(
-      xnnp_add_operator.get(),
-      reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
-      caffe2::pthreadpool_());
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn setup operator failed(", status,")!");
+      func_name, ": xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      func_name, ": xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
 
-  // Run the operator
-  status = xnn_run_operator(
-      xnnp_add_operator.get(), /* xnn_operator_t op */
-      caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
+  std::array<xnn_external_value, 3> external = {
+      xnn_external_value{input0_id, reinterpret_cast<void*>(qa_contig.data_ptr<scalar_t>())},
+      xnn_external_value{input1_id, reinterpret_cast<void*>(qb_contig.data_ptr<scalar_t>())},
+      xnn_external_value{output_id, reinterpret_cast<void*>(qy.data_ptr<scalar_t>())}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn run operator failed(", status,")");
+      func_name, ": xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn invoke runtime failed(", status,")!");
 
   return qy;
 }
 #endif // USE_XNNPACK


@@ -13,7 +13,6 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/quantized/Quantizer.h>
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
-#include <torch/library.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -56,14 +55,32 @@ Tensor _mul_out(Tensor& out, const Tensor& self, const Tensor& other) {
 }
 
 #ifdef USE_XNNPACK
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
+  Tensor contig_tensor = tensor.contiguous(format);
+  const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
+  const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
+  const float scale = static_cast<float>(contig_tensor.q_scale());
+
+  return xnn_define_quantized_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_qint8,
+      zero_point,
+      scale,
+      tensor.ndimension(),
+      tensor_shape.data(),
+      nullptr,
+      external_id,
+      flags,
+      &id);
+}
+
 template <typename scalar_t, bool ReLUFused = false>
 Tensor _mul_out_xnnpack(
     const Tensor& self,
     const Tensor& other,
     double output_scale,
     int64_t output_zero_point) {
-  using underlying_t = typename scalar_t::underlying;
   const string func_name = "xnnp_mul()";
   TORCH_CHECK(self.ndimension() > 0, func_name, ": Got empty input tensor.");
   TORCH_CHECK(
@@ -89,96 +106,108 @@ Tensor _mul_out_xnnpack(
     return out;
   }
 
-  int64_t self_zero_point = self_contig.q_zero_point();
-  double self_scale = self_contig.q_scale();
-  int64_t other_zero_point = other_contig.q_zero_point();
-  double other_scale = other_contig.q_scale();
-
-  int64_t output_min = std::numeric_limits<underlying_t>::min();
-  int64_t output_max = std::numeric_limits<underlying_t>::max();
-  if(ReLUFused) {
-    /*
-     * FIXME: use activationLimits<T>()
-     * With <T>, MSVC runs into "error C3862: identifier activationLimits not
-     * found".
-     */
-    constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
-    constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
-    int64_t qvalue = static_cast<int64_t>(output_zero_point);
-    qvalue = std::max<int64_t>(qvalue, qmin);
-    output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
+  auto output_max = std::numeric_limits<float>::infinity();
+  auto output_min = -std::numeric_limits<float>::infinity();
+  if (ReLUFused) {
+    output_min = 0;
   }
 
-  xnn_operator_t xnnp_op = nullptr;
-  xnnpack_operator xnnp_qmul_operator;
-
-  // create xnnpack multiply operator ...
-  auto status = xnn_create_multiply_nd_qs8(
-      self_zero_point,
-      self_scale,
-      other_zero_point,
-      other_scale,
-      static_cast<underlying_t>(output_zero_point),
-      static_cast<float>(output_scale),
-      output_min,
-      output_max,
-      0,
-      &xnnp_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  auto status = xnn_create_subgraph(
+      /*external_value_ids=*/3,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn create operator failed(",
-      status,
-      ")!");
-  xnnp_qmul_operator = xnnpack_operator(xnnp_op);
+      func_name, ": xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
 
-  const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
-  const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
+  uint32_t input0_id = XNN_INVALID_VALUE_ID;
+  uint32_t input1_id = XNN_INVALID_VALUE_ID;
+  uint32_t output_id = XNN_INVALID_VALUE_ID;
 
-  // reshape operator
-  status = xnn_reshape_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
-      self_shape.size(),
-      self_shape.data(),
-      other_shape.size(),
-      other_shape.data(),
-      caffe2::pthreadpool_());
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn reshape operator failed(",
-      status,
-      ")!");
+  // Defining the quantized input 0
+  status = xnnp_define_q_tensor(
+      self,
+      qa_mem_format,
+      input0_id,
+      subgraph_ptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 0 failed(", status,")!");
 
-  // set up operator
-  status = xnn_setup_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
-      reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
-  );
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn setup operator failed(",
-      status,
-      ")!");
+  // Defining the quantized input 1
+  status = xnnp_define_q_tensor(
+      other,
+      qa_mem_format,
+      input1_id,
+      subgraph_ptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 1 failed(", status,")!");
+
+  // Defining the quantized output
+  status = xnnp_define_q_tensor(
+      out,
+      qa_mem_format,
+      output_id,
+      subgraph_ptr,
+      2,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define output failed(", status,")!");
+
+  const struct xnn_binary_params binary_params = {output_min, output_max};
+  status = xnn_define_binary(
+      subgraph_ptr,
+      xnn_binary_multiply,
+      &binary_params,
+      input0_id,
+      input1_id,
+      output_id,
+      0);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
+      func_name, ": xnn define binary add failed(", status,")!");
 
-  // Run the operator
-  status = xnn_run_operator(
-      xnnp_qmul_operator.get(), /* xnn_operator_t op */
-      caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn run operator failed(",
-      status,
-      ")");
+      func_name, ": xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      func_name, ": xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 3> external = {
+      xnn_external_value{input0_id, reinterpret_cast<void*>(self.data_ptr<scalar_t>())},
+      xnn_external_value{input1_id, reinterpret_cast<void*>(other.data_ptr<scalar_t>())},
+      xnn_external_value{output_id, reinterpret_cast<void*>(out.data_ptr<scalar_t>())}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn invoke runtime failed(", status,")!");
 
   return out;
 }


@@ -19,46 +19,84 @@ bool use_hardswish(
 static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
   using namespace internal;
 
-  xnn_operator_t hardswish_op{};
-  const xnn_status create_status = xnn_create_hardswish_nc_f32(
-      0, // flags
-      &hardswish_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/2,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
-      xnn_status_success == create_status,
-      "xnn_create_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
 
-  Operator hardswish_scoped_op(hardswish_op);
+  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
+  std::vector<size_t> input_output_shape(input.sizes().begin(), input.sizes().end());
 
-  const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
-      hardswish_op,
-      input.numel(), // Batch
-      1, // channels
-      1, // input stride
-      1, // output stride
-      caffe2::pthreadpool_()); // threadpool
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_output_shape.size(),
+      input_output_shape.data(),
+      nullptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT,
+      &input_id
+  );
   TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn input failed(", status,")!");
 
-  const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
-      hardswish_op,
-      input.data_ptr<float>(),
-      output.data_ptr<float>());
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_output_shape.size(),
+      input_output_shape.data(),
+      nullptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+      &output_id
+  );
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn output failed(", status,")!");
 
-  const xnn_status run_status = xnn_run_operator(
-      hardswish_op,
-      caffe2::pthreadpool_()); // threadpool
+  status = xnn_define_unary(
+      subgraph_ptr,
+      xnn_unary_hardswish,
+      nullptr,
+      input_id,
+      output_id,
+      0
+  );
 
-  TORCH_INTERNAL_ASSERT(
-      xnn_status_success == run_status,
-      "xnn_run_operator failed!");
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      "xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input.data_ptr<float>()},
+      xnn_external_value{output_id, output.data_ptr<float>()}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn invoke runtime failed(", status,")!");
 
   return output;
 }


@@ -7,6 +7,27 @@
 namespace at::native::xnnpack {
 
+inline std::vector<size_t> get_mem_format_aware_shape(const at::Tensor& in) {
+  const auto mem_format = in.suggest_memory_format();
+  const auto& sizes = in.sizes();
+  std::vector<size_t> ret(sizes.begin(), sizes.end());
+  if (mem_format == c10::MemoryFormat::ChannelsLast) {
+    // NCHW -> NHWC
+    // 0123 -> 0231
+    ret[1] = sizes[2]; /* H */
+    ret[2] = sizes[3]; /* W */
+    ret[3] = sizes[1]; /* C */
+  } else if (mem_format == c10::MemoryFormat::ChannelsLast3d) {
+    // NCDHW -> NDHWC
+    // 01234 -> 02341
+    ret[1] = sizes[2]; /* D */
+    ret[2] = sizes[3]; /* H */
+    ret[3] = sizes[4]; /* W */
+    ret[4] = sizes[1]; /* C */
+  }
+  return ret;
+}
+
 bool use_global_average_pool(const Tensor& input) {
   return xnnpack::available() && (1 <= input.ndimension()) &&
       (input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
@@ -31,63 +52,91 @@ Tensor global_average_pool(const Tensor& input) {
       MemoryFormat::ChannelsLast,
       input_padded_contig_nhwc.opt_names());
 
-  xnn_operator_t global_average_pooling_op{};
-  const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
-      -std::numeric_limits<float>::infinity(),
-      std::numeric_limits<float>::infinity(),
-      0 /* flags */,
-      &global_average_pooling_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/2,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
-      xnn_status_success == create_status,
-      "xnn_create_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
+  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
 
-  Operator global_avg_pool_scoped_op(global_average_pooling_op);
-  size_t workspace_size = 0;
-  size_t workspace_alignment = 0;
-
-  const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
-      global_average_pooling_op,
-      input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
-      input_padded_contig_nhwc.size(Layout::Activation4D::width) *
-          input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
-      input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
-      input_padded_contig_nhwc.size(
-          Layout::Activation4D::channels), // input stride
-      input_padded_contig_nhwc.size(
-          Layout::Activation4D::channels), // output stride
-      &workspace_size, // workspace_size
-      &workspace_alignment, // workspace_alignment
-      caffe2::pthreadpool_());
+  const auto& input_shape = get_mem_format_aware_shape(input_padded_contig_nhwc);
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_shape.size(),
+      input_shape.data(),
+      nullptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT,
+      &input_id
+  );
   TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn input failed(", status,")!");
 
-  // Create Workspace pointer, which we will align and pad with 16 bytes
-  size_t xnnpack_buffer_padding = 16;
-  std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
-  void* maybe_aligned_workspace = workspace_vector.data();
-  void* aligned_workspace =
-      (void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
-
-  const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
-      global_average_pooling_op,
-      aligned_workspace,
-      input_padded_contig_nhwc.data_ptr<float>(),
-      output.data_ptr<float>());
+  const auto& output_shape = get_mem_format_aware_shape(output);
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      output_shape.size(),
+      output_shape.data(),
+      nullptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+      &output_id
+  );
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn output failed(", status,")!");
 
-  const xnn_status run_status =
-      xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
+  std::vector<size_t> reduce_dims{1, 2};
+  status = xnn_define_static_reduce(
+      subgraph_ptr,
+      xnn_reduce_mean,
+      reduce_dims.size(),
+      reduce_dims.data(),
+      input_id,
+      output_id,
+      0
+  );
   TORCH_CHECK(
-      xnn_status_success == run_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn static reduce failed(", status,")!");
 
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      "xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input_padded_contig_nhwc.data_ptr<float>()},
+      xnn_external_value{output_id, output.data_ptr<float>()}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn invoke runtime failed(", status,")!");
 
   return output.to(input.suggest_memory_format());
 }


@@ -516,6 +516,9 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   # Disable I8MM For CI since clang 9 does not support neon i8mm.
   set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
 
+  # Disable avxvnni int8
+  set(XNNPACK_ENABLE_AVXVNNIINT8 OFF CACHE BOOL "")
+
   # Older MSVC versions don't support AVX512FP. TODO Minimum version support?
   IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
     set(XNNPACK_ENABLE_AVX512FP16 OFF CACHE BOOL "")


@@ -94,6 +94,7 @@ else()
   if(@USE_XNNPACK@)
     append_torchlib_if_found(XNNPACK)
+    append_torchlib_if_found(microkernels-prod)
   endif()
 
   append_torchlib_if_found(caffe2_protos protobuf-lite protobuf protoc)


@@ -111,7 +111,7 @@ else
 end
 
 puts "Linking static libraries..."
-libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
+libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
 frameworks = ['CoreML', 'Metal', 'MetalPerformanceShaders', 'Accelerate', 'UIKit']
 targets.each do |target|
   # NB: All these libraries and frameworks have already been linked by TestApp, adding them


@@ -40,7 +40,7 @@ end
 # link static libraries
 target.frameworks_build_phases.clear
-libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
+libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
 for lib in libs do
   path = "#{install_path}/lib/#{lib}"
   if File.exist?(path)

third_party/XNNPACK (vendored submodule)

@@ -1 +1 @@
-Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3
+Subproject commit 4ea82e595b36106653175dcb04b2aa532660d0d8

File diff suppressed because it is too large.

third_party/xnnpack_buck_shim.bzl (vendored, new file)

@@ -0,0 +1,33 @@
load(
    "//xplat/third-party/XNNPACK/XNNPACK:build_srcs.bzl",
    _LOGGING_SRCS = "LOGGING_SRCS",
    _OPERATOR_SRCS = "OPERATOR_SRCS",
    _SUBGRAPH_SRCS = "SUBGRAPH_SRCS",
    _TABLE_SRCS = "TABLE_SRCS",
    _XNNPACK_SRCS = "XNNPACK_SRCS",
)
load("//xplat/third-party/XNNPACK/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch")
load("//tools/build_defs:glob_defs.bzl", "subdir_glob")

def define_xnnpack_build_src(xnnpack_build_src):
    return ["XNNPACK/{}".format(src) for src in xnnpack_build_src]

def prod_srcs_for_arch_wrapper(arch):
    prod_srcs = prod_srcs_for_arch(arch)
    return define_xnnpack_build_src(prod_srcs)

def get_xnnpack_headers():
    src_headers = subdir_glob([
        ("XNNPACK/src", "**/*.h"),
    ])
    include_headers = subdir_glob([
        ("XNNPACK/include", "*.h"),
    ])
    return src_headers | include_headers

OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS)
SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS)
TABLE_SRCS = define_xnnpack_build_src(_TABLE_SRCS)
XNNPACK_SRCS = define_xnnpack_build_src(_XNNPACK_SRCS)
LOGGING_SRCS = define_xnnpack_build_src(_LOGGING_SRCS)

File diff suppressed because it is too large.

File diff suppressed because it is too large.