Update XNNPACK Version (#139913)

Updating XNNPACK Version to 4ea82e595b36106653175dcb04b2aa532660d0d8

submodule update
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139913
Approved by: https://github.com/digantdesai, https://github.com/huydhn
Author: Max Ren (2024-11-18 18:16:31 +00:00), committed by PyTorch MergeBot
parent e429a3b72e
commit cca34be584
15 changed files with 3429 additions and 1611 deletions
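Beyond the submodule bump, the diffs below migrate the PyTorch ops that used XNNPACK's one-shot operator API (xnn_create_* / xnn_reshape_* / xnn_setup_* / xnn_run_operator), namely quantized add and mul, hardswish, and global average pooling, to the subgraph plus runtime API, and they add the new libmicrokernels-prod static library next to libXNNPACK in the various link steps. Every migrated op follows the same pattern: create a subgraph, define the external input and output tensor values, define a unary, binary, or reduce node between them, then create, set up, and invoke a runtime. The standalone sketch below is not part of the commit; it assumes a plain C++ program against the XNNPACK headers (hence the explicit xnn_initialize call and the nullptr thread pool) and uses an arbitrary 8-element buffer, but it walks the same subgraph flow for a float hardswish.

#include <xnnpack.h>
#include <array>
#include <cstdio>

int main() {
  // Error checks elided for brevity; every call below returns an xnn_status.
  // One-time library initialization (inside PyTorch this is handled by
  // at::native::xnnpack::available(); a standalone program must call it).
  if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success) return 1;

  // 1. Create a subgraph with two external values: input (id 0) and output (id 1).
  xnn_subgraph_t subgraph = nullptr;
  xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph);

  const std::array<size_t, 1> shape = {8};
  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;

  // 2. Define the external input and output tensor values.
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          /*data=*/nullptr, /*external_id=*/0,
                          XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id);
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          /*data=*/nullptr, /*external_id=*/1,
                          XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id);

  // 3. Define the node; hardswish takes no extra params (nullptr).
  xnn_define_unary(subgraph, xnn_unary_hardswish, /*params=*/nullptr,
                   input_id, output_id, /*flags=*/0);

  // 4. Build a runtime from the subgraph (nullptr threadpool = single-threaded).
  xnn_runtime_t runtime = nullptr;
  xnn_create_runtime_v2(subgraph, /*threadpool=*/nullptr, /*flags=*/0, &runtime);

  // 5. Bind external buffers by value id, then run.
  std::array<float, 8> in = {-3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f};
  std::array<float, 8> out = {};
  std::array<xnn_external_value, 2> externals = {
      xnn_external_value{input_id, in.data()},
      xnn_external_value{output_id, out.data()}};
  xnn_setup_runtime(runtime, externals.size(), externals.data());
  xnn_invoke_runtime(runtime);

  printf("hardswish(1.0) = %f\n", out[4]);  // ~0.666667

  xnn_delete_runtime(runtime);
  xnn_delete_subgraph(subgraph);
  return 0;
}

The quantized add and mul paths in the diff differ from this sketch only in that they define their tensors with xnn_define_quantized_tensor_value (via the new xnnp_define_q_tensor helper) and a binary node with xnn_define_binary, while global average pooling uses xnn_define_static_reduce with xnn_reduce_mean over the spatial dimensions.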


@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
 cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
 # build a FAT bianry
 cd ${ZIP_DIR}/install/lib
-target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
 for lib in ${target_libs[*]}
 do
     if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then


@@ -112,6 +112,7 @@ if(ANDROID_ABI)
   import_static_lib(libc10)
   import_static_lib(libnnpack)
   import_static_lib(libXNNPACK)
+  import_static_lib(libmicrokernels-prod)
   import_static_lib(libpytorch_qnnpack)
   import_static_lib(libpthreadpool)
   import_static_lib(libeigen_blas)
@@ -129,6 +130,7 @@ if(ANDROID_ABI)
     libc10
     libnnpack
     libXNNPACK
+    libmicrokernels-prod
     libpytorch_qnnpack
     libpthreadpool
     libeigen_blas
@@ -151,6 +153,7 @@ else()
   if(USE_XNNPACK)
     list(APPEND pytorch_jni_LIBS XNNPACK)
+    list(APPEND pytorch_jni_LIBS microkernels-prod)
   endif()
   if(USE_SYSTEM_PTHREADPOOL)


@@ -234,62 +234,27 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
 #ifdef USE_XNNPACK
 C10_ALWAYS_INLINE
-enum xnn_status xnnp_create_add_nd(
-    int8_t azp,
-    float ascale,
-    int8_t bzp,
-    float bscale,
-    int8_t czp,
-    float cscale,
-    int8_t output_min,
-    int8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* op) {
-  return xnn_create_add_nd_qs8(
-      azp,        /* int8_t input1_zero_point   */
-      ascale,     /* float input1_scale         */
-      bzp,        /* int8_t input2_zero_point   */
-      bscale,     /* float input2_scale         */
-      czp,        /* int8_t output_zero_point   */
-      cscale,     /* float output_scale         */
-      output_min, /* int8_t output_min          */
-      output_max, /* int8_t output_max          */
-      flags,      /* uint32_t flags             */
-      op);        /* xnn_operator_t* add_op_out */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_add_nd(
-    xnn_operator_t op,
-    const std::vector<size_t>& a_shape,
-    const std::vector<size_t>& b_shape,
-    pthreadpool_t pt_pool) {
-  return xnn_reshape_add_nd_qs8(
-      op,             /* xnn_operator_t add_op      */
-      a_shape.size(), /* size_t num_input1_dims     */
-      a_shape.data(), /* const size_t* input1_shape */
-      b_shape.size(), /* size_t num_input2_dims     */
-      b_shape.data(), /* const size_t* input2_shape */
-      pt_pool);       /* pthreadpool_t threadpool   */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_setup_add_nd(
-    xnn_operator_t op,
-    const int8_t* da,
-    const int8_t* db,
-    int8_t* dc,
-    pthreadpool_t pt_pool) {
-  return xnn_setup_add_nd_qs8(
-      op, /* xnn_operator_t add_op */
-      da, /* const int8_t* input1  */
-      db, /* const int8_t* input2  */
-      dc); /* int8_t* output       */
-}
+enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
+  Tensor contig_tensor = tensor.contiguous(format);
+  const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
+  const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
+  const float scale = static_cast<float>(contig_tensor.q_scale());
+
+  return xnn_define_quantized_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_qint8,
+      zero_point,
+      scale,
+      tensor.ndimension(),
+      tensor_shape.data(),
+      nullptr,
+      external_id,
+      flags,
+      &id);
+}
 
 template <typename scalar_t, bool ReLUFused = false>
 Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
-  using underlying_t = typename scalar_t::underlying;
   const string func_name = "xnnp_add()";
   TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor.");
   TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available")
@@ -299,12 +264,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   auto qa_mem_format = qa.suggest_memory_format();
   Tensor qa_contig = qa.contiguous(qa_mem_format);
   Tensor qb_contig = qb.contiguous(qa_mem_format);
-  const auto a_zero_point = qa_contig.q_zero_point();
-  const auto b_zero_point = qb_contig.q_zero_point();
-  const auto a_scale = qa_contig.q_scale();
-  const auto b_scale = qb_contig.q_scale();
 
   Tensor qy = at::native::empty_affine_quantized(
       at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
       qa.scalar_type(),
@@ -319,72 +278,108 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
     return qy;
   }
 
-  xnn_operator_t xnnp_op = nullptr;
-  xnnpack_operator xnnp_add_operator;
-
-  auto output_max = std::numeric_limits<underlying_t>::max();
-  auto output_min = std::numeric_limits<underlying_t>::min();
+  auto output_max = std::numeric_limits<float>::infinity();
+  auto output_min = -std::numeric_limits<float>::infinity();
   if (ReLUFused) {
-    /*
-     * FIXME: use activationLimits<T>()
-     * With <T>, MSVC runs into "error C3862: identifier activationLimits not found".
-     */
-    constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
-    constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
-    int64_t qvalue = static_cast<int64_t>(zero_point);
-    qvalue = std::max<int64_t>(qvalue, qmin);
-    output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
+    output_min = 0;
   }
 
-  // Create an operator
-  auto status = xnnp_create_add_nd(
-      a_zero_point,
-      a_scale,
-      b_zero_point,
-      b_scale,
-      static_cast<underlying_t>(zero_point),
-      static_cast<float>(scale),
-      output_min,
-      output_max,
-      0,
-      &xnnp_op);
-  xnnp_add_operator = xnnpack_operator(xnnp_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  auto status = xnn_create_subgraph(
+      /*external_value_ids=*/3,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn create operator failed(", status,")!");
+      func_name, ": xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
 
-  const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
-  const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
+  uint32_t input0_id = XNN_INVALID_VALUE_ID, input1_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
 
-  // Reshape the operator
-  status = xnnp_reshape_add_nd(
-      xnnp_add_operator.get(),
-      qa_shape,
-      qb_shape,
-      caffe2::pthreadpool_());
+  // Defining the quantized input 0
+  status = xnnp_define_q_tensor(
+      qa,
+      qa_mem_format,
+      input0_id,
+      subgraph_ptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 0 failed(", status,")!");
+
+  // Defining the quantized input 1
+  status = xnnp_define_q_tensor(
+      qb,
+      qa_mem_format,
+      input1_id,
+      subgraph_ptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 1 failed(", status,")!");
+
+  // Defining the quantized output
+  status = xnnp_define_q_tensor(
+      qy,
+      qa_mem_format,
+      output_id,
+      subgraph_ptr,
+      2,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define output failed(", status,")!");
+
+  const struct xnn_binary_params binary_params = {output_min, output_max};
+  status = xnn_define_binary(
+      subgraph_ptr,
+      xnn_binary_add,
+      &binary_params,
+      input0_id,
+      input1_id,
+      output_id,
+      0);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn reshape operator failed(", status,")!");
+      func_name, ": xnn define binary add failed(", status,")!");
 
-  // Setup the operator
-  status = xnnp_setup_add_nd(
-      xnnp_add_operator.get(),
-      reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
-      caffe2::pthreadpool_());
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn setup operator failed(", status,")!");
+      func_name, ": xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      func_name, ": xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
 
-  // Run the operator
-  status = xnn_run_operator(
-      xnnp_add_operator.get(), /* xnn_operator_t op */
-      caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
+  std::array<xnn_external_value, 3> external = {
+      xnn_external_value{input0_id, reinterpret_cast<void*>(qa_contig.data_ptr<scalar_t>())},
+      xnn_external_value{input1_id, reinterpret_cast<void*>(qb_contig.data_ptr<scalar_t>())},
+      xnn_external_value{output_id, reinterpret_cast<void*>(qy.data_ptr<scalar_t>())}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn run operator failed(", status,")");
+      func_name, ": xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn invoke runtime failed(", status,")!");
 
   return qy;
 }
 #endif // USE_XNNPACK


@@ -13,7 +13,6 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/quantized/Quantizer.h>
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
-#include <torch/library.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -56,14 +55,32 @@ Tensor _mul_out(Tensor& out, const Tensor& self, const Tensor& other) {
 }
 
 #ifdef USE_XNNPACK
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
+  Tensor contig_tensor = tensor.contiguous(format);
+  const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
+  const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
+  const float scale = static_cast<float>(contig_tensor.q_scale());
+
+  return xnn_define_quantized_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_qint8,
+      zero_point,
+      scale,
+      tensor.ndimension(),
+      tensor_shape.data(),
+      nullptr,
+      external_id,
+      flags,
+      &id);
+}
+
 template <typename scalar_t, bool ReLUFused = false>
 Tensor _mul_out_xnnpack(
     const Tensor& self,
     const Tensor& other,
     double output_scale,
     int64_t output_zero_point) {
-  using underlying_t = typename scalar_t::underlying;
   const string func_name = "xnnp_mul()";
   TORCH_CHECK(self.ndimension() > 0, func_name, ": Got empty input tensor.");
   TORCH_CHECK(
@@ -89,96 +106,108 @@ Tensor _mul_out_xnnpack(
     return out;
   }
 
-  int64_t self_zero_point = self_contig.q_zero_point();
-  double self_scale = self_contig.q_scale();
-  int64_t other_zero_point = other_contig.q_zero_point();
-  double other_scale = other_contig.q_scale();
-
-  int64_t output_min = std::numeric_limits<underlying_t>::min();
-  int64_t output_max = std::numeric_limits<underlying_t>::max();
-  if(ReLUFused) {
-    /*
-     * FIXME: use activationLimits<T>()
-     * With <T>, MSVC runs into "error C3862: identifier activationLimits not
-     * found".
-     */
-    constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
-    constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
-    int64_t qvalue = static_cast<int64_t>(output_zero_point);
-    qvalue = std::max<int64_t>(qvalue, qmin);
-    output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
+  auto output_max = std::numeric_limits<float>::infinity();
+  auto output_min = -std::numeric_limits<float>::infinity();
+  if (ReLUFused) {
+    output_min = 0;
   }
 
-  xnn_operator_t xnnp_op = nullptr;
-  xnnpack_operator xnnp_qmul_operator;
-
-  // create xnnpack multiply operator ...
-  auto status = xnn_create_multiply_nd_qs8(
-      self_zero_point,
-      self_scale,
-      other_zero_point,
-      other_scale,
-      static_cast<underlying_t>(output_zero_point),
-      static_cast<float>(output_scale),
-      output_min,
-      output_max,
-      0,
-      &xnnp_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  auto status = xnn_create_subgraph(
+      /*external_value_ids=*/3,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn create operator failed(",
-      status,
-      ")!");
-  xnnp_qmul_operator = xnnpack_operator(xnnp_op);
+      func_name, ": xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
 
-  const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
-  const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
+  uint32_t input0_id = XNN_INVALID_VALUE_ID;
+  uint32_t input1_id = XNN_INVALID_VALUE_ID;
+  uint32_t output_id = XNN_INVALID_VALUE_ID;
 
-  // reshape operator
-  status = xnn_reshape_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
-      self_shape.size(),
-      self_shape.data(),
-      other_shape.size(),
-      other_shape.data(),
-      caffe2::pthreadpool_());
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn reshape operator failed(",
-      status,
-      ")!");
+  // Defining the quantized input 0
+  status = xnnp_define_q_tensor(
+      self,
+      qa_mem_format,
+      input0_id,
+      subgraph_ptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 0 failed(", status,")!");
 
-  // set up operator
-  status = xnn_setup_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
-      reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
-  );
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn setup operator failed(",
-      status,
-      ")!");
+  // Defining the quantized input 1
+  status = xnnp_define_q_tensor(
+      other,
+      qa_mem_format,
+      input1_id,
+      subgraph_ptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 1 failed(", status,")!");
+
+  // Defining the quantized output
+  status = xnnp_define_q_tensor(
+      out,
+      qa_mem_format,
+      output_id,
+      subgraph_ptr,
+      2,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define output failed(", status,")!");
+
+  const struct xnn_binary_params binary_params = {output_min, output_max};
+  status = xnn_define_binary(
+      subgraph_ptr,
+      xnn_binary_multiply,
+      &binary_params,
+      input0_id,
+      input1_id,
+      output_id,
+      0);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
+      func_name, ": xnn define binary add failed(", status,")!");
 
-  // Run the operator
-  status = xnn_run_operator(
-      xnnp_qmul_operator.get(), /* xnn_operator_t op */
-      caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn run operator failed(",
-      status,
-      ")");
+      func_name, ": xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      func_name, ": xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 3> external = {
+      xnn_external_value{input0_id, reinterpret_cast<void*>(self.data_ptr<scalar_t>())},
+      xnn_external_value{input1_id, reinterpret_cast<void*>(other.data_ptr<scalar_t>())},
+      xnn_external_value{output_id, reinterpret_cast<void*>(out.data_ptr<scalar_t>())}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn invoke runtime failed(", status,")!");
 
   return out;
 }


@@ -19,46 +19,84 @@ bool use_hardswish(
 static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
   using namespace internal;
 
-  xnn_operator_t hardswish_op{};
-  const xnn_status create_status = xnn_create_hardswish_nc_f32(
-      0, // flags
-      &hardswish_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/2,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
-      xnn_status_success == create_status,
-      "xnn_create_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
 
-  Operator hardswish_scoped_op(hardswish_op);
+  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
+  std::vector<size_t> input_output_shape(input.sizes().begin(), input.sizes().end());
 
-  const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
-      hardswish_op,
-      input.numel(), // Batch
-      1, // channels
-      1, // input stride
-      1, // output stride
-      caffe2::pthreadpool_()); // threadpool
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_output_shape.size(),
+      input_output_shape.data(),
+      nullptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT,
+      &input_id
+  );
   TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn input failed(", status,")!");
 
-  const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
-      hardswish_op,
-      input.data_ptr<float>(),
-      output.data_ptr<float>());
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_output_shape.size(),
+      input_output_shape.data(),
+      nullptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+      &output_id
+  );
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn output failed(", status,")!");
 
-  const xnn_status run_status = xnn_run_operator(
-      hardswish_op,
-      caffe2::pthreadpool_()); // threadpool
+  status = xnn_define_unary(
+      subgraph_ptr,
+      xnn_unary_hardswish,
+      nullptr,
+      input_id,
+      output_id,
+      0
+  );
 
-  TORCH_INTERNAL_ASSERT(
-      xnn_status_success == run_status,
-      "xnn_run_operator failed!");
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      "xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input.data_ptr<float>()},
+      xnn_external_value{output_id, output.data_ptr<float>()}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn invoke runtime failed(", status,")!");
 
   return output;
 }


@@ -7,6 +7,27 @@
 namespace at::native::xnnpack {
 
+inline std::vector<size_t> get_mem_format_aware_shape(const at::Tensor& in) {
+  const auto mem_format = in.suggest_memory_format();
+  const auto& sizes = in.sizes();
+  std::vector<size_t> ret(sizes.begin(), sizes.end());
+  if (mem_format == c10::MemoryFormat::ChannelsLast) {
+    // NCHW -> NHWC
+    // 0123 -> 0231
+    ret[1] = sizes[2]; /* H */
+    ret[2] = sizes[3]; /* W */
+    ret[3] = sizes[1]; /* C */
+  } else if (mem_format == c10::MemoryFormat::ChannelsLast3d) {
+    // NCDHW -> NDHWC
+    // 01234 -> 02341
+    ret[1] = sizes[2]; /* D */
+    ret[2] = sizes[3]; /* H */
+    ret[3] = sizes[4]; /* W */
+    ret[4] = sizes[1]; /* C */
+  }
+  return ret;
+}
+
 bool use_global_average_pool(const Tensor& input) {
   return xnnpack::available() && (1 <= input.ndimension()) &&
       (input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
@@ -31,63 +52,91 @@ Tensor global_average_pool(const Tensor& input) {
       MemoryFormat::ChannelsLast,
       input_padded_contig_nhwc.opt_names());
 
-  xnn_operator_t global_average_pooling_op{};
-  const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
-      -std::numeric_limits<float>::infinity(),
-      std::numeric_limits<float>::infinity(),
-      0 /* flags */,
-      &global_average_pooling_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/2,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
-      xnn_status_success == create_status,
-      "xnn_create_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
+  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
 
-  Operator global_avg_pool_scoped_op(global_average_pooling_op);
-  size_t workspace_size = 0;
-  size_t workspace_alignment = 0;
-
-  const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
-      global_average_pooling_op,
-      input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
-      input_padded_contig_nhwc.size(Layout::Activation4D::width) *
-          input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
-      input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
-      input_padded_contig_nhwc.size(
-          Layout::Activation4D::channels), // input stride
-      input_padded_contig_nhwc.size(
-          Layout::Activation4D::channels), // output stride
-      &workspace_size, // workspace_size
-      &workspace_alignment, // workspace_alignment
-      caffe2::pthreadpool_());
+  const auto& input_shape = get_mem_format_aware_shape(input_padded_contig_nhwc);
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_shape.size(),
+      input_shape.data(),
+      nullptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT,
+      &input_id
+  );
   TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn input failed(", status,")!");
 
-  // Create Workspace pointer, which we will align and pad with 16 bytes
-  size_t xnnpack_buffer_padding = 16;
-  std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
-  void* maybe_aligned_workspace = workspace_vector.data();
-  void* aligned_workspace =
-      (void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
-
-  const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
-      global_average_pooling_op,
-      aligned_workspace,
-      input_padded_contig_nhwc.data_ptr<float>(),
-      output.data_ptr<float>());
+  const auto& output_shape = get_mem_format_aware_shape(output);
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      output_shape.size(),
+      output_shape.data(),
+      nullptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+      &output_id
+  );
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn output failed(", status,")!");
 
-  const xnn_status run_status =
-      xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
+  std::vector<size_t> reduce_dims{1, 2};
+  status = xnn_define_static_reduce(
+      subgraph_ptr,
+      xnn_reduce_mean,
+      reduce_dims.size(),
+      reduce_dims.data(),
+      input_id,
+      output_id,
+      0
+  );
   TORCH_CHECK(
-      xnn_status_success == run_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn static reduce failed(", status,")!");
 
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      "xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input_padded_contig_nhwc.data_ptr<float>()},
+      xnn_external_value{output_id, output.data_ptr<float>()}};
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn setup runtime failed(", status,")!");
+
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn invoke runtime failed(", status,")!");
 
   return output.to(input.suggest_memory_format());
 }


@@ -516,6 +516,9 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   # Disable I8MM For CI since clang 9 does not support neon i8mm.
   set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
 
+  # Disable avxvnni int8
+  set(XNNPACK_ENABLE_AVXVNNIINT8 OFF CACHE BOOL "")
+
   # Older MSVC versions don't support AVX512FP. TODO Minimum version support?
   IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
     set(XNNPACK_ENABLE_AVX512FP16 OFF CACHE BOOL "")


@@ -94,6 +94,7 @@ else()
   if(@USE_XNNPACK@)
     append_torchlib_if_found(XNNPACK)
+    append_torchlib_if_found(microkernels-prod)
   endif()
 
   append_torchlib_if_found(caffe2_protos protobuf-lite protobuf protoc)


@@ -111,7 +111,7 @@ else
 end
 
 puts "Linking static libraries..."
-libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
+libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
 frameworks = ['CoreML', 'Metal', 'MetalPerformanceShaders', 'Accelerate', 'UIKit']
 targets.each do |target|
   # NB: All these libraries and frameworks have already been linked by TestApp, adding them


@@ -40,7 +40,7 @@ end
 # link static libraries
 target.frameworks_build_phases.clear
-libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
+libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
 for lib in libs do
   path = "#{install_path}/lib/#{lib}"
   if File.exist?(path)

third_party/XNNPACK (vendored submodule)

@@ -1 +1 @@
-Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3
+Subproject commit 4ea82e595b36106653175dcb04b2aa532660d0d8

File diff suppressed because it is too large.

third_party/xnnpack_buck_shim.bzl (vendored, new file)

@@ -0,0 +1,33 @@
load(
    "//xplat/third-party/XNNPACK/XNNPACK:build_srcs.bzl",
    _LOGGING_SRCS = "LOGGING_SRCS",
    _OPERATOR_SRCS = "OPERATOR_SRCS",
    _SUBGRAPH_SRCS = "SUBGRAPH_SRCS",
    _TABLE_SRCS = "TABLE_SRCS",
    _XNNPACK_SRCS = "XNNPACK_SRCS",
)
load("//xplat/third-party/XNNPACK/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch")
load("//tools/build_defs:glob_defs.bzl", "subdir_glob")

def define_xnnpack_build_src(xnnpack_build_src):
    return ["XNNPACK/{}".format(src) for src in xnnpack_build_src]

def prod_srcs_for_arch_wrapper(arch):
    prod_srcs = prod_srcs_for_arch(arch)
    return define_xnnpack_build_src(prod_srcs)

def get_xnnpack_headers():
    src_headers = subdir_glob([
        ("XNNPACK/src", "**/*.h"),
    ])
    include_headers = subdir_glob([
        ("XNNPACK/include", "*.h"),
    ])
    return src_headers | include_headers

OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS)
SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS)
TABLE_SRCS = define_xnnpack_build_src(_TABLE_SRCS)
XNNPACK_SRCS = define_xnnpack_build_src(_XNNPACK_SRCS)
LOGGING_SRCS = define_xnnpack_build_src(_LOGGING_SRCS)

File diff suppressed because it is too large.

File diff suppressed because it is too large.