build(aarch64): Update to oneDNN-3.7 + ACL-24.12

Bumps the aarch64-compatible oneDNN version to 3.7 and the ACL version
to 24.12. This brings better performance, improved memory management,
and numerous bug fixes over the previous, long-outdated versions
(oneDNN 3.2.1 and ACL 23.05.1).

Signed-off-by: Siddhartha Menon <siddhartha.menon@arm.com>
Author: Siddhartha Menon
Date:   2024-10-18 13:50:49 +00:00
Commit: a24a3a48f2 (parent 2765e59402)

36 changed files with 441 additions and 2337 deletions

@@ -241,17 +241,15 @@ build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0
build:mkl_threadpool --define=build_with_mkl_opensource=true
build:mkl_threadpool -c opt
# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
build:mkl_aarch64 --define=build_with_mkl_aarch64=true
build:mkl_aarch64 --define=build_with_openmp=true
build:mkl_aarch64 --define=build_with_acl=true
build:mkl_aarch64 -c opt
# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
# with Eigen threadpool support
build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
build:mkl_aarch64_threadpool --define=build_with_acl=true
build:mkl_aarch64_threadpool -c opt
# This is an alias for the mkl_aarch64_threadpool build.
build:mkl_aarch64 --config=mkl_aarch64_threadpool
# Default CUDA and CUDNN versions.
build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1"
build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"

@@ -21,7 +21,6 @@ load(
"if_mkl",
"if_mkl_ml",
"if_mkldnn_aarch64_acl",
"if_mkldnn_aarch64_acl_openmp",
"if_mkldnn_openmp",
"onednn_v3_define",
)
@@ -478,7 +477,6 @@ def tf_copts(
if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
onednn_v3_define() +
if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) +
if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
if_zendnn(["-DAMD_ZENDNN"]) +
if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) +
if_llvm_aarch32_available(["-DTF_LLVM_AARCH32_AVAILABLE=1"]) +

@@ -236,33 +236,23 @@ def _tf_repositories():
name = "mkl_dnn_acl_compatible",
build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
patch_file = [
"//third_party/mkl_dnn:onednn_acl_threadcap.patch",
"//third_party/mkl_dnn:onednn_acl_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
"//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
"//third_party/mkl_dnn:onednn_acl_indirect_conv.patch",
"//third_party/mkl_dnn:onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch",
"//third_party/mkl_dnn:onednn_acl_fix_segfault_during_postop_execute.patch",
"//third_party/mkl_dnn:onednn_acl_add_bf16_platform_support_check.patch",
"//third_party/mkl_dnn:onednn_acl_add_sbgemm_matmul_primitive_definition.patch",
"//third_party/mkl_dnn:onednn_acl_threadpool_default_max.patch",
],
sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
strip_prefix = "oneDNN-3.2.1",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
sha256 = "5792cbc07764c6e25c459ff68efb5cfcd7f4a0ba66dca6a4a2c681cd7a644596",
strip_prefix = "oneDNN-3.7",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.7.zip"),
)
tf_http_archive(
name = "compute_library",
patch_file = [
"//third_party/compute_library:compute_library.patch",
"//third_party/compute_library:acl_thread_local_scheduler.patch",
"//third_party/compute_library:exclude_omp_scheduler.patch",
"//third_party/compute_library:include_string.patch",
],
sha256 = "c4ca329a78da380163b2d86e91ba728349b6f0ee97d66e260a694ef37f0b0d93",
strip_prefix = "ComputeLibrary-23.05.1",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v23.05.1.tar.gz"),
sha256 = "8273f68cd0bb17e9231a11a6618d245eb6d623884ae681c00e7a4eabca2dad42",
strip_prefix = "ComputeLibrary-24.12",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/refs/tags/v24.12.tar.gz"),
)
tf_http_archive(

@@ -1,98 +0,0 @@
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f9..cf5e2bf4c 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -75,7 +75,7 @@ public:
private:
static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a5b9eca56..d1ab19397 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
_pImpl->is_prepared = false;
_pImpl->are_weights_const = weights->are_values_constant();
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee..03aef1632 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
if(run_optimised)
{
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
index 9c8563140..f7771945a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::Activation activation, const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
ARM_COMPUTE_UNUSED(c);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2a..f15ac2e22 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
#endif /* ARM_COMPUTE_*_SCHEDULER */
-std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
namespace
{
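
For context, the effect of the thread_local change above can be restated in a standalone sketch (hypothetical names, not part of the patch): each thread now observes its own _custom_scheduler slot, so two streams running on different threads can install different schedulers without racing or clobbering each other.

    // Minimal sketch of the thread_local static-member pattern used above.
    // MiniScheduler stands in for arm_compute::IScheduler (assumption).
    #include <iostream>
    #include <memory>
    #include <thread>

    struct MiniScheduler {
        int num_threads;
    };

    struct SchedulerRegistry {
        // One slot per thread, mirroring Scheduler::_custom_scheduler.
        static thread_local std::shared_ptr<MiniScheduler> custom;
    };
    thread_local std::shared_ptr<MiniScheduler> SchedulerRegistry::custom = nullptr;

    int main() {
        auto worker = [](int n) {
            SchedulerRegistry::custom = std::make_shared<MiniScheduler>(MiniScheduler{n});
            // Each thread sees only the scheduler it installed.
            std::cout << "this thread sees " << SchedulerRegistry::custom->num_threads
                      << " threads\n";
        };
        std::thread a(worker, 4), b(worker, 8);
        a.join();
        b.join();
    }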

@@ -1,8 +1,8 @@
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index bf71e534e2..22377f1a32 100644
index 547c98576..a31301230 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -971,7 +971,6 @@ filegroup(
@@ -1029,7 +1029,6 @@ filegroup(
"runtime/NEON/functions/NETranspose.cpp",
"runtime/NEON/functions/NEUnstack.cpp",
"runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
"runtime/OffsetLifetimeManager.cpp",
"runtime/OffsetMemoryPool.cpp",
"runtime/OperatorTensor.cpp",
@@ -984,6 +983,10 @@ filegroup(
"runtime/Tensor.cpp",
"runtime/TensorAllocator.cpp",
"runtime/Utils.cpp"] +
@@ -1058,6 +1057,10 @@ filegroup(
"runtime/experimental/operators/CpuSub.cpp",
"runtime/experimental/operators/CpuTranspose.cpp",
"runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
+ select({
+ "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
+ "//conditions:default": [],

@@ -9,13 +9,6 @@ _DNNL_COPTS_THREADPOOL = [
"-UUSE_CBLAS",
]
_DNNL_COPTS_OMP = [
"-fopenmp",
"-fexceptions",
"-UUSE_MKL",
"-UUSE_CBLAS",
]
_DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
@@ -63,61 +56,24 @@ _DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}
_DNNL_RUNTIME_OMP = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
"#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
"#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
"#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
"#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
"#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
"#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
"#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
"#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
"#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
"#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
"#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
"#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
"#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
"#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
"#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
"#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
"#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
"#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
"#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
"#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
"#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
"#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
"#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
"#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
"#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
"#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
"#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
"#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
"#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
"#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
"#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0",
"#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
"#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
"#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
"#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
"#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
"#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
"#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
"#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
"#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
"#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1",
"#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
"#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
"#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
"#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
"#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL",
"#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC",
"#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
"#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
"#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
"#cmakedefine01 BUILD_XE3": "#define BUILD_XE3 0",
}
expand_template(
name = "dnnl_config_h",
out = "include/oneapi/dnnl/dnnl_config.h",
substitutions = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP,
"//conditions:default": _DNNL_RUNTIME_THREADPOOL,
}),
template = "include/oneapi/dnnl/dnnl_config.h.in",
@@ -128,13 +84,21 @@ expand_template(
out = "include/oneapi/dnnl/dnnl_version.h",
substitutions = {
"@DNNL_VERSION_MAJOR@": "3",
"@DNNL_VERSION_MINOR@": "2",
"@DNNL_VERSION_PATCH@": "1",
"@DNNL_VERSION_HASH@": "N/A",
"@DNNL_VERSION_MINOR@": "7",
"@DNNL_VERSION_PATCH@": "0",
},
template = "include/oneapi/dnnl/dnnl_version.h.in",
)
expand_template(
name = "dnnl_version_hash_h",
out = "include/oneapi/dnnl/dnnl_version_hash.h",
substitutions = {
"@DNNL_VERSION_HASH@": "N/A",
},
template = "include/oneapi/dnnl/dnnl_version_hash.h.in",
)
cc_library(
name = "mkl_dnn_acl",
srcs = glob(
@@ -146,10 +110,11 @@ cc_library(
exclude = [
"src/cpu/x64/**",
"src/cpu/rv64/**",
"src/cpu/sycl/**",
"src/xpu/**",
],
),
copts = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP,
"//conditions:default": _DNNL_COPTS_THREADPOOL,
}),
defines = ["DNNL_AARCH64_USE_ACL=1"],
@@ -175,6 +140,7 @@ cc_library(
) + [
":dnnl_config_h",
":dnnl_version_h",
":dnnl_version_hash_h",
],
visibility = ["//visibility:public"],
deps = [
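
As a reading aid: the _DNNL_RUNTIME_THREADPOOL substitutions above stamp out include/oneapi/dnnl/dnnl_config.h, and oneDNN sources then branch on the resulting macros at compile time. A minimal sketch of how such a generated header is consumed (the numeric macro values are simplified stand-ins for oneDNN's runtime-kind constants, an assumption for illustration):

    // Sketch: compile-time dispatch on the generated dnnl_config.h macros.
    #include <cstdio>

    #define DNNL_RUNTIME_THREADPOOL 1
    #define DNNL_RUNTIME_OMP 2
    // With the _DNNL_RUNTIME_OMP map removed above, the Bazel build always
    // generates the threadpool value:
    #define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL

    int main() {
    #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
        std::puts("built for the Eigen threadpool runtime");
    #elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
        std::puts("built for the OpenMP runtime");
    #endif
    }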

@@ -1,31 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
index 65b887ea21..eabdb827bd 100644
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
return true;
#endif
+#elif DNNL_AARCH64_USE_ACL
+ return arm_compute::CPUInfo::get().has_bf16();
#else
return false;
#endif
--
2.34.1
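
The hunk above routes oneDNN's bf16 support query to ACL's runtime CPU detection. A hedged sketch of the same gate in isolation (requires linking against Compute Library; the header path is an assumption, but has_bf16() is the ACL call the patch itself makes):

    // Sketch: gate bf16 kernels on runtime CPU capability, as the patch does.
    #include "arm_compute/core/CPP/CPPTypes.h"
    #include <cstdio>

    bool bf16_supported() {
        // Same call the deleted patch added to oneDNN's has_data_type_support().
        return arm_compute::CPUInfo::get().has_bf16();
    }

    int main() {
        std::printf("bf16 %s\n", bf16_supported() ? "available" : "unavailable");
    }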

@@ -1,44 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index ab13efb9b2..ec261e156d 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_fp32_bf16_ok
+ = (utils::everyone_is(data_type::f32, src_md()->data_type,
+ dst_md()->data_type, desc()->accum_data_type)
+ && platform::has_data_type_support(data_type::f32)
+ && utils::everyone_is(
+ data_type::bf16, weights_md()->data_type)
+ && platform::has_data_type_support(
+ data_type::bf16));
+
const bool is_weights_md_format_ok
= utils::one_of(weights_format_kind_received,
format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
- && utils::one_of(true, is_fp32_ok, is_fp16_ok)
+ && utils::one_of(
+ true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
&& !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
--
2.34.1
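
Restated outside the diff, the acceptance test this deleted patch added is: f32 source, destination and accumulation combined with bf16 weights, with platform support for both types. A simplified predicate (the dt enum is a hypothetical stand-in for oneDNN's data_type_t):

    // Sketch of the is_fp32_bf16_ok condition from the deleted patch.
    #include <cstdio>

    enum class dt { f32, bf16, f16 };

    struct matmul_types {
        dt src, weights, dst, accum;
    };

    // Placeholder for platform::has_data_type_support().
    bool has_support(dt) { return true; }

    bool is_fp32_bf16_ok(const matmul_types &t) {
        return t.src == dt::f32 && t.dst == dt::f32 && t.accum == dt::f32
                && t.weights == dt::bf16
                && has_support(dt::f32) && has_support(dt::bf16);
    }

    int main() {
        matmul_types t {dt::f32, dt::bf16, dt::f32, dt::f32};
        std::printf("fp32:bf16 matmul accepted: %d\n", is_fp32_bf16_ok(t));
    }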

@@ -1,100 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 451cc78d52..ab13efb9b2 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {
status_t init(engine_t *engine) {
using smask_t = primitive_attr_t::skip_mask_t;
+ const format_kind_t weights_format_kind_received
+ = weights_md_.format_kind;
const bool is_fp32_ok
= utils::everyone_is(data_type::f32, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type,
@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_weights_md_format_ok
+ = utils::one_of(weights_format_kind_received,
+ format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
&& utils::one_of(true, is_fp32_ok, is_fp16_ok)
- && !has_zero_dim_memory()
- && weights_md_.format_kind == format_kind::any
+ && !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
smask_t::oscale | smask_t::post_ops)
&& attr_oscale_ok() && !has_runtime_dims_or_strides();
if (!ok) return status::unimplemented;
- CHECK(acl_matmul_utils::init_conf_matmul(
- amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
+ CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
+ dst_md_, *desc(), *attr(), weights_format_kind_received));
arm_compute::ActivationLayerInfo act_info;
CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index a314d96384..027f915a8a 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -27,7 +27,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr) {
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received) {
const memory_desc_wrapper src_d(&src_md);
const memory_desc_wrapper wei_d(&wei_md);
@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
for (dim_t i = K_dim - 1; i >= 0; --i)
batch_dims.push_back(i);
+ const memory_desc_t weights_md_received = wei_md;
acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
expected_weight_format, K_dim, N_dim, {}, batch_dims);
+ ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
+ && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
+ "specified blocked format not supported by ACL, use "
+ "format_kind_t::any to find a supported blocked format for "
+ "your platform");
+
return status::success;
}
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 67bb2e78eb..5ba4241abc 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -52,7 +52,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr);
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received);
} // namespace acl_matmul_utils
--
2.34.1
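
The core of the deleted patch above: snapshot the weights descriptor the caller passed in, let ACL rewrite it to the layout its kernels prefer, and reject only when the caller pinned a specific blocked layout that ACL would have to change. A simplified sketch (hypothetical descriptor struct; the real code compares full memory descriptors with dnnl_memory_desc_equal):

    // Sketch of the blocked-weight-format acceptance logic.
    #include <string>

    enum class format_kind { any, blocked };

    struct mem_desc {
        format_kind kind;
        std::string layout; // stands in for the full blocking descriptor
        bool operator==(const mem_desc &o) const { return layout == o.layout; }
    };

    // ACL overwrites the descriptor with the layout its kernels want.
    void reorder_to_weight_format(mem_desc &wei) { wei.layout = "Ab8a"; }

    bool init_weights(mem_desc &wei, format_kind requested) {
        const mem_desc received = wei; // snapshot before ACL rewrites it
        reorder_to_weight_format(wei);
        // A pinned blocked format is only OK if ACL happens to agree with it;
        // format_kind::any always succeeds because any result is acceptable.
        if (requested == format_kind::blocked && !(received == wei)) return false;
        return true;
    }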

@@ -1,50 +0,0 @@
From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
From: Kentaro Kawakami <kawakami.k@fujitsu.com>
Date: Fri, 26 May 2023 10:58:36 +0900
Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
Ubuntu 20.04
---
.../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 743843bae50..3db37e972d1 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -39,6 +39,13 @@
#include <asm/hwcap.h>
#endif
+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
+#ifdef AT_HWCAP2
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1UL << 14)
+#endif
+#endif
+
namespace Xbyak_aarch64 {
namespace util {
#define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
}
void setHwCap() {
- unsigned long hwcap = getauxval(AT_HWCAP);
+ const unsigned long hwcap = getauxval(AT_HWCAP);
if (hwcap & HWCAP_ATOMICS)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
if (hwcap & HWCAP_ASIMD)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
-#ifdef HWCAP2_BF16
- if (hwcap & HWCAP2_BF16)
+
+#ifdef AT_HWCAP2
+ const unsigned long hwcap2 = getauxval(AT_HWCAP2);
+ if (hwcap2 & HWCAP2_BF16)
type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
#endif
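
This patch works around Ubuntu 20.04 kernels whose headers predate HWCAP2_BF16. Outside the diff, the detection boils down to the following Linux-only sketch (the fallback constant is taken from the patch itself):

    // Sketch: detect BF16 via the AT_HWCAP2 auxiliary vector on Linux.
    #include <sys/auxv.h>
    #include <cstdio>

    #ifndef HWCAP2_BF16
    #define HWCAP2_BF16 (1UL << 14) // fallback for old kernel headers, as in the patch
    #endif

    int main() {
    #ifdef AT_HWCAP2
        const unsigned long hwcap2 = getauxval(AT_HWCAP2);
        std::printf("HWCAP2_BF16: %s\n", (hwcap2 & HWCAP2_BF16) ? "yes" : "no");
    #else
        std::puts("AT_HWCAP2 not available on this platform");
    #endif
    }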

@@ -1,96 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
index ea4bb200ec..3eb53b81bd 100644
--- a/src/cpu/aarch64/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -24,7 +24,7 @@ namespace aarch64 {
status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {
- int post_op_index = 0;
+ int post_op_index = post_op_start_index_;
// As these are post ops, this src will also be our dst. If we have a sum
// post op, the src/dst will start off in a temporary, then change to
diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
index 7b59ad71d3..ceaa95b73a 100644
--- a/src/cpu/aarch64/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -32,7 +32,9 @@ struct acl_post_ops_t {
// init the acl_post_ops_t. Note that this function modifies the passed in
// post ops by setting the preferred memory formats
status_t init(engine_t *engine, post_ops_t &post_ops,
- const memory_desc_t &dst_md) {
+ const memory_desc_t &dst_md, int post_op_start_index = 0) {
+
+ post_op_start_index_ = post_op_start_index;
CHECK(post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -41,7 +43,7 @@ struct acl_post_ops_t {
sum_index = -1;
post_op_primitives = {};
- for (int i = 0; i < post_ops.len(); i++) {
+ for (int i = post_op_start_index; i < post_ops.len(); i++) {
auto &po = post_ops.entry_[i];
if (po.is_sum()) {
@@ -135,7 +137,8 @@ struct acl_post_ops_t {
// formats
status_t init(engine_t *engine, post_ops_t &base_post_ops,
const memory_desc_t &dst_md,
- arm_compute::ActivationLayerInfo &act_info_to_fuse) {
+ arm_compute::ActivationLayerInfo &act_info_to_fuse,
+ int post_op_start_index = 0) {
CHECK(base_post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -149,18 +152,11 @@ struct acl_post_ops_t {
"eltwise post op scale must be 1 (no scale)");
CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));
- // Copy all but the first, because it has been fused
- post_ops_t post_ops;
- for (int idx = 1; idx < base_post_ops.len(); ++idx) {
- // Construct empty entry then copy, so that we can check for failure
- post_ops.entry_.emplace_back();
- post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
- }
- return init(engine, post_ops, dst_md);
-
+ // post_op_start_index + 1 to skip the fused eltwise
+ return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
} else {
// Nothing to fuse, just copy all post ops
- return init(engine, base_post_ops, dst_md);
+ return init(engine, base_post_ops, dst_md, post_op_start_index);
}
}
@@ -179,6 +175,9 @@ struct acl_post_ops_t {
private:
// Index of the sum post op if there is one, < 0 means no sum
int sum_index = -1;
+ // Index of the first post op this primitive executes. This is typically the
+ // number of post ops which were fused.
+ int post_op_start_index_ = 0;
data_type_t dst_data_type;
// Vector of primitives used to execute the post ops. They are constructed
// in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
--
2.34.1
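
The fix above replaces a copy of the "unfused" tail of the post-op list with a start index: execution simply begins at post_op_start_index_, so the stored list and the primitives built from it can no longer drift out of sync. The pattern in miniature (hypothetical names):

    // Sketch: skip fused leading post-ops by index instead of copying the tail.
    #include <cstdio>
    #include <vector>

    struct post_op { const char *name; };

    struct post_ops_runner {
        std::vector<post_op> ops;
        int start_index = 0; // number of leading ops already fused into the kernel

        void execute() const {
            for (int i = start_index; i < (int)ops.size(); ++i)
                std::printf("executing post-op %s\n", ops[i].name);
        }
    };

    int main() {
        // The eltwise op was fused into the GEMM, so execution starts at index 1.
        post_ops_runner r {{{"eltwise"}, {"sum"}, {"binary_mul"}}, 1};
        r.execute();
    }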

@@ -1,111 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 4a43b24c5..1a5cfe590 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -1,6 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
return cpu().isAtomicSupported();
}
-inline bool isa_has_bf16(cpu_isa_t isa) {
- return false;
+static inline bool mayiuse_bf16() {
+ using namespace Xbyak_aarch64::util;
+ return cpu().isBf16Supported();
}
-
} // namespace
/* whatever is required to generate string literals... */
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 6bd259ec2..5541bb702 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -1,7 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022-2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
bool ok = true && p.ndims > 0
&& utils::one_of(p.itype, f32, s32, data_type::s8, u8)
- && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
+ && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
&& utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
&& utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
- && simple_impl_desc_init(p, nullptr)
- && prb_has_small_strides(p);
+ && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
+ && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
return ok;
}
@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
cvt_v_s32_u8(startIdx, regNum);
if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
break;
+ case bf16:
+ if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
+ break;
default: assert(!"unreachable");
}
};
@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
}
+ void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
+ UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
+ }
+
void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
cvt_z_b_s(startIdx, regNum);
UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index ba5499ba9..d4e21d316 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,6 @@
/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))
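
For readers unfamiliar with the format: bf16 is simply the top 16 bits of an IEEE f32, which is why a dedicated f32-to-bf16 reorder path is cheap. A simplified scalar conversion (truncation shown for clarity; the bfcvtn instruction the JIT kernel above emits may round rather than truncate):

    // Sketch: scalar f32 -> bf16 by keeping the high 16 bits (truncation).
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    uint16_t f32_to_bf16_trunc(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof bits); // type-pun without UB
        return (uint16_t)(bits >> 16);       // sign + exponent + top 7 mantissa bits
    }

    int main() {
        std::printf("bf16(1.5f) = 0x%04x\n", f32_to_bf16_trunc(1.5f)); // 0x3fc0
    }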

@@ -1,31 +0,0 @@
*******************************************************************************
Copyright 2024 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index f043fee4bc..0384cce757 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
- // Indirect is slower than gemm for low thread counts, except for fast math
- if (dnnl_get_max_threads() < 28 && !acp.fast_math)
- return status::unimplemented;
-
// If we do not need to pad input channels for fast math mode then it would
// be faster to run convolution with im2row instead of using indirect kernel
int block_by = arm_compute::block_by(acp.weights_info.weight_format());

@@ -1,371 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
new file mode 100644
index 000000000..061751b55
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/aarch64/acl_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+ // Lock here is needed because resource_mapper does not support
+ // concurrent multithreaded access.
+ std::lock_guard<std::mutex> _lock {this->mtx};
+
+ auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
+ auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
+
+ // Retrieve primitive resource and configured Compute Library objects
+ auto *acl_resource
+ = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
+
+ acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
+
+ acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
+ acl_obj.dst_tensor.allocator()->import_memory(dst);
+
+ acl_obj.reorder.run();
+
+ acl_obj.src_tensor.allocator()->free();
+ acl_obj.dst_tensor.allocator()->free();
+
+ return status::success;
+}
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
new file mode 100644
index 0000000000..edbc38914d
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -0,0 +1,262 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
+
+#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/reorder/cpu_reorder_pd.hpp"
+#include "arm_compute/core/Types.h"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+struct acl_reorder_obj_t {
+ arm_compute::NEReorderLayer reorder;
+ arm_compute::Tensor src_tensor;
+ arm_compute::Tensor dst_tensor;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_conf_t {
+ arm_compute::TensorInfo src_info;
+ arm_compute::TensorInfo dst_info;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_resource_t : public resource_t {
+ acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
+
+ status_t configure(const acl_reorder_conf_t &app) {
+ if (!acl_obj_) return status::out_of_memory;
+
+ // Init Compute Library tensors based on info from descriptor
+ acl_obj_->src_tensor.allocator()->init(app.src_info);
+ acl_obj_->dst_tensor.allocator()->init(app.dst_info);
+
+ // clang-format off
+ acl_obj_->reorder.configure(
+ &acl_obj_->src_tensor,
+ &acl_obj_->dst_tensor,
+ app.src_wf,
+ app.dst_wf
+ );
+ // clang-format on
+
+ return status::success;
+ }
+
+ acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
+ DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
+
+private:
+ std::unique_ptr<acl_reorder_obj_t> acl_obj_;
+}; // acl_reorder_resource_t
+
+struct acl_reorder_fwd_t : public primitive_t {
+ using primitive_t::primitive_t;
+ struct pd_t : public cpu_reorder_pd_t {
+
+ using cpu_reorder_pd_t::cpu_reorder_pd_t;
+
+ DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
+
+ static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
+ const primitive_attr_t *attr, engine_t *src_engine,
+ const memory_desc_t *src_md, engine_t *dst_engine,
+ const memory_desc_t *dst_md) {
+
+ using namespace acl_utils;
+ // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
+
+ bool ok = src_md->data_type
+ == dst_md->data_type // ACL only supports matching src/dst data types
+ && utils::one_of(src_md->data_type,
+ data_type::f32) // Only supports f32 for now
+ && attr->has_default_values();
+ if (!ok) return status::unimplemented;
+
+ int mask = -1;
+ bool is_set = false;
+ // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
+ const memory_desc_wrapper input_d(src_md);
+ if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
+ return status::unimplemented;
+
+ // Create and check primitive descriptor
+ auto _pd = new pd_t(attr, src_engine->kind(), src_md,
+ dst_engine->kind(), dst_md);
+ if (_pd == nullptr) return status::out_of_memory;
+ if (_pd->init(engine, src_engine, dst_engine) != status::success) {
+ delete _pd;
+ return status::unimplemented;
+ }
+
+ const memory_desc_wrapper src_d(*src_md);
+ const memory_desc_wrapper dst_d(*dst_md);
+
+ const int ndims = src_d.ndims();
+
+ auto src_tag = memory_desc_matches_one_of_tag(
+ *src_md, format_tag::ba, format_tag::cdba);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, src_tag),
+ "");
+
+ arm_compute::TensorShape acl_tensor_shape_in;
+ arm_compute::TensorShape acl_tensor_shape_out;
+ // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
+ int dim_0_rounded_up;
+
+ // Switch for 2 or 4 dim tensors
+ switch(ndims)
+ {
+ // Currently for Ab4a and Ab8a
+ // No format_tag for these, have to deduce from stride
+ case 2:
+ {
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ int dst_dim_1 = dst_md->dims[1];
+ int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
+ int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
+ // Interleave of 4 or 8 that stride for dim 1
+ if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
+ return status::unimplemented;
+ }
+ // Check to ensure it's a blocking transpose
+ if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
+ return status::unimplemented;
+ }
+ if(dst_dim_1_stride == 4){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ } else {
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
+
+ break;
+ }
+ // Currently for Acdb4a and Acdb8a
+ case 4:
+ {
+
+ auto dst_tag = memory_desc_matches_one_of_tag(
+ *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, dst_tag),
+ "");
+ if(dst_tag == format_tag::Acdb4a){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ }
+ else{
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ // Currently only supporting AxBx1x1 cases
+ if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
+ return status::unimplemented;
+ }
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
+ break;
+ }
+ default:
+ return status::unimplemented;
+ }
+
+ // Choose the data layout
+ // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
+ const auto acl_layout = arm_compute::DataLayout::NCHW;
+
+ // Set Source WeightFormat
+ _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
+
+ // Create ACL tensor infos
+ const data_type_t data_type = src_d.data_type();
+ const arm_compute::DataType acl_data_t
+ = acl_utils::get_acl_data_t(data_type);
+ _pd->app_.src_info = arm_compute::TensorInfo(
+ acl_tensor_shape_in, 1, acl_data_t, acl_layout);
+ _pd->app_.dst_info = arm_compute::TensorInfo(
+ acl_tensor_shape_out, 1, acl_data_t, acl_layout);
+
+ // Init scratch memory, not used so 0 in this implementation
+ _pd->init_scratchpad_md();
+
+ return safe_ptr_assign(*reorder_pd, _pd);
+ } // create
+
+ friend dnnl::impl::impl_list_item_t;
+ acl_reorder_conf_t app_;
+
+ }; // pd_t
+
+ acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+ status_t create_resource(
+ engine_t *engine, resource_mapper_t &mapper) const override {
+ if (mapper.has_resource(this)) return status::success;
+
+ auto r = utils::make_unique<acl_reorder_resource_t>();
+ if (!r) return status::out_of_memory;
+
+ // Configure the resource based on information from primitive descriptor
+ CHECK(r->configure(pd()->app_));
+
+ mapper.add(this, std::move(r));
+ return status::success;
+ }
+
+ status_t execute(const exec_ctx_t &ctx) const override {
+ return execute_forward(ctx);
+ }
+
+private:
+ // To guard the const execute_forward, the mutex must be 'mutable'
+ mutable std::mutex mtx;
+ status_t execute_forward(const exec_ctx_t &ctx) const;
+ const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+
+
+}; // acl_reorder_fwd_t
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_REORDER_HPP
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a4150b619..f4d6b4de3 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -16,6 +16,7 @@
*******************************************************************************/
#include "cpu/reorder/cpu_reorder.hpp"
+#include "cpu/aarch64/acl_reorder.hpp"
namespace dnnl {
namespace impl {
@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
// f32 -> f32
{{f32, f32, 0}, {
REG_FAST_DIRECT_COPY_F32_F32
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
nullptr,
}},
{{f32, f32, 4}, {
+
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)
REG_FAST_DIRECT_COPY_F32_F32
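
A small reading aid for the rnd_up calls in the reorder above: the ACL reorder emits interleaved blocks of 4 or 8 rows (OHWIo4/OHWIo8), so the leading dimension is padded up to the block size before the output tensor shape is built. In isolation:

    // Sketch: round a dimension up to the next multiple of the block size,
    // as utils::rnd_up does for the OHWIo4/OHWIo8 output shapes above.
    #include <cstdio>

    long rnd_up(long v, long block) { return ((v + block - 1) / block) * block; }

    int main() {
        std::printf("%ld\n", rnd_up(10, 4)); // 12: ten rows pad to three 4-row blocks
        std::printf("%ld\n", rnd_up(10, 8)); // 16: ten rows pad to two 8-row blocks
    }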

@@ -1,97 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..bd7bed837 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
#endif
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_tp_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
+void acl_set_tp_scheduler(int intra_threads = 0) {
+ static thread_local std::once_flag flag_once;
// set CUSTOM scheduler in ACL
std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+ [&]() {
+ // Create threadpool scheduler
+ std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ threadpool_scheduler->set_num_threads(intra_threads);
+
+ arm_compute::Scheduler::set(threadpool_scheduler); });
}
void acl_set_threadpool_num_threads() {
@@ -102,14 +105,6 @@ void set_acl_threading() {
acl_set_benchmark_scheduler_default();
}
#endif
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
- if (verbose_has_profile_externals()) {
- acl_set_tp_benchmark_scheduler();
- } else {
- acl_set_tp_scheduler();
- }
-
-#endif
}
} // namespace acl_thread_utils
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index f073376e6..654a2aa5d 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
// Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_tp_scheduler();
+void acl_set_tp_scheduler(int intra_threads);
void acl_set_threadpool_num_threads();
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
void acl_set_tp_benchmark_scheduler();
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 439ca862e..6656c37a5 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 0bfec3871..7207b2b60 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
status_t cpu_engine_t::create_stream(stream_t **stream,
dnnl::threadpool_interop::threadpool_iface *threadpool) {
+ dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
return safe_ptr_assign<stream_t>(
*stream, new cpu_stream_t(this, threadpool));
}

@@ -1,43 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..2d7c76d48 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -17,6 +17,8 @@
#include "cpu/aarch64/acl_thread.hpp"
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "cpu/aarch64/acl_threadpool_scheduler.hpp"
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+#include <thread>
#endif
#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
@@ -30,9 +32,10 @@ namespace acl_thread_utils {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
void acl_thread_bind() {
static std::once_flag flag_once;
- // The threads in Compute Library are bound for the cores 0..max_threads-1
- // dnnl_get_max_threads() returns OMP_NUM_THREADS
- const int max_threads = dnnl_get_max_threads();
+ // Cap the number of threads to 90% of the total core count
+ // to ensure Compute Library doesn't use too much resource
+ int capped_threads = (int)std::floor(0.9*std::thread::hardware_concurrency());
+ const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
// arm_compute::Scheduler does not support concurrent access thus a
// workaround here restricts it to only one call
std::call_once(flag_once, [&]() {
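
The capping arithmetic this deleted patch applied, in isolation: take 90% of the machine's core count, then never exceed what oneDNN itself reports. A standalone sketch (dnnl_get_max_threads() is replaced by a stub so the snippet compiles on its own):

    // Sketch of the OMP thread-capping heuristic from the deleted patch.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <thread>

    int dnnl_get_max_threads_stub() { return 16; } // stand-in for dnnl_get_max_threads()

    int capped_max_threads() {
        const int capped = (int)std::floor(0.9 * std::thread::hardware_concurrency());
        return std::min(capped, dnnl_get_max_threads_stub());
    }

    int main() { std::printf("using %d threads\n", capped_max_threads()); }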

@@ -0,0 +1,180 @@
# *******************************************************************************
# Copyright 2025 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index 53175a05f9..89731cb356 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -83,17 +83,20 @@ void acl_set_threadpool_num_threads() {
}
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
void acl_set_tp_benchmark_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
- arm_compute::IScheduler *_real_scheduler = nullptr;
- _real_scheduler = threadpool_scheduler.release();
- // Create benchmark scheduler and set TP as real scheduler
- std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
- = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
- std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(benchmark_scheduler); });
+ static thread_local std::once_flag flag_once;
+ std::call_once(flag_once, [&]() {
+ // Create threadpool scheduler
+ std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ arm_compute::IScheduler *_real_scheduler = nullptr;
+ _real_scheduler = threadpool_scheduler.release();
+
+ // Create benchmark scheduler and set TP as real scheduler
+ std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+ = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+ arm_compute::Scheduler::set(benchmark_scheduler);
+ });
}
#endif
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 30910398d9..34cf44b7e2 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,24 +18,17 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-#include "cpu/aarch64/acl_thread.hpp"
-
#include "common/counting_barrier.hpp"
#include "common/dnnl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/IScheduler.h"
-// BARRIER
#include <atomic>
#include <cassert>
-#include <chrono>
#include <mutex>
-#include <thread>
-#include <condition_variable>
namespace dnnl {
namespace impl {
@@ -51,7 +44,7 @@ public:
/// Function to check the next element in the range if there is one.
bool get_next(unsigned int &next) {
- next = atomic_fetch_add_explicit(
+ next = std::atomic_fetch_add_explicit(
&_atomic_counter, 1u, std::memory_order_relaxed);
return next < _end;
}
@@ -70,11 +63,8 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads,
} while (feeder.get_next(workload_index));
}
-ThreadpoolScheduler::ThreadpoolScheduler() {
- using namespace dnnl::impl::threadpool_utils;
- // Set number of threads to one when threadpool is not available.
- _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint();
-}
+ThreadpoolScheduler::ThreadpoolScheduler()
+ : _num_threads(dnnl_get_max_threads()) {}
ThreadpoolScheduler::~ThreadpoolScheduler() = default;
@@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const {
}
void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
- _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
+ std::lock_guard<std::mutex> lock(this->_mtx);
+ _num_threads = num_threads == 0 ? dnnl_get_max_threads() : num_threads;
}
void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
@@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
+ std::lock_guard<std::mutex> lock(this->_mtx);
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
index e9ba21c803..384dfec1b9 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022, 2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,8 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "arm_compute/runtime/IScheduler.h"
-#include "support/Mutex.h"
+
+#include <mutex>
namespace dnnl {
namespace impl {
@@ -32,7 +33,7 @@ namespace aarch64 {
class ThreadpoolScheduler final : public arm_compute::IScheduler {
public:
ThreadpoolScheduler();
- ~ThreadpoolScheduler();
+ ~ThreadpoolScheduler() override;
/// Sets the number of threads the scheduler will use to run the kernels.
void set_num_threads(unsigned int num_threads) override;
@@ -54,8 +55,8 @@ protected:
void run_workloads(std::vector<Workload> &workloads) override;
private:
- uint _num_threads {};
- arm_compute::Mutex _run_workloads_mutex {};
+ unsigned int _num_threads {};
+ std::mutex _mtx;
};
} // namespace aarch64
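
Taken together, the hunks above make scheduler installation a per-thread, one-shot operation: the once_flag becomes thread_local and all scheduler construction moves inside the call_once body, so no thread can observe a half-built BenchmarkScheduler. A minimal stand-alone sketch of the pattern, with stand-in types replacing the real ACL classes (the names below are illustrative, not the ACL API):

#include <iostream>
#include <memory>
#include <mutex>
#include <thread>

struct IScheduler { virtual ~IScheduler() = default; };  // stand-in
struct ThreadpoolScheduler : IScheduler {};              // stand-in

struct Scheduler {                                       // stand-in
    // One custom scheduler per thread, mirroring the thread_local member
    // the companion ACL patch adds to arm_compute::Scheduler.
    static thread_local std::shared_ptr<IScheduler> custom;
    static void set(std::shared_ptr<IScheduler> s) { custom = std::move(s); }
};
thread_local std::shared_ptr<IScheduler> Scheduler::custom = nullptr;

void set_tp_scheduler_sketch() {
    // thread_local once_flag: each thread runs the lambda exactly once, and
    // construction happens entirely inside it.
    static thread_local std::once_flag flag_once;
    std::call_once(flag_once,
            [] { Scheduler::set(std::make_shared<ThreadpoolScheduler>()); });
}

int main() {
    std::thread t1(set_tp_scheduler_sketch), t2(set_tp_scheduler_sketch);
    t1.join();
    t2.join();
    std::cout << "each thread installed its own scheduler once\n";
}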

View File

@@ -1,98 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f9..cf5e2bf4c 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -75,7 +75,7 @@ public:
private:
static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a5b9eca56..d1ab19397 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
_pImpl->is_prepared = false;
_pImpl->are_weights_const = weights->are_values_constant();
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee..03aef1632 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
if(run_optimised)
{
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 9c8563140..f7771945a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::Activation activation, const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
ARM_COMPUTE_UNUSED(c);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2a..f15ac2e22 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
#endif /* ARM_COMPUTE_*_SCHEDULER */
-std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
namespace
{
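
This removed ACL-side patch was the counterpart of the old scheduler patch: it made Scheduler::_custom_scheduler thread_local and sourced configure-time thread counts from CPUInfo rather than the active scheduler, so configuration no longer depended on which thread's scheduler happened to be installed. A rough stand-alone analogue of that switch, with std::thread::hardware_concurrency() standing in for CPUInfo::get_cpu_num() (an assumed equivalence, for illustration only):

#include <cstdio>
#include <thread>

unsigned int configure_time_num_threads() {
    // A property of the machine, not of whichever thread-local scheduler
    // happens to be installed on the calling thread.
    unsigned int n = std::thread::hardware_concurrency();
    return n == 0 ? 1u : n;  // hardware_concurrency() may report 0
}

int main() {
    std::printf("configuring for %u threads\n", configure_time_num_threads());
    return 0;
}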

View File

@@ -1,8 +1,8 @@
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index bf71e534e2..22377f1a32 100644
index 547c98576..a31301230 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -971,7 +971,6 @@ filegroup(
@@ -1029,7 +1029,6 @@ filegroup(
"runtime/NEON/functions/NETranspose.cpp",
"runtime/NEON/functions/NEUnstack.cpp",
"runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
"runtime/OffsetLifetimeManager.cpp",
"runtime/OffsetMemoryPool.cpp",
"runtime/OperatorTensor.cpp",
@@ -984,6 +983,10 @@ filegroup(
"runtime/Tensor.cpp",
"runtime/TensorAllocator.cpp",
"runtime/Utils.cpp"] +
@@ -1058,6 +1057,10 @@ filegroup(
"runtime/experimental/operators/CpuSub.cpp",
"runtime/experimental/operators/CpuTranspose.cpp",
"runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
+ select({
+ "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
+ "//conditions:default": [],

View File

@@ -9,13 +9,6 @@ _DNNL_COPTS_THREADPOOL = [
"-UUSE_CBLAS",
]
_DNNL_COPTS_OMP = [
"-fopenmp",
"-fexceptions",
"-UUSE_MKL",
"-UUSE_CBLAS",
]
_DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
@@ -63,61 +56,23 @@ _DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}
_DNNL_RUNTIME_OMP = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
"#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
"#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
"#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
"#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
"#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
"#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
"#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
"#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
"#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
"#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
"#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
"#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
"#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
"#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
"#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
"#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
"#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
"#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
"#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
"#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
"#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
"#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
"#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
"#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
"#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
"#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
"#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
"#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
"#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
"#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
"#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0",
"#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
"#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
"#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
"#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
"#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
"#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
"#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
"#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
"#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
"#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1",
"#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
"#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
"#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
"#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
"#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL",
"#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC",
"#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
"#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
"#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
}
expand_template(
name = "dnnl_config_h",
out = "include/oneapi/dnnl/dnnl_config.h",
substitutions = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP,
"//conditions:default": _DNNL_RUNTIME_THREADPOOL,
}),
template = "include/oneapi/dnnl/dnnl_config.h.in",
@@ -128,13 +83,21 @@ expand_template(
out = "include/oneapi/dnnl/dnnl_version.h",
substitutions = {
"@DNNL_VERSION_MAJOR@": "3",
"@DNNL_VERSION_MINOR@": "2",
"@DNNL_VERSION_PATCH@": "1",
"@DNNL_VERSION_HASH@": "N/A",
"@DNNL_VERSION_MINOR@": "7",
"@DNNL_VERSION_PATCH@": "0",
},
template = "include/oneapi/dnnl/dnnl_version.h.in",
)
expand_template(
name = "dnnl_version_hash_h",
out = "include/oneapi/dnnl/dnnl_version_hash.h",
substitutions = {
"@DNNL_VERSION_HASH@": "N/A",
},
template = "include/oneapi/dnnl/dnnl_version_hash.h.in",
)
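
The expand_template rules above stamp out oneDNN's generated configuration and version headers, with the threadpool runtime now the only CPU threading option. A quick sanity check of the generated values from C++, assuming the generated include/ directory is on the include path:

#include <iostream>

#include "oneapi/dnnl/dnnl_config.h"
#include "oneapi/dnnl/dnnl_version.h"

int main() {
    // These macros come straight from the substitutions above.
    std::cout << "oneDNN " << DNNL_VERSION_MAJOR << '.' << DNNL_VERSION_MINOR
              << '.' << DNNL_VERSION_PATCH << '\n';
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    std::cout << "CPU threading runtime: Eigen threadpool\n";
#endif
    return 0;
}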
cc_library(
name = "mkl_dnn_acl",
srcs = glob(
@@ -146,10 +109,11 @@ cc_library(
exclude = [
"src/cpu/x64/**",
"src/cpu/rv64/**",
"src/cpu/sycl/**",
"src/xpu/**",
],
),
copts = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP,
"//conditions:default": _DNNL_COPTS_THREADPOOL,
}),
defines = ["DNNL_AARCH64_USE_ACL=1"],
@@ -175,6 +139,7 @@ cc_library(
) + [
":dnnl_config_h",
":dnnl_version_h",
":dnnl_version_hash_h",
],
visibility = ["//visibility:public"],
deps = [

View File

@@ -1,31 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
index 65b887ea21..eabdb827bd 100644
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
return true;
#endif
+#elif DNNL_AARCH64_USE_ACL
+ return arm_compute::CPUInfo::get().has_bf16();
#else
return false;
#endif
--
2.34.1
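
The deleted hunk above gated bf16 support on ACL's runtime CPU detection. From the public oneDNN 3.x C++ API, the same question can be asked indirectly by probing primitive creation and treating "unimplemented" as unsupported. A sketch of that probe (placeholder shapes, not the library's own check):

#include <oneapi/dnnl/dnnl.hpp>

// Returns true when this build/CPU combination accepts a bf16 matmul.
bool bf16_matmul_supported(const dnnl::engine &eng) {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;
    dnnl::memory::desc a({8, 8}, dt::bf16, tag::ab);
    dnnl::memory::desc b({8, 8}, dt::bf16, tag::ab);
    dnnl::memory::desc c({8, 8}, dt::f32, tag::ab);
    try {
        dnnl::matmul::primitive_desc pd(eng, a, b, c);
        return true;
    } catch (const dnnl::error &) {
        return false;  // typically status::unimplemented on older cores
    }
}

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    return bf16_matmul_supported(eng) ? 0 : 1;
}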

View File

@@ -1,44 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index ab13efb9b2..ec261e156d 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_fp32_bf16_ok
+ = (utils::everyone_is(data_type::f32, src_md()->data_type,
+ dst_md()->data_type, desc()->accum_data_type)
+ && platform::has_data_type_support(data_type::f32)
+ && utils::everyone_is(
+ data_type::bf16, weights_md()->data_type)
+ && platform::has_data_type_support(
+ data_type::bf16));
+
const bool is_weights_md_format_ok
= utils::one_of(weights_format_kind_received,
format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
- && utils::one_of(true, is_fp32_ok, is_fp16_ok)
+ && utils::one_of(
+ true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
&& !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
--
2.34.1
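
The is_fp32_bf16_ok condition above admitted the mixed-precision ("sbgemm"-style) matmul: f32 source and destination with bf16 weights. Through the public API, the accepted descriptor combination looks roughly like this sketch (placeholder dimensions; primitive creation throws dnnl::error where bf16 is unsupported):

#include <oneapi/dnnl/dnnl.hpp>

int main() {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;
    const dnnl::memory::dim M = 64, K = 64, N = 64;

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::memory::desc src({M, K}, dt::f32, tag::ab);    // f32 activations
    dnnl::memory::desc wei({K, N}, dt::bf16, tag::any);  // bf16 weights
    dnnl::memory::desc dst({M, N}, dt::f32, tag::ab);    // f32 output

    // Succeeds only where the deleted check would have returned true.
    dnnl::matmul::primitive_desc pd(eng, src, wei, dst);
    return 0;
}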

View File

@@ -1,100 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 451cc78d52..ab13efb9b2 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {
status_t init(engine_t *engine) {
using smask_t = primitive_attr_t::skip_mask_t;
+ const format_kind_t weights_format_kind_received
+ = weights_md_.format_kind;
const bool is_fp32_ok
= utils::everyone_is(data_type::f32, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type,
@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_weights_md_format_ok
+ = utils::one_of(weights_format_kind_received,
+ format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
&& utils::one_of(true, is_fp32_ok, is_fp16_ok)
- && !has_zero_dim_memory()
- && weights_md_.format_kind == format_kind::any
+ && !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
smask_t::oscale | smask_t::post_ops)
&& attr_oscale_ok() && !has_runtime_dims_or_strides();
if (!ok) return status::unimplemented;
- CHECK(acl_matmul_utils::init_conf_matmul(
- amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
+ CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
+ dst_md_, *desc(), *attr(), weights_format_kind_received));
arm_compute::ActivationLayerInfo act_info;
CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index a314d96384..027f915a8a 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -27,7 +27,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr) {
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received) {
const memory_desc_wrapper src_d(&src_md);
const memory_desc_wrapper wei_d(&wei_md);
@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
for (dim_t i = K_dim - 1; i >= 0; --i)
batch_dims.push_back(i);
+ const memory_desc_t weights_md_received = wei_md;
acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
expected_weight_format, K_dim, N_dim, {}, batch_dims);
+ ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
+ && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
+ "specified blocked format not supported by ACL, use "
+ "format_kind_t::any to find a supported blocked format for "
+ "your platform");
+
return status::success;
}
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 67bb2e78eb..5ba4241abc 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -52,7 +52,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr);
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received);
} // namespace acl_matmul_utils
--
2.34.1
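
The ACL_CHECK_SUPPORT message above steers callers toward format_kind::any instead of hand-picked blocked layouts. In API terms that means requesting format_tag::any for the weights and then querying the descriptor oneDNN actually chose, e.g.:

#include <oneapi/dnnl/dnnl.hpp>

int main() {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::memory::desc src({64, 64}, dt::f32, tag::ab);
    dnnl::memory::desc wei({64, 64}, dt::f32, tag::any);  // let the backend choose
    dnnl::memory::desc dst({64, 64}, dt::f32, tag::ab);

    dnnl::matmul::primitive_desc pd(eng, src, wei, dst);
    // The blocked layout selected for this platform; reorder user weights
    // into this descriptor before execution.
    dnnl::memory::desc chosen = pd.weights_desc();
    (void)chosen;
    return 0;
}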

View File

@@ -1,50 +0,0 @@
From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
From: Kentaro Kawakami <kawakami.k@fujitsu.com>
Date: Fri, 26 May 2023 10:58:36 +0900
Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
Ubuntu 20.04
---
.../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 743843bae50..3db37e972d1 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -39,6 +39,13 @@
#include <asm/hwcap.h>
#endif
+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
+#ifdef AT_HWCAP2
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1UL << 14)
+#endif
+#endif
+
namespace Xbyak_aarch64 {
namespace util {
#define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
}
void setHwCap() {
- unsigned long hwcap = getauxval(AT_HWCAP);
+ const unsigned long hwcap = getauxval(AT_HWCAP);
if (hwcap & HWCAP_ATOMICS)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
if (hwcap & HWCAP_ASIMD)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
-#ifdef HWCAP2_BF16
- if (hwcap & HWCAP2_BF16)
+
+#ifdef AT_HWCAP2
+ const unsigned long hwcap2 = getauxval(AT_HWCAP2);
+ if (hwcap2 & HWCAP2_BF16)
type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
#endif
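
The fix above reads AT_HWCAP and AT_HWCAP2 separately and supplies the HWCAP2_BF16 bit missing from Ubuntu 20.04's kernel headers. The detection reduces to this stand-alone check (aarch64 Linux only):

#include <cstdio>
#include <sys/auxv.h>  // getauxval, AT_HWCAP2

// Older kernel headers (e.g. Ubuntu 20.04) lack this; bit 14 per the patch.
#ifndef HWCAP2_BF16
#define HWCAP2_BF16 (1UL << 14)
#endif

int main() {
    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
    std::printf("BF16 instructions: %s\n",
            (hwcap2 & HWCAP2_BF16) ? "available" : "not available");
    return 0;
}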

View File

@@ -1,96 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
index ea4bb200ec..3eb53b81bd 100644
--- a/src/cpu/aarch64/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -24,7 +24,7 @@ namespace aarch64 {
status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {
- int post_op_index = 0;
+ int post_op_index = post_op_start_index_;
// As these are post ops, this src will also be our dst. If we have a sum
// post op, the src/dst will start off in a temporary, then change to
diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
index 7b59ad71d3..ceaa95b73a 100644
--- a/src/cpu/aarch64/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -32,7 +32,9 @@ struct acl_post_ops_t {
// init the acl_post_ops_t. Note that this function modifies the passed in
// post ops by setting the preferred memory formats
status_t init(engine_t *engine, post_ops_t &post_ops,
- const memory_desc_t &dst_md) {
+ const memory_desc_t &dst_md, int post_op_start_index = 0) {
+
+ post_op_start_index_ = post_op_start_index;
CHECK(post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -41,7 +43,7 @@ struct acl_post_ops_t {
sum_index = -1;
post_op_primitives = {};
- for (int i = 0; i < post_ops.len(); i++) {
+ for (int i = post_op_start_index; i < post_ops.len(); i++) {
auto &po = post_ops.entry_[i];
if (po.is_sum()) {
@@ -135,7 +137,8 @@ struct acl_post_ops_t {
// formats
status_t init(engine_t *engine, post_ops_t &base_post_ops,
const memory_desc_t &dst_md,
- arm_compute::ActivationLayerInfo &act_info_to_fuse) {
+ arm_compute::ActivationLayerInfo &act_info_to_fuse,
+ int post_op_start_index = 0) {
CHECK(base_post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -149,18 +152,11 @@ struct acl_post_ops_t {
"eltwise post op scale must be 1 (no scale)");
CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));
- // Copy all but the first, because it has been fused
- post_ops_t post_ops;
- for (int idx = 1; idx < base_post_ops.len(); ++idx) {
- // Construct empty entry then copy, so that we can check for failure
- post_ops.entry_.emplace_back();
- post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
- }
- return init(engine, post_ops, dst_md);
-
+ // post_op_start_index + 1 to skip the fused eltwise
+ return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
} else {
// Nothing to fuse, just copy all post ops
- return init(engine, base_post_ops, dst_md);
+ return init(engine, base_post_ops, dst_md, post_op_start_index);
}
}
@@ -179,6 +175,9 @@ struct acl_post_ops_t {
private:
// Index of the sum post op if there is one, < 0 means no sum
int sum_index = -1;
+ // Index of the first post op this primitive executes. This is typically the
+ // number of post ops which were fused.
+ int post_op_start_index_ = 0;
data_type_t dst_data_type;
// Vector of primitives used to execute the post ops. They are constructed
// in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
--
2.34.1
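
The refactor above stops copying the post-op tail after fusing the leading eltwise and instead records a start index into the original list, avoiding the entry copies implicated in the segfault this patch fixed. The bookkeeping in isolation:

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Post-ops as specified by the user; the leading eltwise gets fused
    // into the primitive itself.
    std::vector<std::string> post_ops = {"relu", "sum", "binary_mul"};
    const int post_op_start_index = 1;  // number of fused post-ops

    // Execute only the non-fused tail; the list itself is never copied.
    for (int i = post_op_start_index; i < (int)post_ops.size(); ++i)
        std::printf("executing post-op %d: %s\n", i, post_ops[i].c_str());
    return 0;
}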

View File

@@ -1,111 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 4a43b24c5..1a5cfe590 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -1,6 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
return cpu().isAtomicSupported();
}
-inline bool isa_has_bf16(cpu_isa_t isa) {
- return false;
+static inline bool mayiuse_bf16() {
+ using namespace Xbyak_aarch64::util;
+ return cpu().isBf16Supported();
}
-
} // namespace
/* whatever is required to generate string literals... */
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 6bd259ec2..5541bb702 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -1,7 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022-2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
bool ok = true && p.ndims > 0
&& utils::one_of(p.itype, f32, s32, data_type::s8, u8)
- && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
+ && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
&& utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
&& utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
- && simple_impl_desc_init(p, nullptr)
- && prb_has_small_strides(p);
+ && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
+ && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
return ok;
}
@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
cvt_v_s32_u8(startIdx, regNum);
if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
break;
+ case bf16:
+ if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
+ break;
default: assert(!"unreachable");
}
};
@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
}
+ void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
+ UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
+ }
+
void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
cvt_z_b_s(startIdx, regNum);
UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index ba5499ba9..d4e21d316 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,6 @@
/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))

View File

@@ -1,31 +0,0 @@
*******************************************************************************
Copyright 2024 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index f043fee4bc..0384cce757 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
- // Indirect is slower than gemm for low thread counts, except for fast math
- if (dnnl_get_max_threads() < 28 && !acp.fast_math)
- return status::unimplemented;
-
// If we do not need to pad input channels for fast math mode then it would
// be faster to run convolution with im2row instead of using indirect kernel
int block_by = arm_compute::block_by(acp.weights_info.weight_format());

View File

@@ -1,371 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
new file mode 100644
index 000000000..061751b55
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/aarch64/acl_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+ // Lock here is needed because resource_mapper does not support
+ // concurrent multithreaded access.
+ std::lock_guard<std::mutex> _lock {this->mtx};
+
+ auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
+ auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
+
+ // Retrieve primitive resource and configured Compute Library objects
+ auto *acl_resource
+ = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
+
+ acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
+
+ acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
+ acl_obj.dst_tensor.allocator()->import_memory(dst);
+
+ acl_obj.reorder.run();
+
+ acl_obj.src_tensor.allocator()->free();
+ acl_obj.dst_tensor.allocator()->free();
+
+ return status::success;
+}
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
new file mode 100644
index 0000000000..edbc38914d
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -0,0 +1,262 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
+
+#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/reorder/cpu_reorder_pd.hpp"
+#include "arm_compute/core/Types.h"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+struct acl_reorder_obj_t {
+ arm_compute::NEReorderLayer reorder;
+ arm_compute::Tensor src_tensor;
+ arm_compute::Tensor dst_tensor;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_conf_t {
+ arm_compute::TensorInfo src_info;
+ arm_compute::TensorInfo dst_info;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_resource_t : public resource_t {
+ acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
+
+ status_t configure(const acl_reorder_conf_t &app) {
+ if (!acl_obj_) return status::out_of_memory;
+
+ // Init Compute Library tensors based on info from descriptor
+ acl_obj_->src_tensor.allocator()->init(app.src_info);
+ acl_obj_->dst_tensor.allocator()->init(app.dst_info);
+
+ // clang-format off
+ acl_obj_->reorder.configure(
+ &acl_obj_->src_tensor,
+ &acl_obj_->dst_tensor,
+ app.src_wf,
+ app.dst_wf
+ );
+ // clang-format on
+
+ return status::success;
+ }
+
+ acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
+ DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
+
+private:
+ std::unique_ptr<acl_reorder_obj_t> acl_obj_;
+}; // acl_reorder_resource_t
+
+struct acl_reorder_fwd_t : public primitive_t {
+ using primitive_t::primitive_t;
+ struct pd_t : public cpu_reorder_pd_t {
+
+ using cpu_reorder_pd_t::cpu_reorder_pd_t;
+
+ DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
+
+ static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
+ const primitive_attr_t *attr, engine_t *src_engine,
+ const memory_desc_t *src_md, engine_t *dst_engine,
+ const memory_desc_t *dst_md) {
+
+ using namespace acl_utils;
+ // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
+
+ bool ok = src_md->data_type
+ == dst_md->data_type // ACL only supports matching src/dst data types
+ && utils::one_of(src_md->data_type,
+ data_type::f32) // Only supports f32 for now
+ && attr->has_default_values();
+ if (!ok) return status::unimplemented;
+
+ int mask = -1;
+ bool is_set = false;
+ // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
+ const memory_desc_wrapper input_d(src_md);
+ if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
+ return status::unimplemented;
+
+ // Create and check primitive descriptor
+ auto _pd = new pd_t(attr, src_engine->kind(), src_md,
+ dst_engine->kind(), dst_md);
+ if (_pd == nullptr) return status::out_of_memory;
+ if (_pd->init(engine, src_engine, dst_engine) != status::success) {
+ delete _pd;
+ return status::unimplemented;
+ }
+
+ const memory_desc_wrapper src_d(*src_md);
+ const memory_desc_wrapper dst_d(*dst_md);
+
+ const int ndims = src_d.ndims();
+
+ auto src_tag = memory_desc_matches_one_of_tag(
+ *src_md, format_tag::ba, format_tag::cdba);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, src_tag),
+ "");
+
+ arm_compute::TensorShape acl_tensor_shape_in;
+ arm_compute::TensorShape acl_tensor_shape_out;
+ // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
+ int dim_0_rounded_up;
+
+ // Switch for 2 or 4 dim tensors
+ switch(ndims)
+ {
+ // Currently for Ab4a and Ab8a
+ // No format_tag for these, have to deduce from stride
+ case 2:
+ {
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ int dst_dim_1 = dst_md->dims[1];
+ int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
+ int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
+ // Interleave of 4 or 8 that stride for dim 1
+ if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
+ return status::unimplemented;
+ }
+ // Check to ensure it's a blocking transpose
+ if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
+ return status::unimplemented;
+ }
+ if(dst_dim_1_stride == 4){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ } else {
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
+
+ break;
+ }
+ // Currently for Acdb4a and Acdb8a
+ case 4:
+ {
+
+ auto dst_tag = memory_desc_matches_one_of_tag(
+ *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, dst_tag),
+ "");
+ if(dst_tag == format_tag::Acdb4a){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ }
+ else{
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ // Currently only supporting AxBx1x1 cases
+ if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
+ return status::unimplemented;
+ }
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
+ break;
+ }
+ default:
+ return status::unimplemented;
+ }
+
+ // Choose the data layout
+ // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
+ const auto acl_layout = arm_compute::DataLayout::NCHW;
+
+ // Set Source WeightFormat
+ _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
+
+ // Create ACL tensor infos
+ const data_type_t data_type = src_d.data_type();
+ const arm_compute::DataType acl_data_t
+ = acl_utils::get_acl_data_t(data_type);
+ _pd->app_.src_info = arm_compute::TensorInfo(
+ acl_tensor_shape_in, 1, acl_data_t, acl_layout);
+ _pd->app_.dst_info = arm_compute::TensorInfo(
+ acl_tensor_shape_out, 1, acl_data_t, acl_layout);
+
+ // Init scratch memory, not used so 0 in this implementation
+ _pd->init_scratchpad_md();
+
+ return safe_ptr_assign(*reorder_pd, _pd);
+ } // create
+
+ friend dnnl::impl::impl_list_item_t;
+ acl_reorder_conf_t app_;
+
+ }; // pd_t
+
+ acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+ status_t create_resource(
+ engine_t *engine, resource_mapper_t &mapper) const override {
+ if (mapper.has_resource(this)) return status::success;
+
+ auto r = utils::make_unique<acl_reorder_resource_t>();
+ if (!r) return status::out_of_memory;
+
+ // Configure the resource based on information from primitive descriptor
+ CHECK(r->configure(pd()->app_));
+
+ mapper.add(this, std::move(r));
+ return status::success;
+ }
+
+ status_t execute(const exec_ctx_t &ctx) const override {
+ return execute_forward(ctx);
+ }
+
+private:
+ // To guard the const execute_forward, the mutex must be 'mutable'
+ mutable std::mutex mtx;
+ status_t execute_forward(const exec_ctx_t &ctx) const;
+ const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+
+
+}; // acl_reorder_fwd_t
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_REORDER_HPP
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a4150b619..f4d6b4de3 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -16,6 +16,7 @@
*******************************************************************************/
#include "cpu/reorder/cpu_reorder.hpp"
+#include "cpu/aarch64/acl_reorder.hpp"
namespace dnnl {
namespace impl {
@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
// f32 -> f32
{{f32, f32, 0}, {
REG_FAST_DIRECT_COPY_F32_F32
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
nullptr,
}},
{{f32, f32, 4}, {
+
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)
REG_FAST_DIRECT_COPY_F32_F32
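
The deleted patch registered an ACL-backed f32->f32 reorder (NEReorderLayer) for the 2D and 4D blocked weight layouts handled above. Requesting such a reorder through the public oneDNN API looks like this sketch, with Acdb8a as the 8-row-blocked destination layout the patch handled:

#include <oneapi/dnnl/dnnl.hpp>

int main() {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);

    // AxBx1x1 weights, plain layout -> blocked-by-8 layout (Acdb8a).
    dnnl::memory::desc src_md({64, 64, 1, 1}, dt::f32, tag::abcd);
    dnnl::memory::desc dst_md({64, 64, 1, 1}, dt::f32, tag::Acdb8a);
    dnnl::memory src(src_md, eng), dst(dst_md, eng);

    auto pd = dnnl::reorder::primitive_desc(eng, src_md, eng, dst_md);
    dnnl::reorder(pd).execute(strm, src, dst);
    strm.wait();
    return 0;
}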

View File

@@ -1,97 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..bd7bed837 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
#endif
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_tp_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
+void acl_set_tp_scheduler(int intra_threads = 0) {
+ static thread_local std::once_flag flag_once;
// set CUSTOM scheduler in ACL
std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+ [&]() {
+ // Create threadpool scheduler
+ std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ threadpool_scheduler->set_num_threads(intra_threads);
+
+ arm_compute::Scheduler::set(threadpool_scheduler); });
}
void acl_set_threadpool_num_threads() {
@@ -102,14 +105,6 @@ void set_acl_threading() {
acl_set_benchmark_scheduler_default();
}
#endif
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
- if (verbose_has_profile_externals()) {
- acl_set_tp_benchmark_scheduler();
- } else {
- acl_set_tp_scheduler();
- }
-
-#endif
}
} // namespace acl_thread_utils
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index f073376e6..654a2aa5d 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
// Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_tp_scheduler();
+void acl_set_tp_scheduler(int intra_threads);
void acl_set_threadpool_num_threads();
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
void acl_set_tp_benchmark_scheduler();
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 439ca862e..6656c37a5 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 0bfec3871..7207b2b60 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
status_t cpu_engine_t::create_stream(stream_t **stream,
dnnl::threadpool_interop::threadpool_iface *threadpool) {
+ dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
return safe_ptr_assign<stream_t>(
*stream, new cpu_stream_t(this, threadpool));
}

View File

@@ -1,43 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..2d7c76d48 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -17,6 +17,8 @@
#include "cpu/aarch64/acl_thread.hpp"
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "cpu/aarch64/acl_threadpool_scheduler.hpp"
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+#include <thread>
#endif
#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
@@ -30,9 +32,10 @@ namespace acl_thread_utils {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
void acl_thread_bind() {
static std::once_flag flag_once;
- // The threads in Compute Library are bound for the cores 0..max_threads-1
- // dnnl_get_max_threads() returns OMP_NUM_THREADS
- const int max_threads = dnnl_get_max_threads();
+ // Cap the number of threads to 90% of the total core count
+ // to ensure Compute Library doesn't use too much resource
+ int capped_threads = (int)std::floor(0.9*std::thread::hardware_concurrency());
+ const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
// arm_compute::Scheduler does not support concurrent access thus a
// workaround here restricts it to only one call
std::call_once(flag_once, [&]() {
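
The threadcap patch, removed here, capped ACL's OpenMP thread binding at 90% of the machine's cores to leave headroom for other work. Its arithmetic, self-contained (dnnl_get_max_threads() replaced by a fixed stand-in):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <thread>

int main() {
    // dnnl_get_max_threads() would normally supply this (OMP_NUM_THREADS);
    // a fixed stand-in keeps the sketch self-contained.
    const int dnnl_max_threads = 16;

    const int hw = (int)std::thread::hardware_concurrency();
    const int capped = (int)std::floor(0.9 * hw);
    const int max_threads = std::min(capped, dnnl_max_threads);

    std::printf("binding %d of %d hardware threads\n", max_threads, hw);
    return 0;
}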

View File

@@ -0,0 +1,180 @@
# *******************************************************************************
# Copyright 2025 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index 53175a05f9..89731cb356 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -83,17 +83,20 @@ void acl_set_threadpool_num_threads() {
}
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
void acl_set_tp_benchmark_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
- arm_compute::IScheduler *_real_scheduler = nullptr;
- _real_scheduler = threadpool_scheduler.release();
- // Create benchmark scheduler and set TP as real scheduler
- std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
- = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
- std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(benchmark_scheduler); });
+ static thread_local std::once_flag flag_once;
+ std::call_once(flag_once, [&]() {
+ // Create threadpool scheduler
+ std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ arm_compute::IScheduler *_real_scheduler = nullptr;
+ _real_scheduler = threadpool_scheduler.release();
+
+ // Create benchmark scheduler and set TP as real scheduler
+ std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+ = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+ arm_compute::Scheduler::set(benchmark_scheduler);
+ });
}
#endif
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 30910398d9..34cf44b7e2 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,24 +18,17 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-#include "cpu/aarch64/acl_thread.hpp"
-
#include "common/counting_barrier.hpp"
#include "common/dnnl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/IScheduler.h"
-// BARRIER
#include <atomic>
#include <cassert>
-#include <chrono>
#include <mutex>
-#include <thread>
-#include <condition_variable>
namespace dnnl {
namespace impl {
@@ -51,7 +44,7 @@ public:
/// Function to check the next element in the range if there is one.
bool get_next(unsigned int &next) {
- next = atomic_fetch_add_explicit(
+ next = std::atomic_fetch_add_explicit(
&_atomic_counter, 1u, std::memory_order_relaxed);
return next < _end;
}
@@ -70,11 +63,8 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads,
} while (feeder.get_next(workload_index));
}
-ThreadpoolScheduler::ThreadpoolScheduler() {
- using namespace dnnl::impl::threadpool_utils;
- // Set number of threads to one when threadpool is not available.
- _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint();
-}
+ThreadpoolScheduler::ThreadpoolScheduler()
+ : _num_threads(dnnl_get_max_threads()) {}
ThreadpoolScheduler::~ThreadpoolScheduler() = default;
@@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const {
}
void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
- _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
+ std::lock_guard<std::mutex> lock(this->_mtx);
+ _num_threads = num_threads == 0 ? dnnl_get_max_threads() : num_threads;
}
void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
@@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
+ std::lock_guard<std::mutex> lock(this->_mtx);
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
index e9ba21c803..384dfec1b9 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022, 2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,8 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "arm_compute/runtime/IScheduler.h"
-#include "support/Mutex.h"
+
+#include <mutex>
namespace dnnl {
namespace impl {
@@ -32,7 +33,7 @@ namespace aarch64 {
class ThreadpoolScheduler final : public arm_compute::IScheduler {
public:
ThreadpoolScheduler();
- ~ThreadpoolScheduler();
+ ~ThreadpoolScheduler() override;
/// Sets the number of threads the scheduler will use to run the kernels.
void set_num_threads(unsigned int num_threads) override;
@@ -54,8 +55,8 @@ protected:
void run_workloads(std::vector<Workload> &workloads) override;
private:
- uint _num_threads {};
- arm_compute::Mutex _run_workloads_mutex {};
+ unsigned int _num_threads {};
+ std::mutex _mtx;
};
} // namespace aarch64
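
The get_next()/process_workloads() pair in the .cpp diff above is a classic atomic work-claiming loop: each worker thread grabs the next workload index with a relaxed fetch-add until the range is exhausted, so no index runs twice and the hot path takes no lock. A self-contained sketch of the same pattern, using plain std::thread and hypothetical names rather than ACL's Workload type:

#include <atomic>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

int main() {
    // Sixteen dummy workloads standing in for ACL kernel slices.
    std::vector<std::function<void()>> workloads(
            16, [] { /* compute one slice */ });

    std::atomic<unsigned int> counter{0};
    auto worker = [&] {
        // Claim indices until the shared counter passes the end of the range;
        // relaxed ordering suffices because only the counter is contended.
        unsigned int i;
        while ((i = counter.fetch_add(1, std::memory_order_relaxed))
                < workloads.size())
            workloads[i]();
    };

    std::vector<std::thread> pool;
    for (int t = 0; t < 4; ++t) pool.emplace_back(worker);
    for (auto &t : pool) t.join();
    std::printf("all %zu workloads processed\n", workloads.size());
}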


@ -163,33 +163,23 @@ def _tf_repositories():
name = "mkl_dnn_acl_compatible",
build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
patch_file = [
"//third_party/mkl_dnn:onednn_acl_threadcap.patch",
"//third_party/mkl_dnn:onednn_acl_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
"//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
"//third_party/mkl_dnn:onednn_acl_indirect_conv.patch",
"//third_party/mkl_dnn:onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch",
"//third_party/mkl_dnn:onednn_acl_fix_segfault_during_postop_execute.patch",
"//third_party/mkl_dnn:onednn_acl_add_bf16_platform_support_check.patch",
"//third_party/mkl_dnn:onednn_acl_add_sbgemm_matmul_primitive_definition.patch",
"//third_party/mkl_dnn:onednn_acl_threadpool_default_max.patch",
],
sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
strip_prefix = "oneDNN-3.2.1",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
sha256 = "5792cbc07764c6e25c459ff68efb5cfcd7f4a0ba66dca6a4a2c681cd7a644596",
strip_prefix = "oneDNN-3.7",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.7.zip"),
)
tf_http_archive(
name = "compute_library",
patch_file = [
"//third_party/compute_library:compute_library.patch",
"//third_party/compute_library:acl_thread_local_scheduler.patch",
"//third_party/compute_library:exclude_omp_scheduler.patch",
"//third_party/compute_library:include_string.patch",
],
sha256 = "c4ca329a78da380163b2d86e91ba728349b6f0ee97d66e260a694ef37f0b0d93",
strip_prefix = "ComputeLibrary-23.05.1",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v23.05.1.tar.gz"),
sha256 = "8273f68cd0bb17e9231a11a6618d245eb6d623884ae681c00e7a4eabca2dad42",
strip_prefix = "ComputeLibrary-24.12",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/refs/tags/v24.12.tar.gz"),
)
tf_http_archive(
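
With both archives bumped (oneDNN 3.2.1 to 3.7, ACL 23.05.1 to 24.12), it can be worth confirming at runtime which oneDNN a binary actually linked. dnnl_version() is part of oneDNN's public C API, so a check like the sketch below (assuming only the bumped headers) should report 3.7.x after this change:

#include <cstdio>
#include "oneapi/dnnl/dnnl.h"

int main() {
    // dnnl_version() reports the version compiled into the linked library.
    const dnnl_version_t *v = dnnl_version();
    std::printf("oneDNN %d.%d.%d\n", v->major, v->minor, v->patch);
    return 0;
}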


@ -82,14 +82,6 @@ config_setting(
},
)
config_setting(
name = "build_with_mkl_aarch64_openmp",
define_values = {
"build_with_mkl_aarch64": "true",
"build_with_openmp": "true",
},
)
filegroup(
name = "LICENSE",
srcs = [


@ -7,7 +7,6 @@ if_mkl_lnx_x64 is a conditional to check for MKL
if_enable_mkl is a conditional to check if building with MKL and MKL is enabled.
if_mkldnn_openmp checks if we are building x86 backend with OpenMP.
if_mkldnn_aarch64_acl checks if we are building with Arm Compute Library.
if_mkldnn_aarch64_acl_openmp checks if we are building ACL with OpenMP.
mkl_repository is a repository rule for creating MKL repository rule that can
be pointed to either a local folder, or download it from the internet.
@ -146,12 +145,6 @@ def if_mkldnn_aarch64_acl(if_true, if_false = []):
"//conditions:default": if_false,
})
def if_mkldnn_aarch64_acl_openmp(if_true, if_false = []):
return select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": if_true,
"//conditions:default": if_false,
})
# Temporarily disable Graph API on aarch64 until we change the aarch64 BUILD
# file to support Graph API.
def if_graph_api(if_true, if_false = []):


@ -10,7 +10,6 @@ load(
"if_enable_mkl",
"if_mkl",
"if_mkldnn_aarch64_acl",
"if_mkldnn_aarch64_acl_openmp",
"if_mkldnn_openmp",
"onednn_v3_define",
)
@ -334,7 +333,6 @@ def tsl_copts(
if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
onednn_v3_define() +
if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) +
if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) +
if_android_arm(["-mfpu=neon", "-fomit-frame-pointer"]) +
if_linux_x86_64(["-msse3"]) +


@ -154,9 +154,7 @@ class OneDnnThreadPool : public threadpool_iface {
static void set_onednn_max_threads(int num_threads) {
#if DNNL_VERSION_MAJOR >= 3 || \
(DNNL_VERSION_MAJOR == 2 && DNNL_VERSION_MINOR >= 7)
#ifndef DNNL_AARCH64_USE_ACL
dnnl_threadpool_interop_set_max_concurrency(num_threads);
#endif // DNNL_AARCH64_USE_ACL
#endif // DNNL_VERSION_MAJOR >= 3 ||
// (DNNL_VERSION_MAJOR == 2 && DNNL_VERSION_MINOR >= 7)
}
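
The two deleted lines in the hunk above are the #ifndef DNNL_AARCH64_USE_ACL guard and its matching #endif (the 9-to-7 line count confirms exactly those two), so set_onednn_max_threads() now calls dnnl_threadpool_interop_set_max_concurrency() on aarch64 ACL builds as well, consistent with the threadpool scheduler above sizing itself from dnnl_get_max_threads(). For reference, a minimal implementation of the interface that OneDnnThreadPool extends might look like this single-threaded sketch (InlineThreadPool is a hypothetical name; the interface is dnnl::threadpool_interop::threadpool_iface from the oneDNN 3.x headers):

#include <cstdint>
#include <functional>

#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"

// Runs every "parallel" region inline on the calling thread.
class InlineThreadPool final
        : public dnnl::threadpool_interop::threadpool_iface {
public:
    int get_num_threads() const override { return 1; }
    bool get_in_parallel() const override { return false; }
    uint64_t get_flags() const override { return 0; } // synchronous execution
    void parallel_for(int n,
            const std::function<void(int, int)> &fn) override {
        for (int i = 0; i < n; ++i) fn(i, n); // fn(index, total)
    }
};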