build(aarch64): Update to oneDNN-3.7 + ACL-24.12

Bumps the aarch64-compatible oneDNN version to 3.7 and the ACL version
to 24.12. This brings better performance, improved memory management,
and numerous bug fixes over the previous, long-outdated versions
(oneDNN 3.2.1 and ACL 23.05.1).

Signed-off-by: Siddhartha Menon <siddhartha.menon@arm.com>
Author: Siddhartha Menon
Date:   2024-10-18 13:50:49 +00:00
Commit: a24a3a48f2 (parent 2765e59402)

36 changed files with 441 additions and 2337 deletions

@@ -241,17 +241,15 @@ build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0
build:mkl_threadpool --define=build_with_mkl_opensource=true
build:mkl_threadpool -c opt
# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
build:mkl_aarch64 --define=build_with_mkl_aarch64=true
build:mkl_aarch64 --define=build_with_openmp=true
build:mkl_aarch64 --define=build_with_acl=true
build:mkl_aarch64 -c opt
# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
# with Eigen threadpool support
build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
build:mkl_aarch64_threadpool --define=build_with_acl=true
build:mkl_aarch64_threadpool -c opt
# This is an alias for the mkl_aarch64_threadpool build.
build:mkl_aarch64 --config=mkl_aarch64_threadpool
# Default CUDA and CUDNN versions.
build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1"
build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"

@@ -21,7 +21,6 @@ load(
"if_mkl",
"if_mkl_ml",
"if_mkldnn_aarch64_acl",
"if_mkldnn_aarch64_acl_openmp",
"if_mkldnn_openmp",
"onednn_v3_define",
)
@@ -478,7 +477,6 @@ def tf_copts(
if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
onednn_v3_define() +
if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) +
if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
if_zendnn(["-DAMD_ZENDNN"]) +
if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) +
if_llvm_aarch32_available(["-DTF_LLVM_AARCH32_AVAILABLE=1"]) +

@@ -236,33 +236,23 @@ def _tf_repositories():
name = "mkl_dnn_acl_compatible",
build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
patch_file = [
"//third_party/mkl_dnn:onednn_acl_threadcap.patch",
"//third_party/mkl_dnn:onednn_acl_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
"//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
"//third_party/mkl_dnn:onednn_acl_indirect_conv.patch",
"//third_party/mkl_dnn:onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch",
"//third_party/mkl_dnn:onednn_acl_fix_segfault_during_postop_execute.patch",
"//third_party/mkl_dnn:onednn_acl_add_bf16_platform_support_check.patch",
"//third_party/mkl_dnn:onednn_acl_add_sbgemm_matmul_primitive_definition.patch",
"//third_party/mkl_dnn:onednn_acl_threadpool_default_max.patch",
],
sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
strip_prefix = "oneDNN-3.2.1",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
sha256 = "5792cbc07764c6e25c459ff68efb5cfcd7f4a0ba66dca6a4a2c681cd7a644596",
strip_prefix = "oneDNN-3.7",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.7.zip"),
)
tf_http_archive(
name = "compute_library",
patch_file = [
"//third_party/compute_library:compute_library.patch",
"//third_party/compute_library:acl_thread_local_scheduler.patch",
"//third_party/compute_library:exclude_omp_scheduler.patch",
"//third_party/compute_library:include_string.patch",
],
sha256 = "c4ca329a78da380163b2d86e91ba728349b6f0ee97d66e260a694ef37f0b0d93",
strip_prefix = "ComputeLibrary-23.05.1",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v23.05.1.tar.gz"),
sha256 = "8273f68cd0bb17e9231a11a6618d245eb6d623884ae681c00e7a4eabca2dad42",
strip_prefix = "ComputeLibrary-24.12",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/refs/tags/v24.12.tar.gz"),
)
tf_http_archive(

@@ -1,98 +0,0 @@
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f9..cf5e2bf4c 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -75,7 +75,7 @@ public:
private:
static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a5b9eca56..d1ab19397 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
_pImpl->is_prepared = false;
_pImpl->are_weights_const = weights->are_values_constant();
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee..03aef1632 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
if(run_optimised)
{
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
index 9c8563140..f7771945a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::Activation activation, const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
ARM_COMPUTE_UNUSED(c);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2a..f15ac2e22 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
#endif /* ARM_COMPUTE_*_SCHEDULER */
-std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
namespace
{
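
For context, the effect of the thread_local change above can be restated in a standalone sketch (hypothetical names, not part of the patch): each thread now observes its own _custom_scheduler slot, so two streams running on different threads can install different schedulers without racing or clobbering each other.

    // Minimal sketch of the thread_local static-member pattern used above.
    // MiniScheduler stands in for arm_compute::IScheduler (assumption).
    #include <iostream>
    #include <memory>
    #include <thread>

    struct MiniScheduler {
        int num_threads;
    };

    struct SchedulerRegistry {
        // One slot per thread, mirroring Scheduler::_custom_scheduler.
        static thread_local std::shared_ptr<MiniScheduler> custom;
    };
    thread_local std::shared_ptr<MiniScheduler> SchedulerRegistry::custom = nullptr;

    int main() {
        auto worker = [](int n) {
            SchedulerRegistry::custom = std::make_shared<MiniScheduler>(MiniScheduler{n});
            // Each thread sees only the scheduler it installed.
            std::cout << "this thread sees " << SchedulerRegistry::custom->num_threads
                      << " threads\n";
        };
        std::thread a(worker, 4), b(worker, 8);
        a.join();
        b.join();
    }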

@@ -1,8 +1,8 @@
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index bf71e534e2..22377f1a32 100644
index 547c98576..a31301230 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -971,7 +971,6 @@ filegroup(
@@ -1029,7 +1029,6 @@ filegroup(
"runtime/NEON/functions/NETranspose.cpp",
"runtime/NEON/functions/NEUnstack.cpp",
"runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
"runtime/OffsetLifetimeManager.cpp",
"runtime/OffsetMemoryPool.cpp",
"runtime/OperatorTensor.cpp",
@@ -984,6 +983,10 @@ filegroup(
"runtime/Tensor.cpp",
"runtime/TensorAllocator.cpp",
"runtime/Utils.cpp"] +
@@ -1058,6 +1057,10 @@ filegroup(
"runtime/experimental/operators/CpuSub.cpp",
"runtime/experimental/operators/CpuTranspose.cpp",
"runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
+ select({
+ "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
+ "//conditions:default": [],

@@ -9,13 +9,6 @@ _DNNL_COPTS_THREADPOOL = [
"-UUSE_CBLAS",
]
_DNNL_COPTS_OMP = [
"-fopenmp",
"-fexceptions",
"-UUSE_MKL",
"-UUSE_CBLAS",
]
_DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
@@ -63,61 +56,24 @@ _DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}
_DNNL_RUNTIME_OMP = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
"#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
"#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
"#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
"#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
"#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
"#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
"#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
"#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
"#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
"#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
"#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
"#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
"#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
"#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
"#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
"#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
"#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
"#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
"#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
"#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
"#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
"#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
"#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
"#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
"#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
"#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
"#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
"#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
"#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
"#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
"#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0",
"#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
"#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
"#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
"#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
"#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
"#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
"#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
"#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
"#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
"#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1",
"#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
"#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
"#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
"#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
"#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL",
"#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC",
"#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
"#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
"#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
"#cmakedefine01 BUILD_XE3": "#define BUILD_XE3 0",
}
expand_template(
name = "dnnl_config_h",
out = "include/oneapi/dnnl/dnnl_config.h",
substitutions = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP,
"//conditions:default": _DNNL_RUNTIME_THREADPOOL,
}),
template = "include/oneapi/dnnl/dnnl_config.h.in",
@@ -128,13 +84,21 @@ expand_template(
out = "include/oneapi/dnnl/dnnl_version.h",
substitutions = {
"@DNNL_VERSION_MAJOR@": "3",
"@DNNL_VERSION_MINOR@": "2",
"@DNNL_VERSION_PATCH@": "1",
"@DNNL_VERSION_HASH@": "N/A",
"@DNNL_VERSION_MINOR@": "7",
"@DNNL_VERSION_PATCH@": "0",
},
template = "include/oneapi/dnnl/dnnl_version.h.in",
)
expand_template(
name = "dnnl_version_hash_h",
out = "include/oneapi/dnnl/dnnl_version_hash.h",
substitutions = {
"@DNNL_VERSION_HASH@": "N/A",
},
template = "include/oneapi/dnnl/dnnl_version_hash.h.in",
)
cc_library(
name = "mkl_dnn_acl",
srcs = glob(
@@ -146,10 +110,11 @@ cc_library(
exclude = [
"src/cpu/x64/**",
"src/cpu/rv64/**",
"src/cpu/sycl/**",
"src/xpu/**",
],
),
copts = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP,
"//conditions:default": _DNNL_COPTS_THREADPOOL,
}),
defines = ["DNNL_AARCH64_USE_ACL=1"],
@@ -175,6 +140,7 @@ cc_library(
) + [
":dnnl_config_h",
":dnnl_version_h",
":dnnl_version_hash_h",
],
visibility = ["//visibility:public"],
deps = [
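
As a reading aid: the _DNNL_RUNTIME_THREADPOOL substitutions above stamp out include/oneapi/dnnl/dnnl_config.h, and oneDNN sources then branch on the resulting macros at compile time. A minimal sketch of how such a generated header is consumed (the numeric macro values are simplified stand-ins for oneDNN's runtime-kind constants, an assumption for illustration):

    // Sketch: compile-time dispatch on the generated dnnl_config.h macros.
    #include <cstdio>

    #define DNNL_RUNTIME_THREADPOOL 1
    #define DNNL_RUNTIME_OMP 2
    // With the _DNNL_RUNTIME_OMP map removed above, the Bazel build always
    // generates the threadpool value:
    #define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL

    int main() {
    #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
        std::puts("built for the Eigen threadpool runtime");
    #elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
        std::puts("built for the OpenMP runtime");
    #endif
    }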

@@ -1,31 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
index 65b887ea21..eabdb827bd 100644
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
return true;
#endif
+#elif DNNL_AARCH64_USE_ACL
+ return arm_compute::CPUInfo::get().has_bf16();
#else
return false;
#endif
--
2.34.1
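
The hunk above routes oneDNN's bf16 support query to ACL's runtime CPU detection. A hedged sketch of the same gate in isolation (requires linking against Compute Library; the header path is an assumption, but has_bf16() is the ACL call the patch itself makes):

    // Sketch: gate bf16 kernels on runtime CPU capability, as the patch does.
    #include "arm_compute/core/CPP/CPPTypes.h"
    #include <cstdio>

    bool bf16_supported() {
        // Same call the deleted patch added to oneDNN's has_data_type_support().
        return arm_compute::CPUInfo::get().has_bf16();
    }

    int main() {
        std::printf("bf16 %s\n", bf16_supported() ? "available" : "unavailable");
    }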

@@ -1,44 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index ab13efb9b2..ec261e156d 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_fp32_bf16_ok
+ = (utils::everyone_is(data_type::f32, src_md()->data_type,
+ dst_md()->data_type, desc()->accum_data_type)
+ && platform::has_data_type_support(data_type::f32)
+ && utils::everyone_is(
+ data_type::bf16, weights_md()->data_type)
+ && platform::has_data_type_support(
+ data_type::bf16));
+
const bool is_weights_md_format_ok
= utils::one_of(weights_format_kind_received,
format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
- && utils::one_of(true, is_fp32_ok, is_fp16_ok)
+ && utils::one_of(
+ true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
&& !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
--
2.34.1
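
Restated outside the diff, the acceptance test this deleted patch added is: f32 source, destination and accumulation combined with bf16 weights, with platform support for both types. A simplified predicate (the dt enum is a hypothetical stand-in for oneDNN's data_type_t):

    // Sketch of the is_fp32_bf16_ok condition from the deleted patch.
    #include <cstdio>

    enum class dt { f32, bf16, f16 };

    struct matmul_types {
        dt src, weights, dst, accum;
    };

    // Placeholder for platform::has_data_type_support().
    bool has_support(dt) { return true; }

    bool is_fp32_bf16_ok(const matmul_types &t) {
        return t.src == dt::f32 && t.dst == dt::f32 && t.accum == dt::f32
                && t.weights == dt::bf16
                && has_support(dt::f32) && has_support(dt::bf16);
    }

    int main() {
        matmul_types t {dt::f32, dt::bf16, dt::f32, dt::f32};
        std::printf("fp32:bf16 matmul accepted: %d\n", is_fp32_bf16_ok(t));
    }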

@@ -1,100 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 451cc78d52..ab13efb9b2 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {
status_t init(engine_t *engine) {
using smask_t = primitive_attr_t::skip_mask_t;
+ const format_kind_t weights_format_kind_received
+ = weights_md_.format_kind;
const bool is_fp32_ok
= utils::everyone_is(data_type::f32, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type,
@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_weights_md_format_ok
+ = utils::one_of(weights_format_kind_received,
+ format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
&& utils::one_of(true, is_fp32_ok, is_fp16_ok)
- && !has_zero_dim_memory()
- && weights_md_.format_kind == format_kind::any
+ && !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
smask_t::oscale | smask_t::post_ops)
&& attr_oscale_ok() && !has_runtime_dims_or_strides();
if (!ok) return status::unimplemented;
- CHECK(acl_matmul_utils::init_conf_matmul(
- amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
+ CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
+ dst_md_, *desc(), *attr(), weights_format_kind_received));
arm_compute::ActivationLayerInfo act_info;
CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index a314d96384..027f915a8a 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -27,7 +27,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr) {
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received) {
const memory_desc_wrapper src_d(&src_md);
const memory_desc_wrapper wei_d(&wei_md);
@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
for (dim_t i = K_dim - 1; i >= 0; --i)
batch_dims.push_back(i);
+ const memory_desc_t weights_md_received = wei_md;
acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
expected_weight_format, K_dim, N_dim, {}, batch_dims);
+ ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
+ && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
+ "specified blocked format not supported by ACL, use "
+ "format_kind_t::any to find a supported blocked format for "
+ "your platform");
+
return status::success;
}
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 67bb2e78eb..5ba4241abc 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -52,7 +52,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr);
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received);
} // namespace acl_matmul_utils
--
2.34.1
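
The core of the deleted patch above: snapshot the weights descriptor the caller passed in, let ACL rewrite it to the layout its kernels prefer, and reject only when the caller pinned a specific blocked layout that ACL would have to change. A simplified sketch (hypothetical descriptor struct; the real code compares full memory descriptors with dnnl_memory_desc_equal):

    // Sketch of the blocked-weight-format acceptance logic.
    #include <string>

    enum class format_kind { any, blocked };

    struct mem_desc {
        format_kind kind;
        std::string layout; // stands in for the full blocking descriptor
        bool operator==(const mem_desc &o) const { return layout == o.layout; }
    };

    // ACL overwrites the descriptor with the layout its kernels want.
    void reorder_to_weight_format(mem_desc &wei) { wei.layout = "Ab8a"; }

    bool init_weights(mem_desc &wei, format_kind requested) {
        const mem_desc received = wei; // snapshot before ACL rewrites it
        reorder_to_weight_format(wei);
        // A pinned blocked format is only OK if ACL happens to agree with it;
        // format_kind::any always succeeds because any result is acceptable.
        if (requested == format_kind::blocked && !(received == wei)) return false;
        return true;
    }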

@@ -1,50 +0,0 @@
From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
From: Kentaro Kawakami <kawakami.k@fujitsu.com>
Date: Fri, 26 May 2023 10:58:36 +0900
Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
Ubuntu 20.04
---
.../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 743843bae50..3db37e972d1 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -39,6 +39,13 @@
#include <asm/hwcap.h>
#endif
+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
+#ifdef AT_HWCAP2
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1UL << 14)
+#endif
+#endif
+
namespace Xbyak_aarch64 {
namespace util {
#define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
}
void setHwCap() {
- unsigned long hwcap = getauxval(AT_HWCAP);
+ const unsigned long hwcap = getauxval(AT_HWCAP);
if (hwcap & HWCAP_ATOMICS)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
if (hwcap & HWCAP_ASIMD)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
-#ifdef HWCAP2_BF16
- if (hwcap & HWCAP2_BF16)
+
+#ifdef AT_HWCAP2
+ const unsigned long hwcap2 = getauxval(AT_HWCAP2);
+ if (hwcap2 & HWCAP2_BF16)
type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
#endif
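
This patch works around Ubuntu 20.04 kernels whose headers predate HWCAP2_BF16. Outside the diff, the detection boils down to the following Linux-only sketch (the fallback constant is taken from the patch itself):

    // Sketch: detect BF16 via the AT_HWCAP2 auxiliary vector on Linux.
    #include <sys/auxv.h>
    #include <cstdio>

    #ifndef HWCAP2_BF16
    #define HWCAP2_BF16 (1UL << 14) // fallback for old kernel headers, as in the patch
    #endif

    int main() {
    #ifdef AT_HWCAP2
        const unsigned long hwcap2 = getauxval(AT_HWCAP2);
        std::printf("HWCAP2_BF16: %s\n", (hwcap2 & HWCAP2_BF16) ? "yes" : "no");
    #else
        std::puts("AT_HWCAP2 not available on this platform");
    #endif
    }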

@@ -1,96 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
index ea4bb200ec..3eb53b81bd 100644
--- a/src/cpu/aarch64/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -24,7 +24,7 @@ namespace aarch64 {
status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {
- int post_op_index = 0;
+ int post_op_index = post_op_start_index_;
// As these are post ops, this src will also be our dst. If we have a sum
// post op, the src/dst will start off in a temporary, then change to
diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
index 7b59ad71d3..ceaa95b73a 100644
--- a/src/cpu/aarch64/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -32,7 +32,9 @@ struct acl_post_ops_t {
// init the acl_post_ops_t. Note that this function modifies the passed in
// post ops by setting the preferred memory formats
status_t init(engine_t *engine, post_ops_t &post_ops,
- const memory_desc_t &dst_md) {
+ const memory_desc_t &dst_md, int post_op_start_index = 0) {
+
+ post_op_start_index_ = post_op_start_index;
CHECK(post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -41,7 +43,7 @@ struct acl_post_ops_t {
sum_index = -1;
post_op_primitives = {};
- for (int i = 0; i < post_ops.len(); i++) {
+ for (int i = post_op_start_index; i < post_ops.len(); i++) {
auto &po = post_ops.entry_[i];
if (po.is_sum()) {
@@ -135,7 +137,8 @@ struct acl_post_ops_t {
// formats
status_t init(engine_t *engine, post_ops_t &base_post_ops,
const memory_desc_t &dst_md,
- arm_compute::ActivationLayerInfo &act_info_to_fuse) {
+ arm_compute::ActivationLayerInfo &act_info_to_fuse,
+ int post_op_start_index = 0) {
CHECK(base_post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -149,18 +152,11 @@ struct acl_post_ops_t {
"eltwise post op scale must be 1 (no scale)");
CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));
- // Copy all but the first, because it has been fused
- post_ops_t post_ops;
- for (int idx = 1; idx < base_post_ops.len(); ++idx) {
- // Construct empty entry then copy, so that we can check for failure
- post_ops.entry_.emplace_back();
- post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
- }
- return init(engine, post_ops, dst_md);
-
+ // post_op_start_index + 1 to skip the fused eltwise
+ return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
} else {
// Nothing to fuse, just copy all post ops
- return init(engine, base_post_ops, dst_md);
+ return init(engine, base_post_ops, dst_md, post_op_start_index);
}
}
@@ -179,6 +175,9 @@ struct acl_post_ops_t {
private:
// Index of the sum post op if there is one, < 0 means no sum
int sum_index = -1;
+ // Index of the first post op this primitive executes. This is typically the
+ // number of post ops which were fused.
+ int post_op_start_index_ = 0;
data_type_t dst_data_type;
// Vector of primitives used to execute the post ops. They are constructed
// in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
--
2.34.1
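
The fix above replaces a copy of the "unfused" tail of the post-op list with a start index: execution simply begins at post_op_start_index_, so the stored list and the primitives built from it can no longer drift out of sync. The pattern in miniature (hypothetical names):

    // Sketch: skip fused leading post-ops by index instead of copying the tail.
    #include <cstdio>
    #include <vector>

    struct post_op { const char *name; };

    struct post_ops_runner {
        std::vector<post_op> ops;
        int start_index = 0; // number of leading ops already fused into the kernel

        void execute() const {
            for (int i = start_index; i < (int)ops.size(); ++i)
                std::printf("executing post-op %s\n", ops[i].name);
        }
    };

    int main() {
        // The eltwise op was fused into the GEMM, so execution starts at index 1.
        post_ops_runner r {{{"eltwise"}, {"sum"}, {"binary_mul"}}, 1};
        r.execute();
    }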

@@ -1,111 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 4a43b24c5..1a5cfe590 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -1,6 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
return cpu().isAtomicSupported();
}
-inline bool isa_has_bf16(cpu_isa_t isa) {
- return false;
+static inline bool mayiuse_bf16() {
+ using namespace Xbyak_aarch64::util;
+ return cpu().isBf16Supported();
}
-
} // namespace
/* whatever is required to generate string literals... */
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 6bd259ec2..5541bb702 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -1,7 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022-2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
bool ok = true && p.ndims > 0
&& utils::one_of(p.itype, f32, s32, data_type::s8, u8)
- && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
+ && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
&& utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
&& utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
- && simple_impl_desc_init(p, nullptr)
- && prb_has_small_strides(p);
+ && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
+ && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
return ok;
}
@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
cvt_v_s32_u8(startIdx, regNum);
if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
break;
+ case bf16:
+ if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
+ break;
default: assert(!"unreachable");
}
};
@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
}
+ void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
+ UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
+ }
+
void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
cvt_z_b_s(startIdx, regNum);
UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index ba5499ba9..d4e21d316 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,6 @@
/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))
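
For readers unfamiliar with the format: bf16 is simply the top 16 bits of an IEEE f32, which is why a dedicated f32-to-bf16 reorder path is cheap. A simplified scalar conversion (truncation shown for clarity; the bfcvtn instruction the JIT kernel above emits may round rather than truncate):

    // Sketch: scalar f32 -> bf16 by keeping the high 16 bits (truncation).
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    uint16_t f32_to_bf16_trunc(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof bits); // type-pun without UB
        return (uint16_t)(bits >> 16);       // sign + exponent + top 7 mantissa bits
    }

    int main() {
        std::printf("bf16(1.5f) = 0x%04x\n", f32_to_bf16_trunc(1.5f)); // 0x3fc0
    }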

@@ -1,31 +0,0 @@
*******************************************************************************
Copyright 2024 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index f043fee4bc..0384cce757 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
- // Indirect is slower than gemm for low thread counts, except for fast math
- if (dnnl_get_max_threads() < 28 && !acp.fast_math)
- return status::unimplemented;
-
// If we do not need to pad input channels for fast math mode then it would
// be faster to run convolution with im2row instead of using indirect kernel
int block_by = arm_compute::block_by(acp.weights_info.weight_format());

@@ -1,371 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
new file mode 100644
index 000000000..061751b55
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/aarch64/acl_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+ // Lock here is needed because resource_mapper does not support
+ // concurrent multithreaded access.
+ std::lock_guard<std::mutex> _lock {this->mtx};
+
+ auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
+ auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
+
+ // Retrieve primitive resource and configured Compute Library objects
+ auto *acl_resource
+ = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
+
+ acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
+
+ acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
+ acl_obj.dst_tensor.allocator()->import_memory(dst);
+
+ acl_obj.reorder.run();
+
+ acl_obj.src_tensor.allocator()->free();
+ acl_obj.dst_tensor.allocator()->free();
+
+ return status::success;
+}
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
new file mode 100644
index 0000000000..edbc38914d
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -0,0 +1,262 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
+
+#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/reorder/cpu_reorder_pd.hpp"
+#include "arm_compute/core/Types.h"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+struct acl_reorder_obj_t {
+ arm_compute::NEReorderLayer reorder;
+ arm_compute::Tensor src_tensor;
+ arm_compute::Tensor dst_tensor;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_conf_t {
+ arm_compute::TensorInfo src_info;
+ arm_compute::TensorInfo dst_info;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_resource_t : public resource_t {
+ acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
+
+ status_t configure(const acl_reorder_conf_t &app) {
+ if (!acl_obj_) return status::out_of_memory;
+
+ // Init Compute Library tensors based on info from descriptor
+ acl_obj_->src_tensor.allocator()->init(app.src_info);
+ acl_obj_->dst_tensor.allocator()->init(app.dst_info);
+
+ // clang-format off
+ acl_obj_->reorder.configure(
+ &acl_obj_->src_tensor,
+ &acl_obj_->dst_tensor,
+ app.src_wf,
+ app.dst_wf
+ );
+ // clang-format on
+
+ return status::success;
+ }
+
+ acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
+ DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
+
+private:
+ std::unique_ptr<acl_reorder_obj_t> acl_obj_;
+}; // acl_reorder_resource_t
+
+struct acl_reorder_fwd_t : public primitive_t {
+ using primitive_t::primitive_t;
+ struct pd_t : public cpu_reorder_pd_t {
+
+ using cpu_reorder_pd_t::cpu_reorder_pd_t;
+
+ DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
+
+ static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
+ const primitive_attr_t *attr, engine_t *src_engine,
+ const memory_desc_t *src_md, engine_t *dst_engine,
+ const memory_desc_t *dst_md) {
+
+ using namespace acl_utils;
+ // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
+
+ bool ok = src_md->data_type
+ == dst_md->data_type // ACL only supports matching src/dst data types
+ && utils::one_of(src_md->data_type,
+ data_type::f32) // Only supports f32 for now
+ && attr->has_default_values();
+ if (!ok) return status::unimplemented;
+
+ int mask = -1;
+ bool is_set = false;
+ // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
+ const memory_desc_wrapper input_d(src_md);
+ if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
+ return status::unimplemented;
+
+ // Create and check primitive descriptor
+ auto _pd = new pd_t(attr, src_engine->kind(), src_md,
+ dst_engine->kind(), dst_md);
+ if (_pd == nullptr) return status::out_of_memory;
+ if (_pd->init(engine, src_engine, dst_engine) != status::success) {
+ delete _pd;
+ return status::unimplemented;
+ }
+
+ const memory_desc_wrapper src_d(*src_md);
+ const memory_desc_wrapper dst_d(*dst_md);
+
+ const int ndims = src_d.ndims();
+
+ auto src_tag = memory_desc_matches_one_of_tag(
+ *src_md, format_tag::ba, format_tag::cdba);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, src_tag),
+ "");
+
+ arm_compute::TensorShape acl_tensor_shape_in;
+ arm_compute::TensorShape acl_tensor_shape_out;
+ // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
+ int dim_0_rounded_up;
+
+ // Switch for 2 or 4 dim tensors
+ switch(ndims)
+ {
+ // Currently for Ab4a and Ab8a
+ // No format_tag for these, have to deduce from stride
+ case 2:
+ {
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ int dst_dim_1 = dst_md->dims[1];
+ int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
+ int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
+ // Interleave of 4 or 8 that stride for dim 1
+ if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
+ return status::unimplemented;
+ }
+ // Check to ensure it's a blocking transpose
+ if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
+ return status::unimplemented;
+ }
+ if(dst_dim_1_stride == 4){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ } else {
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
+
+ break;
+ }
+ // Currently for Acdb4a and Acdb8a
+ case 4:
+ {
+
+ auto dst_tag = memory_desc_matches_one_of_tag(
+ *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, dst_tag),
+ "");
+ if(dst_tag == format_tag::Acdb4a){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ }
+ else{
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ // Currently only supporting AxBx1x1 cases
+ if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
+ return status::unimplemented;
+ }
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
+ break;
+ }
+ default:
+ return status::unimplemented;
+ }
+
+ // Choose the data layout
+ // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
+ const auto acl_layout = arm_compute::DataLayout::NCHW;
+
+ // Set Source WeightFormat
+ _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
+
+ // Create ACL tensor infos
+ const data_type_t data_type = src_d.data_type();
+ const arm_compute::DataType acl_data_t
+ = acl_utils::get_acl_data_t(data_type);
+ _pd->app_.src_info = arm_compute::TensorInfo(
+ acl_tensor_shape_in, 1, acl_data_t, acl_layout);
+ _pd->app_.dst_info = arm_compute::TensorInfo(
+ acl_tensor_shape_out, 1, acl_data_t, acl_layout);
+
+ // Init scratch memory, not used so 0 in this implementation
+ _pd->init_scratchpad_md();
+
+ return safe_ptr_assign(*reorder_pd, _pd);
+ } // create
+
+ friend dnnl::impl::impl_list_item_t;
+ acl_reorder_conf_t app_;
+
+ }; // pd_t
+
+ acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+ status_t create_resource(
+ engine_t *engine, resource_mapper_t &mapper) const override {
+ if (mapper.has_resource(this)) return status::success;
+
+ auto r = utils::make_unique<acl_reorder_resource_t>();
+ if (!r) return status::out_of_memory;
+
+ // Configure the resource based on information from primitive descriptor
+ CHECK(r->configure(pd()->app_));
+
+ mapper.add(this, std::move(r));
+ return status::success;
+ }
+
+ status_t execute(const exec_ctx_t &ctx) const override {
+ return execute_forward(ctx);
+ }
+
+private:
+ // To guard the const execute_forward, the mutex must be 'mutable'
+ mutable std::mutex mtx;
+ status_t execute_forward(const exec_ctx_t &ctx) const;
+ const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+
+
+}; // acl_reorder_fwd_t
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_REORDER_HPP
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a4150b619..f4d6b4de3 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -16,6 +16,7 @@
*******************************************************************************/
#include "cpu/reorder/cpu_reorder.hpp"
+#include "cpu/aarch64/acl_reorder.hpp"
namespace dnnl {
namespace impl {
@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
// f32 -> f32
{{f32, f32, 0}, {
REG_FAST_DIRECT_COPY_F32_F32
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
nullptr,
}},
{{f32, f32, 4}, {
+
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)
REG_FAST_DIRECT_COPY_F32_F32
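
A small reading aid for the rnd_up calls in the reorder above: the ACL reorder emits interleaved blocks of 4 or 8 rows (OHWIo4/OHWIo8), so the leading dimension is padded up to the block size before the output tensor shape is built. In isolation:

    // Sketch: round a dimension up to the next multiple of the block size,
    // as utils::rnd_up does for the OHWIo4/OHWIo8 output shapes above.
    #include <cstdio>

    long rnd_up(long v, long block) { return ((v + block - 1) / block) * block; }

    int main() {
        std::printf("%ld\n", rnd_up(10, 4)); // 12: ten rows pad to three 4-row blocks
        std::printf("%ld\n", rnd_up(10, 8)); // 16: ten rows pad to two 8-row blocks
    }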

@@ -1,97 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..bd7bed837 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
#endif
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_tp_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
+void acl_set_tp_scheduler(int intra_threads = 0) {
+ static thread_local std::once_flag flag_once;
// set CUSTOM scheduler in ACL
std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+ [&]() {
+ // Create threadpool scheduler
+ std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ threadpool_scheduler->set_num_threads(intra_threads);
+
+ arm_compute::Scheduler::set(threadpool_scheduler); });
}
void acl_set_threadpool_num_threads() {
@@ -102,14 +105,6 @@ void set_acl_threading() {
acl_set_benchmark_scheduler_default();
}
#endif
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
- if (verbose_has_profile_externals()) {
- acl_set_tp_benchmark_scheduler();
- } else {
- acl_set_tp_scheduler();
- }
-
-#endif
}
} // namespace acl_thread_utils
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index f073376e6..654a2aa5d 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
// Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_tp_scheduler();
+void acl_set_tp_scheduler(int intra_threads);
void acl_set_threadpool_num_threads();
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
void acl_set_tp_benchmark_scheduler();
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 439ca862e..6656c37a5 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 0bfec3871..7207b2b60 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
status_t cpu_engine_t::create_stream(stream_t **stream,
dnnl::threadpool_interop::threadpool_iface *threadpool) {
+ dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
return safe_ptr_assign<stream_t>(
*stream, new cpu_stream_t(this, threadpool));
}

@@ -1,43 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..2d7c76d48 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -17,6 +17,8 @@
#include "cpu/aarch64/acl_thread.hpp"
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "cpu/aarch64/acl_threadpool_scheduler.hpp"
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+#include <thread>
#endif
#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
@@ -30,9 +32,10 @@ namespace acl_thread_utils {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
void acl_thread_bind() {
static std::once_flag flag_once;
- // The threads in Compute Library are bound for the cores 0..max_threads-1
- // dnnl_get_max_threads() returns OMP_NUM_THREADS
- const int max_threads = dnnl_get_max_threads();
+ // Cap the number of threads to 90% of the total core count
+ // to ensure Compute Library doesn't use too much resource
+ int capped_threads = (int)std::floor(0.9*std::thread::hardware_concurrency());
+ const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
// arm_compute::Scheduler does not support concurrent access thus a
// workaround here restricts it to only one call
std::call_once(flag_once, [&]() {
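
The capping arithmetic this deleted patch applied, in isolation: take 90% of the machine's core count, then never exceed what oneDNN itself reports. A standalone sketch (dnnl_get_max_threads() is replaced by a stub so the snippet compiles on its own):

    // Sketch of the OMP thread-capping heuristic from the deleted patch.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <thread>

    int dnnl_get_max_threads_stub() { return 16; } // stand-in for dnnl_get_max_threads()

    int capped_max_threads() {
        const int capped = (int)std::floor(0.9 * std::thread::hardware_concurrency());
        return std::min(capped, dnnl_get_max_threads_stub());
    }

    int main() { std::printf("using %d threads\n", capped_max_threads()); }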

@@ -0,0 +1,180 @@
# *******************************************************************************
# Copyright 2025 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index 53175a05f9..89731cb356 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -83,17 +83,20 @@ void acl_set_threadpool_num_threads() {
}
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
void acl_set_tp_benchmark_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
- arm_compute::IScheduler *_real_scheduler = nullptr;
- _real_scheduler = threadpool_scheduler.release();
- // Create benchmark scheduler and set TP as real scheduler
- std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
- = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
- std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(benchmark_scheduler); });
+ static thread_local std::once_flag flag_once;
+ std::call_once(flag_once, [&]() {
+ // Create threadpool scheduler
+ std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ arm_compute::IScheduler *_real_scheduler = nullptr;
+ _real_scheduler = threadpool_scheduler.release();
+
+ // Create benchmark scheduler and set TP as real scheduler
+ std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+ = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+ arm_compute::Scheduler::set(benchmark_scheduler);
+ });
}
#endif
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 30910398d9..34cf44b7e2 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,24 +18,17 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-#include "cpu/aarch64/acl_thread.hpp"
-
#include "common/counting_barrier.hpp"
#include "common/dnnl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/IScheduler.h"
-// BARRIER
#include <atomic>
#include <cassert>
-#include <chrono>
#include <mutex>
-#include <thread>
-#include <condition_variable>
namespace dnnl {
namespace impl {
@@ -51,7 +44,7 @@ public:
/// Function to check the next element in the range if there is one.
bool get_next(unsigned int &next) {
- next = atomic_fetch_add_explicit(
+ next = std::atomic_fetch_add_explicit(
&_atomic_counter, 1u, std::memory_order_relaxed);
return next < _end;
}
@@ -70,11 +63,8 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads,
} while (feeder.get_next(workload_index));
}
-ThreadpoolScheduler::ThreadpoolScheduler() {
- using namespace dnnl::impl::threadpool_utils;
- // Set number of threads to one when threadpool is not available.
- _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint();
-}
+ThreadpoolScheduler::ThreadpoolScheduler()
+ : _num_threads(dnnl_get_max_threads()) {}
ThreadpoolScheduler::~ThreadpoolScheduler() = default;
@@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const {
}
void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
- _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
+ std::lock_guard<std::mutex> lock(this->_mtx);
+ _num_threads = num_threads == 0 ? dnnl_get_max_threads() : num_threads;
}
void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
@@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
+ std::lock_guard<std::mutex> lock(this->_mtx);
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
index e9ba21c803..384dfec1b9 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022, 2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,8 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "arm_compute/runtime/IScheduler.h"
-#include "support/Mutex.h"
+
+#include <mutex>
namespace dnnl {
namespace impl {
@@ -32,7 +33,7 @@ namespace aarch64 {
class ThreadpoolScheduler final : public arm_compute::IScheduler {
public:
ThreadpoolScheduler();
- ~ThreadpoolScheduler();
+ ~ThreadpoolScheduler() override;
/// Sets the number of threads the scheduler will use to run the kernels.
void set_num_threads(unsigned int num_threads) override;
@@ -54,8 +55,8 @@ protected:
void run_workloads(std::vector<Workload> &workloads) override;
private:
- uint _num_threads {};
- arm_compute::Mutex _run_workloads_mutex {};
+ unsigned int _num_threads {};
+ std::mutex _mtx;
};
} // namespace aarch64
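
Taken together, the hunks above make scheduler installation a per-thread, one-shot operation: the once_flag becomes thread_local and all scheduler construction moves inside the call_once body, so no thread can observe a half-built BenchmarkScheduler. A minimal stand-alone sketch of the pattern, with stand-in types replacing the real ACL classes (the names below are illustrative, not the ACL API):

#include <iostream>
#include <memory>
#include <mutex>
#include <thread>

struct IScheduler { virtual ~IScheduler() = default; };  // stand-in
struct ThreadpoolScheduler : IScheduler {};              // stand-in

struct Scheduler {                                       // stand-in
    // One custom scheduler per thread, mirroring the thread_local member
    // the companion ACL patch adds to arm_compute::Scheduler.
    static thread_local std::shared_ptr<IScheduler> custom;
    static void set(std::shared_ptr<IScheduler> s) { custom = std::move(s); }
};
thread_local std::shared_ptr<IScheduler> Scheduler::custom = nullptr;

void set_tp_scheduler_sketch() {
    // thread_local once_flag: each thread runs the lambda exactly once, and
    // construction happens entirely inside it.
    static thread_local std::once_flag flag_once;
    std::call_once(flag_once,
            [] { Scheduler::set(std::make_shared<ThreadpoolScheduler>()); });
}

int main() {
    std::thread t1(set_tp_scheduler_sketch), t2(set_tp_scheduler_sketch);
    t1.join();
    t2.join();
    std::cout << "each thread installed its own scheduler once\n";
}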

View File

@@ -1,98 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f9..cf5e2bf4c 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -75,7 +75,7 @@ public:
private:
static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a5b9eca56..d1ab19397 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
_pImpl->is_prepared = false;
_pImpl->are_weights_const = weights->are_values_constant();
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee..03aef1632 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
if(run_optimised)
{
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ const unsigned int num_threads = CPUInfo::get().get_cpu_num();
auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 9c8563140..f7771945a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::Activation activation, const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
ARM_COMPUTE_UNUSED(c);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = CPUInfo::get();
+ unsigned int num_threads = CPUInfo::get().get_cpu_num();
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2a..f15ac2e22 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
#endif /* ARM_COMPUTE_*_SCHEDULER */
-std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
namespace
{
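
This removed ACL-side patch was the counterpart of the old scheduler patch: it made Scheduler::_custom_scheduler thread_local and sourced configure-time thread counts from CPUInfo rather than the active scheduler, so configuration no longer depended on which thread's scheduler happened to be installed. A rough stand-alone analogue of that switch, with std::thread::hardware_concurrency() standing in for CPUInfo::get_cpu_num() (an assumed equivalence, for illustration only):

#include <cstdio>
#include <thread>

unsigned int configure_time_num_threads() {
    // A property of the machine, not of whichever thread-local scheduler
    // happens to be installed on the calling thread.
    unsigned int n = std::thread::hardware_concurrency();
    return n == 0 ? 1u : n;  // hardware_concurrency() may report 0
}

int main() {
    std::printf("configuring for %u threads\n", configure_time_num_threads());
    return 0;
}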

View File

@@ -1,8 +1,8 @@
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index bf71e534e2..22377f1a32 100644
index 547c98576..a31301230 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -971,7 +971,6 @@ filegroup(
@@ -1029,7 +1029,6 @@ filegroup(
"runtime/NEON/functions/NETranspose.cpp",
"runtime/NEON/functions/NEUnstack.cpp",
"runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
"runtime/OffsetLifetimeManager.cpp",
"runtime/OffsetMemoryPool.cpp",
"runtime/OperatorTensor.cpp",
@@ -984,6 +983,10 @@ filegroup(
"runtime/Tensor.cpp",
"runtime/TensorAllocator.cpp",
"runtime/Utils.cpp"] +
@@ -1058,6 +1057,10 @@ filegroup(
"runtime/experimental/operators/CpuSub.cpp",
"runtime/experimental/operators/CpuTranspose.cpp",
"runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
+ select({
+ "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
+ "//conditions:default": [],

View File

@@ -9,13 +9,6 @@ _DNNL_COPTS_THREADPOOL = [
"-UUSE_CBLAS",
]
_DNNL_COPTS_OMP = [
"-fopenmp",
"-fexceptions",
"-UUSE_MKL",
"-UUSE_CBLAS",
]
_DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
@@ -63,61 +56,23 @@ _DNNL_RUNTIME_THREADPOOL = {
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}
_DNNL_RUNTIME_OMP = {
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
"#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
"#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
"#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
"#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
"#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
"#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
"#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
"#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
"#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
"#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
"#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
"#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
"#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
"#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
"#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
"#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
"#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
"#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
"#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
"#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
"#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
"#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
"#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
"#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
"#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
"#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
"#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
"#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
"#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
"#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
"#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0",
"#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
"#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
"#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
"#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
"#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
"#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
"#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
"#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
"#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
"#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1",
"#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
"#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
"#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
"#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
"#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL",
"#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC",
"#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
"#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
"#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
}
expand_template(
name = "dnnl_config_h",
out = "include/oneapi/dnnl/dnnl_config.h",
substitutions = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP,
"//conditions:default": _DNNL_RUNTIME_THREADPOOL,
}),
template = "include/oneapi/dnnl/dnnl_config.h.in",
@@ -128,13 +83,21 @@ expand_template(
out = "include/oneapi/dnnl/dnnl_version.h",
substitutions = {
"@DNNL_VERSION_MAJOR@": "3",
"@DNNL_VERSION_MINOR@": "2",
"@DNNL_VERSION_PATCH@": "1",
"@DNNL_VERSION_HASH@": "N/A",
"@DNNL_VERSION_MINOR@": "7",
"@DNNL_VERSION_PATCH@": "0",
},
template = "include/oneapi/dnnl/dnnl_version.h.in",
)
expand_template(
name = "dnnl_version_hash_h",
out = "include/oneapi/dnnl/dnnl_version_hash.h",
substitutions = {
"@DNNL_VERSION_HASH@": "N/A",
},
template = "include/oneapi/dnnl/dnnl_version_hash.h.in",
)
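
The expand_template rules above stamp out oneDNN's generated configuration and version headers, with the threadpool runtime now the only CPU threading option. A quick sanity check of the generated values from C++, assuming the generated include/ directory is on the include path:

#include <iostream>

#include "oneapi/dnnl/dnnl_config.h"
#include "oneapi/dnnl/dnnl_version.h"

int main() {
    // These macros come straight from the substitutions above.
    std::cout << "oneDNN " << DNNL_VERSION_MAJOR << '.' << DNNL_VERSION_MINOR
              << '.' << DNNL_VERSION_PATCH << '\n';
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    std::cout << "CPU threading runtime: Eigen threadpool\n";
#endif
    return 0;
}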
cc_library(
name = "mkl_dnn_acl",
srcs = glob(
@@ -146,10 +109,11 @@ cc_library(
exclude = [
"src/cpu/x64/**",
"src/cpu/rv64/**",
"src/cpu/sycl/**",
"src/xpu/**",
],
),
copts = select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP,
"//conditions:default": _DNNL_COPTS_THREADPOOL,
}),
defines = ["DNNL_AARCH64_USE_ACL=1"],
@@ -175,6 +139,7 @@ cc_library(
) + [
":dnnl_config_h",
":dnnl_version_h",
":dnnl_version_hash_h",
],
visibility = ["//visibility:public"],
deps = [

View File

@@ -1,31 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
index 65b887ea21..eabdb827bd 100644
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
return true;
#endif
+#elif DNNL_AARCH64_USE_ACL
+ return arm_compute::CPUInfo::get().has_bf16();
#else
return false;
#endif
--
2.34.1
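
The deleted hunk above gated bf16 support on ACL's runtime CPU detection. From the public oneDNN 3.x C++ API, the same question can be asked indirectly by probing primitive creation and treating "unimplemented" as unsupported. A sketch of that probe (placeholder shapes, not the library's own check):

#include <oneapi/dnnl/dnnl.hpp>

// Returns true when this build/CPU combination accepts a bf16 matmul.
bool bf16_matmul_supported(const dnnl::engine &eng) {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;
    dnnl::memory::desc a({8, 8}, dt::bf16, tag::ab);
    dnnl::memory::desc b({8, 8}, dt::bf16, tag::ab);
    dnnl::memory::desc c({8, 8}, dt::f32, tag::ab);
    try {
        dnnl::matmul::primitive_desc pd(eng, a, b, c);
        return true;
    } catch (const dnnl::error &) {
        return false;  // typically status::unimplemented on older cores
    }
}

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    return bf16_matmul_supported(eng) ? 0 : 1;
}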

View File

@@ -1,44 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index ab13efb9b2..ec261e156d 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_fp32_bf16_ok
+ = (utils::everyone_is(data_type::f32, src_md()->data_type,
+ dst_md()->data_type, desc()->accum_data_type)
+ && platform::has_data_type_support(data_type::f32)
+ && utils::everyone_is(
+ data_type::bf16, weights_md()->data_type)
+ && platform::has_data_type_support(
+ data_type::bf16));
+
const bool is_weights_md_format_ok
= utils::one_of(weights_format_kind_received,
format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
- && utils::one_of(true, is_fp32_ok, is_fp16_ok)
+ && utils::one_of(
+ true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
&& !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
--
2.34.1
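
The is_fp32_bf16_ok condition above admitted the mixed-precision ("sbgemm"-style) matmul: f32 source and destination with bf16 weights. Through the public API, the accepted descriptor combination looks roughly like this sketch (placeholder dimensions; primitive creation throws dnnl::error where bf16 is unsupported):

#include <oneapi/dnnl/dnnl.hpp>

int main() {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;
    const dnnl::memory::dim M = 64, K = 64, N = 64;

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::memory::desc src({M, K}, dt::f32, tag::ab);    // f32 activations
    dnnl::memory::desc wei({K, N}, dt::bf16, tag::any);  // bf16 weights
    dnnl::memory::desc dst({M, N}, dt::f32, tag::ab);    // f32 output

    // Succeeds only where the deleted check would have returned true.
    dnnl::matmul::primitive_desc pd(eng, src, wei, dst);
    return 0;
}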

View File

@@ -1,100 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 451cc78d52..ab13efb9b2 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {
status_t init(engine_t *engine) {
using smask_t = primitive_attr_t::skip_mask_t;
+ const format_kind_t weights_format_kind_received
+ = weights_md_.format_kind;
const bool is_fp32_ok
= utils::everyone_is(data_type::f32, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type,
@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
= utils::everyone_is(data_type::f16, src_md()->data_type,
weights_md()->data_type, dst_md()->data_type)
&& platform::has_data_type_support(data_type::f16);
+ const bool is_weights_md_format_ok
+ = utils::one_of(weights_format_kind_received,
+ format_kind::any, format_kind::blocked);
bool ok = is_dense_data()
&& utils::one_of(true, is_fp32_ok, is_fp16_ok)
- && !has_zero_dim_memory()
- && weights_md_.format_kind == format_kind::any
+ && !has_zero_dim_memory() && is_weights_md_format_ok
&& set_default_formats()
&& attr()->has_default_values(
smask_t::oscale | smask_t::post_ops)
&& attr_oscale_ok() && !has_runtime_dims_or_strides();
if (!ok) return status::unimplemented;
- CHECK(acl_matmul_utils::init_conf_matmul(
- amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
+ CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
+ dst_md_, *desc(), *attr(), weights_format_kind_received));
arm_compute::ActivationLayerInfo act_info;
CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index a314d96384..027f915a8a 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -27,7 +27,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr) {
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received) {
const memory_desc_wrapper src_d(&src_md);
const memory_desc_wrapper wei_d(&wei_md);
@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
for (dim_t i = K_dim - 1; i >= 0; --i)
batch_dims.push_back(i);
+ const memory_desc_t weights_md_received = wei_md;
acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
expected_weight_format, K_dim, N_dim, {}, batch_dims);
+ ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
+ && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
+ "specified blocked format not supported by ACL, use "
+ "format_kind_t::any to find a supported blocked format for "
+ "your platform");
+
return status::success;
}
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 67bb2e78eb..5ba4241abc 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -52,7 +52,8 @@ namespace acl_matmul_utils {
status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
- const primitive_attr_t &attr);
+ const primitive_attr_t &attr,
+ format_kind_t weights_format_kind_received);
} // namespace acl_matmul_utils
--
2.34.1
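
The ACL_CHECK_SUPPORT message above steers callers toward format_kind::any instead of hand-picked blocked layouts. In API terms that means requesting format_tag::any for the weights and then querying the descriptor oneDNN actually chose, e.g.:

#include <oneapi/dnnl/dnnl.hpp>

int main() {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::memory::desc src({64, 64}, dt::f32, tag::ab);
    dnnl::memory::desc wei({64, 64}, dt::f32, tag::any);  // let the backend choose
    dnnl::memory::desc dst({64, 64}, dt::f32, tag::ab);

    dnnl::matmul::primitive_desc pd(eng, src, wei, dst);
    // The blocked layout selected for this platform; reorder user weights
    // into this descriptor before execution.
    dnnl::memory::desc chosen = pd.weights_desc();
    (void)chosen;
    return 0;
}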

View File

@@ -1,50 +0,0 @@
From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
From: Kentaro Kawakami <kawakami.k@fujitsu.com>
Date: Fri, 26 May 2023 10:58:36 +0900
Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
Ubuntu 20.04
---
.../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 743843bae50..3db37e972d1 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -39,6 +39,13 @@
#include <asm/hwcap.h>
#endif
+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
+#ifdef AT_HWCAP2
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1UL << 14)
+#endif
+#endif
+
namespace Xbyak_aarch64 {
namespace util {
#define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
}
void setHwCap() {
- unsigned long hwcap = getauxval(AT_HWCAP);
+ const unsigned long hwcap = getauxval(AT_HWCAP);
if (hwcap & HWCAP_ATOMICS)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
if (hwcap & HWCAP_ASIMD)
type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
-#ifdef HWCAP2_BF16
- if (hwcap & HWCAP2_BF16)
+
+#ifdef AT_HWCAP2
+ const unsigned long hwcap2 = getauxval(AT_HWCAP2);
+ if (hwcap2 & HWCAP2_BF16)
type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
#endif
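
The fix above reads AT_HWCAP and AT_HWCAP2 separately and supplies the HWCAP2_BF16 bit missing from Ubuntu 20.04's kernel headers. The detection reduces to this stand-alone check (aarch64 Linux only):

#include <cstdio>
#include <sys/auxv.h>  // getauxval, AT_HWCAP2

// Older kernel headers (e.g. Ubuntu 20.04) lack this; bit 14 per the patch.
#ifndef HWCAP2_BF16
#define HWCAP2_BF16 (1UL << 14)
#endif

int main() {
    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
    std::printf("BF16 instructions: %s\n",
            (hwcap2 & HWCAP2_BF16) ? "available" : "not available");
    return 0;
}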

View File

@@ -1,96 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
index ea4bb200ec..3eb53b81bd 100644
--- a/src/cpu/aarch64/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -24,7 +24,7 @@ namespace aarch64 {
status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {
- int post_op_index = 0;
+ int post_op_index = post_op_start_index_;
// As these are post ops, this src will also be our dst. If we have a sum
// post op, the src/dst will start off in a temporary, then change to
diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
index 7b59ad71d3..ceaa95b73a 100644
--- a/src/cpu/aarch64/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -32,7 +32,9 @@ struct acl_post_ops_t {
// init the acl_post_ops_t. Note that this function modifies the passed in
// post ops by setting the preferred memory formats
status_t init(engine_t *engine, post_ops_t &post_ops,
- const memory_desc_t &dst_md) {
+ const memory_desc_t &dst_md, int post_op_start_index = 0) {
+
+ post_op_start_index_ = post_op_start_index;
CHECK(post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -41,7 +43,7 @@ struct acl_post_ops_t {
sum_index = -1;
post_op_primitives = {};
- for (int i = 0; i < post_ops.len(); i++) {
+ for (int i = post_op_start_index; i < post_ops.len(); i++) {
auto &po = post_ops.entry_[i];
if (po.is_sum()) {
@@ -135,7 +137,8 @@ struct acl_post_ops_t {
// formats
status_t init(engine_t *engine, post_ops_t &base_post_ops,
const memory_desc_t &dst_md,
- arm_compute::ActivationLayerInfo &act_info_to_fuse) {
+ arm_compute::ActivationLayerInfo &act_info_to_fuse,
+ int post_op_start_index = 0) {
CHECK(base_post_ops.set_default_formats(&dst_md));
dst_data_type = dst_md.data_type;
@@ -149,18 +152,11 @@ struct acl_post_ops_t {
"eltwise post op scale must be 1 (no scale)");
CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));
- // Copy all but the first, because it has been fused
- post_ops_t post_ops;
- for (int idx = 1; idx < base_post_ops.len(); ++idx) {
- // Construct empty entry then copy, so that we can check for failure
- post_ops.entry_.emplace_back();
- post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
- }
- return init(engine, post_ops, dst_md);
-
+ // post_op_start_index + 1 to skip the fused eltwise
+ return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
} else {
// Nothing to fuse, just copy all post ops
- return init(engine, base_post_ops, dst_md);
+ return init(engine, base_post_ops, dst_md, post_op_start_index);
}
}
@@ -179,6 +175,9 @@ struct acl_post_ops_t {
private:
// Index of the sum post op if there is one, < 0 means no sum
int sum_index = -1;
+ // Index of the first post op this primitive executes. This is typically the
+ // number of post ops which were fused.
+ int post_op_start_index_ = 0;
data_type_t dst_data_type;
// Vector of primitives used to execute the post ops. They are constructed
// in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
--
2.34.1
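
The refactor above stops copying the post-op tail after fusing the leading eltwise and instead records a start index into the original list, avoiding the entry copies implicated in the segfault this patch fixed. The bookkeeping in isolation:

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Post-ops as specified by the user; the leading eltwise gets fused
    // into the primitive itself.
    std::vector<std::string> post_ops = {"relu", "sum", "binary_mul"};
    const int post_op_start_index = 1;  // number of fused post-ops

    // Execute only the non-fused tail; the list itself is never copied.
    for (int i = post_op_start_index; i < (int)post_ops.size(); ++i)
        std::printf("executing post-op %d: %s\n", i, post_ops[i].c_str());
    return 0;
}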

View File

@@ -1,111 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 4a43b24c5..1a5cfe590 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -1,6 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
return cpu().isAtomicSupported();
}
-inline bool isa_has_bf16(cpu_isa_t isa) {
- return false;
+static inline bool mayiuse_bf16() {
+ using namespace Xbyak_aarch64::util;
+ return cpu().isBf16Supported();
}
-
} // namespace
/* whatever is required to generate string literals... */
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 6bd259ec2..5541bb702 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -1,7 +1,7 @@
/*******************************************************************************
* Copyright 2018-2023 Intel Corporation
* Copyright 2020-2023 FUJITSU LIMITED
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022-2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
bool ok = true && p.ndims > 0
&& utils::one_of(p.itype, f32, s32, data_type::s8, u8)
- && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
+ && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
&& utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
&& utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
- && simple_impl_desc_init(p, nullptr)
- && prb_has_small_strides(p);
+ && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
+ && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
return ok;
}
@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
cvt_v_s32_u8(startIdx, regNum);
if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
break;
+ case bf16:
+ if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
+ break;
default: assert(!"unreachable");
}
};
@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
}
+ void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
+ UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
+ }
+
void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
cvt_z_b_s(startIdx, regNum);
UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index ba5499ba9..d4e21d316 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,6 @@
/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
+* Copyright 2023 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))

View File

@@ -1,31 +0,0 @@
*******************************************************************************
Copyright 2024 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index f043fee4bc..0384cce757 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
- // Indirect is slower than gemm for low thread counts, except for fast math
- if (dnnl_get_max_threads() < 28 && !acp.fast_math)
- return status::unimplemented;
-
// If we do not need to pad input channels for fast math mode then it would
// be faster to run convolution with im2row instead of using indirect kernel
int block_by = arm_compute::block_by(acp.weights_info.weight_format());

View File

@@ -1,371 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
new file mode 100644
index 000000000..061751b55
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/aarch64/acl_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+ // Lock here is needed because resource_mapper does not support
+ // concurrent multithreaded access.
+ std::lock_guard<std::mutex> _lock {this->mtx};
+
+ auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
+ auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
+
+ // Retrieve primitive resource and configured Compute Library objects
+ auto *acl_resource
+ = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
+
+ acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
+
+ acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
+ acl_obj.dst_tensor.allocator()->import_memory(dst);
+
+ acl_obj.reorder.run();
+
+ acl_obj.src_tensor.allocator()->free();
+ acl_obj.dst_tensor.allocator()->free();
+
+ return status::success;
+}
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
new file mode 100644
index 0000000000..edbc38914d
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -0,0 +1,262 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
+
+#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/reorder/cpu_reorder_pd.hpp"
+#include "arm_compute/core/Types.h"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+struct acl_reorder_obj_t {
+ arm_compute::NEReorderLayer reorder;
+ arm_compute::Tensor src_tensor;
+ arm_compute::Tensor dst_tensor;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_conf_t {
+ arm_compute::TensorInfo src_info;
+ arm_compute::TensorInfo dst_info;
+ arm_compute::WeightFormat src_wf;
+ arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_resource_t : public resource_t {
+ acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
+
+ status_t configure(const acl_reorder_conf_t &app) {
+ if (!acl_obj_) return status::out_of_memory;
+
+ // Init Compute Library tensors based on info from descriptor
+ acl_obj_->src_tensor.allocator()->init(app.src_info);
+ acl_obj_->dst_tensor.allocator()->init(app.dst_info);
+
+ // clang-format off
+ acl_obj_->reorder.configure(
+ &acl_obj_->src_tensor,
+ &acl_obj_->dst_tensor,
+ app.src_wf,
+ app.dst_wf
+ );
+ // clang-format on
+
+ return status::success;
+ }
+
+ acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
+ DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
+
+private:
+ std::unique_ptr<acl_reorder_obj_t> acl_obj_;
+}; // acl_reorder_resource_t
+
+struct acl_reorder_fwd_t : public primitive_t {
+ using primitive_t::primitive_t;
+ struct pd_t : public cpu_reorder_pd_t {
+
+ using cpu_reorder_pd_t::cpu_reorder_pd_t;
+
+ DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
+
+ static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
+ const primitive_attr_t *attr, engine_t *src_engine,
+ const memory_desc_t *src_md, engine_t *dst_engine,
+ const memory_desc_t *dst_md) {
+
+ using namespace acl_utils;
+ // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
+
+ bool ok = src_md->data_type
+ == dst_md->data_type // ACL only supports matching src/dst data types
+ && utils::one_of(src_md->data_type,
+ data_type::f32) // Only supports f32 for now
+ && attr->has_default_values();
+ if (!ok) return status::unimplemented;
+
+ int mask = -1;
+ bool is_set = false;
+ // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
+ const memory_desc_wrapper input_d(src_md);
+ if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
+ return status::unimplemented;
+
+ // Create and check primitive descriptor
+ auto _pd = new pd_t(attr, src_engine->kind(), src_md,
+ dst_engine->kind(), dst_md);
+ if (_pd == nullptr) return status::out_of_memory;
+ if (_pd->init(engine, src_engine, dst_engine) != status::success) {
+ delete _pd;
+ return status::unimplemented;
+ }
+
+ const memory_desc_wrapper src_d(*src_md);
+ const memory_desc_wrapper dst_d(*dst_md);
+
+ const int ndims = src_d.ndims();
+
+ auto src_tag = memory_desc_matches_one_of_tag(
+ *src_md, format_tag::ba, format_tag::cdba);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, src_tag),
+ "");
+
+ arm_compute::TensorShape acl_tensor_shape_in;
+ arm_compute::TensorShape acl_tensor_shape_out;
+ // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
+ int dim_0_rounded_up;
+
+ // Switch for 2 or 4 dim tensors
+ switch(ndims)
+ {
+ // Currently for Ab4a and Ab8a
+ // No format_tag for these, have to deduce from stride
+ case 2:
+ {
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ int dst_dim_1 = dst_md->dims[1];
+ int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
+ int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
+ // Interleave of 4 or 8 that stride for dim 1
+ if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
+ return status::unimplemented;
+ }
+ // Check to ensure it's a blocking transpose
+ if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
+ return status::unimplemented;
+ }
+ if(dst_dim_1_stride == 4){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ } else {
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
+
+ break;
+ }
+ // Currently for Acdb4a and Acdb8a
+ case 4:
+ {
+
+ auto dst_tag = memory_desc_matches_one_of_tag(
+ *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, dst_tag),
+ "");
+ if(dst_tag == format_tag::Acdb4a){
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 4);
+ }
+ else{
+ // Set Dest WeightFormat
+ _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+ dim_0_rounded_up
+ = utils::rnd_up(src_md->dims[0], 8);
+ }
+ // Currently only supporting AxBx1x1 cases
+ if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
+ return status::unimplemented;
+ }
+ if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+ return status::unimplemented;
+ }
+ acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
+ acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
+ break;
+ }
+ default:
+ return status::unimplemented;
+ }
+
+ // Choose the data layout
+ // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
+ const auto acl_layout = arm_compute::DataLayout::NCHW;
+
+ // Set Source WeightFormat
+ _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
+
+ // Create ACL tensor infos
+ const data_type_t data_type = src_d.data_type();
+ const arm_compute::DataType acl_data_t
+ = acl_utils::get_acl_data_t(data_type);
+ _pd->app_.src_info = arm_compute::TensorInfo(
+ acl_tensor_shape_in, 1, acl_data_t, acl_layout);
+ _pd->app_.dst_info = arm_compute::TensorInfo(
+ acl_tensor_shape_out, 1, acl_data_t, acl_layout);
+
+ // Init scratch memory, not used so 0 in this implementation
+ _pd->init_scratchpad_md();
+
+ return safe_ptr_assign(*reorder_pd, _pd);
+ } // create
+
+ friend dnnl::impl::impl_list_item_t;
+ acl_reorder_conf_t app_;
+
+ }; // pd_t
+
+ acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+ status_t create_resource(
+ engine_t *engine, resource_mapper_t &mapper) const override {
+ if (mapper.has_resource(this)) return status::success;
+
+ auto r = utils::make_unique<acl_reorder_resource_t>();
+ if (!r) return status::out_of_memory;
+
+ // Configure the resource based on information from primitive descriptor
+ CHECK(r->configure(pd()->app_));
+
+ mapper.add(this, std::move(r));
+ return status::success;
+ }
+
+ status_t execute(const exec_ctx_t &ctx) const override {
+ return execute_forward(ctx);
+ }
+
+private:
+ // To guard the const execute_forward, the mutex must be 'mutable'
+ mutable std::mutex mtx;
+ status_t execute_forward(const exec_ctx_t &ctx) const;
+ const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+
+
+}; // acl_reorder_fwd_t
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_REORDER_HPP
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a4150b619..f4d6b4de3 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -16,6 +16,7 @@
*******************************************************************************/
#include "cpu/reorder/cpu_reorder.hpp"
+#include "cpu/aarch64/acl_reorder.hpp"
namespace dnnl {
namespace impl {
@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
// f32 -> f32
{{f32, f32, 0}, {
REG_FAST_DIRECT_COPY_F32_F32
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
nullptr,
}},
{{f32, f32, 4}, {
+
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)
REG_FAST_DIRECT_COPY_F32_F32
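
The deleted patch registered an ACL-backed f32->f32 reorder (NEReorderLayer) for the 2D and 4D blocked weight layouts handled above. Requesting such a reorder through the public oneDNN API looks like this sketch, with Acdb8a as the 8-row-blocked destination layout the patch handled:

#include <oneapi/dnnl/dnnl.hpp>

int main() {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);

    // AxBx1x1 weights, plain layout -> blocked-by-8 layout (Acdb8a).
    dnnl::memory::desc src_md({64, 64, 1, 1}, dt::f32, tag::abcd);
    dnnl::memory::desc dst_md({64, 64, 1, 1}, dt::f32, tag::Acdb8a);
    dnnl::memory src(src_md, eng), dst(dst_md, eng);

    auto pd = dnnl::reorder::primitive_desc(eng, src_md, eng, dst_md);
    dnnl::reorder(pd).execute(strm, src, dst);
    strm.wait();
    return 0;
}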

View File

@@ -1,97 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..bd7bed837 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
#endif
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_tp_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
+void acl_set_tp_scheduler(int intra_threads = 0) {
+ static thread_local std::once_flag flag_once;
// set CUSTOM scheduler in ACL
std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+ [&]() {
+ // Create threadpool scheduler
+ std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ threadpool_scheduler->set_num_threads(intra_threads);
+
+ arm_compute::Scheduler::set(threadpool_scheduler); });
}
void acl_set_threadpool_num_threads() {
@@ -102,14 +105,6 @@ void set_acl_threading() {
acl_set_benchmark_scheduler_default();
}
#endif
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
- if (verbose_has_profile_externals()) {
- acl_set_tp_benchmark_scheduler();
- } else {
- acl_set_tp_scheduler();
- }
-
-#endif
}
} // namespace acl_thread_utils
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index f073376e6..654a2aa5d 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
// Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_tp_scheduler();
+void acl_set_tp_scheduler(int intra_threads);
void acl_set_threadpool_num_threads();
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
void acl_set_tp_benchmark_scheduler();
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 439ca862e..6656c37a5 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 0bfec3871..7207b2b60 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
status_t cpu_engine_t::create_stream(stream_t **stream,
dnnl::threadpool_interop::threadpool_iface *threadpool) {
+ dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
return safe_ptr_assign<stream_t>(
*stream, new cpu_stream_t(this, threadpool));
}

View File

@@ -1,43 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..2d7c76d48 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -17,6 +17,8 @@
#include "cpu/aarch64/acl_thread.hpp"
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "cpu/aarch64/acl_threadpool_scheduler.hpp"
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+#include <thread>
#endif
#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
@@ -30,9 +32,10 @@ namespace acl_thread_utils {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
void acl_thread_bind() {
static std::once_flag flag_once;
- // The threads in Compute Library are bound for the cores 0..max_threads-1
- // dnnl_get_max_threads() returns OMP_NUM_THREADS
- const int max_threads = dnnl_get_max_threads();
+ // Cap the number of threads to 90% of the total core count
+ // to ensure Compute Library doesn't use too much resource
+ int capped_threads = (int)std::floor(0.9*std::thread::hardware_concurrency());
+ const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
// arm_compute::Scheduler does not support concurrent access thus a
// workaround here restricts it to only one call
std::call_once(flag_once, [&]() {
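
The threadcap patch, removed here, capped ACL's OpenMP thread binding at 90% of the machine's cores to leave headroom for other work. Its arithmetic, self-contained (dnnl_get_max_threads() replaced by a fixed stand-in):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <thread>

int main() {
    // dnnl_get_max_threads() would normally supply this (OMP_NUM_THREADS);
    // a fixed stand-in keeps the sketch self-contained.
    const int dnnl_max_threads = 16;

    const int hw = (int)std::thread::hardware_concurrency();
    const int capped = (int)std::floor(0.9 * hw);
    const int max_threads = std::min(capped, dnnl_max_threads);

    std::printf("binding %d of %d hardware threads\n", max_threads, hw);
    return 0;
}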

View File

@@ -0,0 +1,180 @@
# *******************************************************************************
# Copyright 2025 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index 53175a05f9..89731cb356 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -83,17 +83,20 @@ void acl_set_threadpool_num_threads() {
}
// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
void acl_set_tp_benchmark_scheduler() {
- static std::once_flag flag_once;
- // Create threadpool scheduler
- std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
- = std::make_unique<ThreadpoolScheduler>();
- arm_compute::IScheduler *_real_scheduler = nullptr;
- _real_scheduler = threadpool_scheduler.release();
- // Create benchmark scheduler and set TP as real scheduler
- std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
- = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
- std::call_once(flag_once,
- [&]() { arm_compute::Scheduler::set(benchmark_scheduler); });
+ static thread_local std::once_flag flag_once;
+ std::call_once(flag_once, [&]() {
+ // Create threadpool scheduler
+ std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
+ = std::make_unique<ThreadpoolScheduler>();
+ arm_compute::IScheduler *_real_scheduler = nullptr;
+ _real_scheduler = threadpool_scheduler.release();
+
+ // Create benchmark scheduler and set TP as real scheduler
+ std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+ = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+ arm_compute::Scheduler::set(benchmark_scheduler);
+ });
}
#endif
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 30910398d9..34cf44b7e2 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,24 +18,17 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-#include "cpu/aarch64/acl_thread.hpp"
-
#include "common/counting_barrier.hpp"
#include "common/dnnl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/IScheduler.h"
-// BARRIER
#include <atomic>
#include <cassert>
-#include <chrono>
#include <mutex>
-#include <thread>
-#include <condition_variable>
namespace dnnl {
namespace impl {
@@ -51,7 +44,7 @@ public:
/// Function to check the next element in the range if there is one.
bool get_next(unsigned int &next) {
- next = atomic_fetch_add_explicit(
+ next = std::atomic_fetch_add_explicit(
&_atomic_counter, 1u, std::memory_order_relaxed);
return next < _end;
}
@@ -70,11 +63,8 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads,
} while (feeder.get_next(workload_index));
}
-ThreadpoolScheduler::ThreadpoolScheduler() {
- using namespace dnnl::impl::threadpool_utils;
- // Set number of threads to one when threadpool is not available.
- _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint();
-}
+ThreadpoolScheduler::ThreadpoolScheduler()
+ : _num_threads(dnnl_get_max_threads()) {}
ThreadpoolScheduler::~ThreadpoolScheduler() = default;
@@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const {
}
void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
- _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
+ std::lock_guard<std::mutex> lock(this->_mtx);
+ _num_threads = num_threads == 0 ? dnnl_get_max_threads() : num_threads;
}
void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
@@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
void ThreadpoolScheduler::run_workloads(
std::vector<arm_compute::IScheduler::Workload> &workloads) {
- arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
+ std::lock_guard<std::mutex> lock(this->_mtx);
const unsigned int num_threads
= std::min(static_cast<unsigned int>(_num_threads),
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
index e9ba21c803..384dfec1b9 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022, 2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,8 @@
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "arm_compute/runtime/IScheduler.h"
-#include "support/Mutex.h"
+
+#include <mutex>
namespace dnnl {
namespace impl {
@@ -32,7 +33,7 @@ namespace aarch64 {
class ThreadpoolScheduler final : public arm_compute::IScheduler {
public:
ThreadpoolScheduler();
- ~ThreadpoolScheduler();
+ ~ThreadpoolScheduler() override;
/// Sets the number of threads the scheduler will use to run the kernels.
void set_num_threads(unsigned int num_threads) override;
@@ -54,8 +55,8 @@ protected:
void run_workloads(std::vector<Workload> &workloads) override;
private:
- uint _num_threads {};
- arm_compute::Mutex _run_workloads_mutex {};
+ unsigned int _num_threads {};
+ std::mutex _mtx;
};
} // namespace aarch64
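
The get_next()/process_workloads() pair in the .cpp diff above is a classic atomic work-claiming loop: each worker thread grabs the next workload index with a relaxed fetch-add until the range is exhausted, so no index runs twice and the hot path takes no lock. A self-contained sketch of the same pattern, using plain std::thread and hypothetical names rather than ACL's Workload type:

#include <atomic>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

int main() {
    // Sixteen dummy workloads standing in for ACL kernel slices.
    std::vector<std::function<void()>> workloads(
            16, [] { /* compute one slice */ });

    std::atomic<unsigned int> counter{0};
    auto worker = [&] {
        // Claim indices until the shared counter passes the end of the range;
        // relaxed ordering suffices because only the counter is contended.
        unsigned int i;
        while ((i = counter.fetch_add(1, std::memory_order_relaxed))
                < workloads.size())
            workloads[i]();
    };

    std::vector<std::thread> pool;
    for (int t = 0; t < 4; ++t) pool.emplace_back(worker);
    for (auto &t : pool) t.join();
    std::printf("all %zu workloads processed\n", workloads.size());
}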


@ -163,33 +163,23 @@ def _tf_repositories():
name = "mkl_dnn_acl_compatible",
build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
patch_file = [
"//third_party/mkl_dnn:onednn_acl_threadcap.patch",
"//third_party/mkl_dnn:onednn_acl_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
"//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
"//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
"//third_party/mkl_dnn:onednn_acl_indirect_conv.patch",
"//third_party/mkl_dnn:onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch",
"//third_party/mkl_dnn:onednn_acl_fix_segfault_during_postop_execute.patch",
"//third_party/mkl_dnn:onednn_acl_add_bf16_platform_support_check.patch",
"//third_party/mkl_dnn:onednn_acl_add_sbgemm_matmul_primitive_definition.patch",
"//third_party/mkl_dnn:onednn_acl_threadpool_default_max.patch",
],
sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
strip_prefix = "oneDNN-3.2.1",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
sha256 = "5792cbc07764c6e25c459ff68efb5cfcd7f4a0ba66dca6a4a2c681cd7a644596",
strip_prefix = "oneDNN-3.7",
urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.7.zip"),
)
tf_http_archive(
name = "compute_library",
patch_file = [
"//third_party/compute_library:compute_library.patch",
"//third_party/compute_library:acl_thread_local_scheduler.patch",
"//third_party/compute_library:exclude_omp_scheduler.patch",
"//third_party/compute_library:include_string.patch",
],
sha256 = "c4ca329a78da380163b2d86e91ba728349b6f0ee97d66e260a694ef37f0b0d93",
strip_prefix = "ComputeLibrary-23.05.1",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v23.05.1.tar.gz"),
sha256 = "8273f68cd0bb17e9231a11a6618d245eb6d623884ae681c00e7a4eabca2dad42",
strip_prefix = "ComputeLibrary-24.12",
urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/refs/tags/v24.12.tar.gz"),
)
tf_http_archive(
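
With both archives bumped (oneDNN 3.2.1 to 3.7, ACL 23.05.1 to 24.12), it can be worth confirming at runtime which oneDNN a binary actually linked. dnnl_version() is part of oneDNN's public C API, so a check like the sketch below (assuming only the bumped headers) should report 3.7.x after this change:

#include <cstdio>
#include "oneapi/dnnl/dnnl.h"

int main() {
    // dnnl_version() reports the version compiled into the linked library.
    const dnnl_version_t *v = dnnl_version();
    std::printf("oneDNN %d.%d.%d\n", v->major, v->minor, v->patch);
    return 0;
}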


@ -82,14 +82,6 @@ config_setting(
},
)
config_setting(
name = "build_with_mkl_aarch64_openmp",
define_values = {
"build_with_mkl_aarch64": "true",
"build_with_openmp": "true",
},
)
filegroup(
name = "LICENSE",
srcs = [


@ -7,7 +7,6 @@ if_mkl_lnx_x64 is a conditional to check for MKL
if_enable_mkl is a conditional to check if building with MKL and MKL is enabled.
if_mkldnn_openmp checks if we are building x86 backend with OpenMP.
if_mkldnn_aarch64_acl checks if we are building with Arm Compute Library.
if_mkldnn_aarch64_acl_openmp checks if we are building ACL with OpenMP.
mkl_repository is a repository rule for creating MKL repository rule that can
be pointed to either a local folder, or download it from the internet.
@ -146,12 +145,6 @@ def if_mkldnn_aarch64_acl(if_true, if_false = []):
"//conditions:default": if_false,
})
def if_mkldnn_aarch64_acl_openmp(if_true, if_false = []):
return select({
"@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": if_true,
"//conditions:default": if_false,
})
# Temporarily disable Graph API on aarch64 until we change the aarch64 BUILD
# file to support Graph API.
def if_graph_api(if_true, if_false = []):


@ -10,7 +10,6 @@ load(
"if_enable_mkl",
"if_mkl",
"if_mkldnn_aarch64_acl",
"if_mkldnn_aarch64_acl_openmp",
"if_mkldnn_openmp",
"onednn_v3_define",
)
@ -334,7 +333,6 @@ def tsl_copts(
if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
onednn_v3_define() +
if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) +
if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) +
if_android_arm(["-mfpu=neon", "-fomit-frame-pointer"]) +
if_linux_x86_64(["-msse3"]) +


@ -154,9 +154,7 @@ class OneDnnThreadPool : public threadpool_iface {
static void set_onednn_max_threads(int num_threads) {
#if DNNL_VERSION_MAJOR >= 3 || \
(DNNL_VERSION_MAJOR == 2 && DNNL_VERSION_MINOR >= 7)
#ifndef DNNL_AARCH64_USE_ACL
dnnl_threadpool_interop_set_max_concurrency(num_threads);
#endif // DNNL_AARCH64_USE_ACL
#endif // DNNL_VERSION_MAJOR >= 3 ||
// (DNNL_VERSION_MAJOR == 2 && DNNL_VERSION_MINOR >= 7)
}
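
The two deleted lines in the hunk above are the #ifndef DNNL_AARCH64_USE_ACL guard and its matching #endif (the 9-to-7 line count confirms exactly those two), so set_onednn_max_threads() now calls dnnl_threadpool_interop_set_max_concurrency() on aarch64 ACL builds as well, consistent with the threadpool scheduler above sizing itself from dnnl_get_max_threads(). For reference, a minimal implementation of the interface that OneDnnThreadPool extends might look like this single-threaded sketch (InlineThreadPool is a hypothetical name; the interface is dnnl::threadpool_interop::threadpool_iface from the oneDNN 3.x headers):

#include <cstdint>
#include <functional>

#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"

// Runs every "parallel" region inline on the calling thread.
class InlineThreadPool final
        : public dnnl::threadpool_interop::threadpool_iface {
public:
    int get_num_threads() const override { return 1; }
    bool get_in_parallel() const override { return false; }
    uint64_t get_flags() const override { return 0; } // synchronous execution
    void parallel_for(int n,
            const std::function<void(int, int)> &fn) override {
        for (int i = 0; i < n; ++i) fn(i, n); // fn(index, total)
    }
};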