build(aarch64): Update to oneDNN-3.7 + ACL-24.12
Bumps the aarch64-compatible oneDNN version to 3.7 and the ACL version to 24.12. This brings better performance, improved memory management, and numerous bug fixes over the previous, long-outdated versions.

Signed-off-by: Siddhartha Menon <siddhartha.menon@arm.com>
parent 2765e59402
commit a24a3a48f2
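The updated oneDNN and ACL are picked up through the existing `mkl_aarch64` build configs, so no new flags are needed. A minimal invocation sketch, assuming a typical aarch64 host (the pip-package target label is illustrative and not part of this commit):

    # Hypothetical example: check your local target names before use.
    bazel build --config=mkl_aarch64_threadpool //tensorflow/tools/pip_package:wheel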
.bazelrc | 10
@@ -241,17 +241,15 @@ build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl_threadpool --define=build_with_mkl_opensource=true
 build:mkl_threadpool -c opt
 
 # Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
-build:mkl_aarch64 --define=build_with_mkl_aarch64=true
-build:mkl_aarch64 --define=build_with_openmp=true
-build:mkl_aarch64 --define=build_with_acl=true
-build:mkl_aarch64 -c opt
-
-# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
 # with Eigen threadpool support
 build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
+build:mkl_aarch64_threadpool --define=build_with_acl=true
 build:mkl_aarch64_threadpool -c opt
 
+# This is an alias for the mkl_aarch64_threadpool build.
+build:mkl_aarch64 --config=mkl_aarch64_threadpool
+
 # Default CUDA and CUDNN versions.
 build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1"
 build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"
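As the hunk above suggests, `--config=mkl_aarch64` is reduced to an alias, so OpenMP is no longer a separate aarch64 flavor; both spellings should now yield the same Eigen-threadpool build. A hedged sketch of the equivalence (target pattern illustrative only):

    # After this change, these two commands request the same configuration:
    bazel build --config=mkl_aarch64 //tensorflow/...
    bazel build --config=mkl_aarch64_threadpool //tensorflow/...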
@@ -21,7 +21,6 @@ load(
    "if_mkl",
    "if_mkl_ml",
    "if_mkldnn_aarch64_acl",
    "if_mkldnn_aarch64_acl_openmp",
    "if_mkldnn_openmp",
    "onednn_v3_define",
)
@@ -478,7 +477,6 @@ def tf_copts(
        if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
        onednn_v3_define() +
        if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) +
        if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
        if_zendnn(["-DAMD_ZENDNN"]) +
        if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) +
        if_llvm_aarch32_available(["-DTF_LLVM_AARCH32_AVAILABLE=1"]) +
@@ -236,33 +236,23 @@ def _tf_repositories():
        name = "mkl_dnn_acl_compatible",
        build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
        patch_file = [
            "//third_party/mkl_dnn:onednn_acl_threadcap.patch",
            "//third_party/mkl_dnn:onednn_acl_reorder.patch",
            "//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
            "//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
            "//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
            "//third_party/mkl_dnn:onednn_acl_indirect_conv.patch",
            "//third_party/mkl_dnn:onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch",
            "//third_party/mkl_dnn:onednn_acl_fix_segfault_during_postop_execute.patch",
            "//third_party/mkl_dnn:onednn_acl_add_bf16_platform_support_check.patch",
            "//third_party/mkl_dnn:onednn_acl_add_sbgemm_matmul_primitive_definition.patch",
            "//third_party/mkl_dnn:onednn_acl_threadpool_default_max.patch",
        ],
-       sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
-       strip_prefix = "oneDNN-3.2.1",
-       urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
+       sha256 = "5792cbc07764c6e25c459ff68efb5cfcd7f4a0ba66dca6a4a2c681cd7a644596",
+       strip_prefix = "oneDNN-3.7",
+       urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.7.zip"),
    )

    tf_http_archive(
        name = "compute_library",
        patch_file = [
            "//third_party/compute_library:compute_library.patch",
            "//third_party/compute_library:acl_thread_local_scheduler.patch",
            "//third_party/compute_library:exclude_omp_scheduler.patch",
            "//third_party/compute_library:include_string.patch",
        ],
-       sha256 = "c4ca329a78da380163b2d86e91ba728349b6f0ee97d66e260a694ef37f0b0d93",
-       strip_prefix = "ComputeLibrary-23.05.1",
-       urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v23.05.1.tar.gz"),
+       sha256 = "8273f68cd0bb17e9231a11a6618d245eb6d623884ae681c00e7a4eabca2dad42",
+       strip_prefix = "ComputeLibrary-24.12",
+       urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/refs/tags/v24.12.tar.gz"),
    )

    tf_http_archive(
@@ -1,98 +0,0 @@
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f9..cf5e2bf4c 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -75,7 +75,7 @@ public:
 
 private:
     static Type _scheduler_type;
-    static std::shared_ptr<IScheduler> _custom_scheduler;
+    static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
     static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
 
     Scheduler();
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a5b9eca56..d1ab19397 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
                                                    const ConvolutionInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
-    const CPUInfo &ci = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
+    const CPUInfo &ci = CPUInfo::get();
+    const unsigned int num_threads = CPUInfo::get().get_cpu_num();
     _pImpl->is_prepared = false;
     _pImpl->are_weights_const = weights->are_values_constant();
 
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee..03aef1632 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
 
     if(run_optimised)
     {
-        const CPUInfo &ci = NEScheduler::get().cpu_info();
-        const unsigned int num_threads = NEScheduler::get().num_threads();
+        const CPUInfo &ci = CPUInfo::get();
+        const unsigned int num_threads = CPUInfo::get().get_cpu_num();
 
         auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
         ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
index 9c8563140..f7771945a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
                      arm_gemm::Activation activation, const AsmGemmInfo &info)
 {
     Params p = extract_parameters(a, b, d, info);
-    const CPUInfo &ci = NEScheduler::get().cpu_info();
-    unsigned int num_threads = NEScheduler::get().num_threads();
+    const CPUInfo &ci = CPUInfo::get();
+    unsigned int num_threads = CPUInfo::get().get_cpu_num();
 
     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
     ARM_COMPUTE_UNUSED(c);
     arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
     Params p = extract_parameters(a, b, d, info);
-    const CPUInfo &ci = NEScheduler::get().cpu_info();
-    unsigned int num_threads = NEScheduler::get().num_threads();
+    const CPUInfo &ci = CPUInfo::get();
+    unsigned int num_threads = CPUInfo::get().get_cpu_num();
     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
     arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2a..f15ac2e22 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
 Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
 #endif /* ARM_COMPUTE_*_SCHEDULER */
 
-std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
 
 namespace
 {
@@ -1,8 +1,8 @@
 diff --git a/src/BUILD.bazel b/src/BUILD.bazel
-index bf71e534e2..22377f1a32 100644
+index 547c98576..a31301230 100644
 --- a/src/BUILD.bazel
 +++ b/src/BUILD.bazel
-@@ -971,7 +971,6 @@ filegroup(
+@@ -1029,7 +1029,6 @@ filegroup(
         "runtime/NEON/functions/NETranspose.cpp",
         "runtime/NEON/functions/NEUnstack.cpp",
         "runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
         "runtime/OffsetLifetimeManager.cpp",
         "runtime/OffsetMemoryPool.cpp",
         "runtime/OperatorTensor.cpp",
-@@ -984,6 +983,10 @@ filegroup(
-        "runtime/Tensor.cpp",
-        "runtime/TensorAllocator.cpp",
-        "runtime/Utils.cpp"] +
+@@ -1058,6 +1057,10 @@ filegroup(
+        "runtime/experimental/operators/CpuSub.cpp",
+        "runtime/experimental/operators/CpuTranspose.cpp",
+        "runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
 + select({
 +     "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
 +     "//conditions:default": [],
third_party/mkl_dnn/mkldnn_acl.BUILD | 86 (vendored)
@@ -9,13 +9,6 @@ _DNNL_COPTS_THREADPOOL = [
     "-UUSE_CBLAS",
 ]
 
-_DNNL_COPTS_OMP = [
-    "-fopenmp",
-    "-fexceptions",
-    "-UUSE_MKL",
-    "-UUSE_CBLAS",
-]
-
 _DNNL_RUNTIME_THREADPOOL = {
     "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
     "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
@@ -63,61 +56,24 @@ _DNNL_RUNTIME_THREADPOOL = {
    "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
    "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
    "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}

_DNNL_RUNTIME_OMP = {
    "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
    "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
    "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
    "#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
    "#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
    "#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
    "#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
    "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
    "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
    "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
    "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
    "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
    "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
    "#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
    "#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
    "#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
    "#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
    "#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
    "#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
    "#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
    "#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
    "#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
    "#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
    "#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
    "#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
    "#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
    "#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
    "#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
    "#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
    "#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
    "#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
    "#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
    "#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0",
    "#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
    "#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
    "#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
    "#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
    "#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
    "#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
    "#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
    "#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
    "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
    "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
    "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
    "#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
    "#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1",
    "#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
    "#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
    "#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
    "#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
    "#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL",
    "#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC",
    "#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
    "#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
    "#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
    "#cmakedefine01 BUILD_XE3": "#define BUILD_XE3 0",
}

expand_template(
    name = "dnnl_config_h",
    out = "include/oneapi/dnnl/dnnl_config.h",
    substitutions = select({
        "@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP,
        "//conditions:default": _DNNL_RUNTIME_THREADPOOL,
    }),
    template = "include/oneapi/dnnl/dnnl_config.h.in",
@@ -128,13 +84,21 @@ expand_template(
     out = "include/oneapi/dnnl/dnnl_version.h",
     substitutions = {
         "@DNNL_VERSION_MAJOR@": "3",
-        "@DNNL_VERSION_MINOR@": "2",
-        "@DNNL_VERSION_PATCH@": "1",
-        "@DNNL_VERSION_HASH@": "N/A",
+        "@DNNL_VERSION_MINOR@": "7",
+        "@DNNL_VERSION_PATCH@": "0",
     },
     template = "include/oneapi/dnnl/dnnl_version.h.in",
 )
 
+expand_template(
+    name = "dnnl_version_hash_h",
+    out = "include/oneapi/dnnl/dnnl_version_hash.h",
+    substitutions = {
+        "@DNNL_VERSION_HASH@": "N/A",
+    },
+    template = "include/oneapi/dnnl/dnnl_version_hash.h.in",
+)
+
 cc_library(
     name = "mkl_dnn_acl",
     srcs = glob(
@@ -146,10 +110,11 @@ cc_library(
         exclude = [
             "src/cpu/x64/**",
             "src/cpu/rv64/**",
+            "src/cpu/sycl/**",
+            "src/xpu/**",
         ],
     ),
     copts = select({
-        "@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP,
         "//conditions:default": _DNNL_COPTS_THREADPOOL,
     }),
     defines = ["DNNL_AARCH64_USE_ACL=1"],
@@ -175,6 +140,7 @@ cc_library(
     ) + [
         ":dnnl_config_h",
         ":dnnl_version_h",
+        ":dnnl_version_hash_h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -1,31 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
index 65b887ea21..eabdb827bd 100644
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
 #if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
         return true;
 #endif
+#elif DNNL_AARCH64_USE_ACL
+        return arm_compute::CPUInfo::get().has_bf16();
 #else
         return false;
 #endif
--
2.34.1
@@ -1,44 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index ab13efb9b2..ec261e156d 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
                 = utils::everyone_is(data_type::f16, src_md()->data_type,
                         weights_md()->data_type, dst_md()->data_type)
                 && platform::has_data_type_support(data_type::f16);
+        const bool is_fp32_bf16_ok
+                = (utils::everyone_is(data_type::f32, src_md()->data_type,
+                           dst_md()->data_type, desc()->accum_data_type)
+                        && platform::has_data_type_support(data_type::f32)
+                        && utils::everyone_is(
+                                data_type::bf16, weights_md()->data_type)
+                        && platform::has_data_type_support(
+                                data_type::bf16));
+
         const bool is_weights_md_format_ok
                 = utils::one_of(weights_format_kind_received,
                         format_kind::any, format_kind::blocked);
         bool ok = is_dense_data()
-                && utils::one_of(true, is_fp32_ok, is_fp16_ok)
+                && utils::one_of(
+                        true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
                && !has_zero_dim_memory() && is_weights_md_format_ok
                && set_default_formats()
                && attr()->has_default_values(
--
2.34.1
@@ -1,100 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 451cc78d52..ab13efb9b2 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {
 
     status_t init(engine_t *engine) {
         using smask_t = primitive_attr_t::skip_mask_t;
+        const format_kind_t weights_format_kind_received
+                = weights_md_.format_kind;
         const bool is_fp32_ok
                 = utils::everyone_is(data_type::f32, src_md()->data_type,
                         weights_md()->data_type, dst_md()->data_type,
@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
                 = utils::everyone_is(data_type::f16, src_md()->data_type,
                         weights_md()->data_type, dst_md()->data_type)
                 && platform::has_data_type_support(data_type::f16);
+        const bool is_weights_md_format_ok
+                = utils::one_of(weights_format_kind_received,
+                        format_kind::any, format_kind::blocked);
         bool ok = is_dense_data()
                 && utils::one_of(true, is_fp32_ok, is_fp16_ok)
-                && !has_zero_dim_memory()
-                && weights_md_.format_kind == format_kind::any
+                && !has_zero_dim_memory() && is_weights_md_format_ok
                 && set_default_formats()
                 && attr()->has_default_values(
                         smask_t::oscale | smask_t::post_ops)
                 && attr_oscale_ok() && !has_runtime_dims_or_strides();
         if (!ok) return status::unimplemented;
 
-        CHECK(acl_matmul_utils::init_conf_matmul(
-                amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
+        CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
+                dst_md_, *desc(), *attr(), weights_format_kind_received));
 
         arm_compute::ActivationLayerInfo act_info;
         CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index a314d96384..027f915a8a 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -27,7 +27,8 @@ namespace acl_matmul_utils {
 
 status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md,
         memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
-        const primitive_attr_t &attr) {
+        const primitive_attr_t &attr,
+        format_kind_t weights_format_kind_received) {
 
     const memory_desc_wrapper src_d(&src_md);
     const memory_desc_wrapper wei_d(&wei_md);
@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md,
     for (dim_t i = K_dim - 1; i >= 0; --i)
         batch_dims.push_back(i);
 
+    const memory_desc_t weights_md_received = wei_md;
     acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
             expected_weight_format, K_dim, N_dim, {}, batch_dims);
 
+    ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
+                    && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
+            "specified blocked format not supported by ACL, use "
+            "format_kind_t::any to find a supported blocked format for "
+            "your platform");
+
     return status::success;
 }
 
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 67bb2e78eb..5ba4241abc 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -52,7 +52,8 @@ namespace acl_matmul_utils {
 
 status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md,
         memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
-        const primitive_attr_t &attr);
+        const primitive_attr_t &attr,
+        format_kind_t weights_format_kind_received);
 
 } // namespace acl_matmul_utils
 
--
2.34.1
@@ -1,50 +0,0 @@
From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
From: Kentaro Kawakami <kawakami.k@fujitsu.com>
Date: Fri, 26 May 2023 10:58:36 +0900
Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
 Ubuntu 20.04

---
 .../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 743843bae50..3db37e972d1 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -39,6 +39,13 @@
 #include <asm/hwcap.h>
 #endif
 
+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
+#ifdef AT_HWCAP2
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1UL << 14)
+#endif
+#endif
+
 namespace Xbyak_aarch64 {
 namespace util {
 #define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
   }
 
   void setHwCap() {
-    unsigned long hwcap = getauxval(AT_HWCAP);
+    const unsigned long hwcap = getauxval(AT_HWCAP);
     if (hwcap & HWCAP_ATOMICS)
       type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
 
@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
       type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
     if (hwcap & HWCAP_ASIMD)
       type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
-#ifdef HWCAP2_BF16
-    if (hwcap & HWCAP2_BF16)
+
+#ifdef AT_HWCAP2
+    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
+    if (hwcap2 & HWCAP2_BF16)
       type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
 #endif
@@ -1,96 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
index ea4bb200ec..3eb53b81bd 100644
--- a/src/cpu/aarch64/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -24,7 +24,7 @@ namespace aarch64 {
 
 status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {
 
-    int post_op_index = 0;
+    int post_op_index = post_op_start_index_;
 
     // As these are post ops, this src will also be our dst. If we have a sum
     // post op, the src/dst will start off in a temporary, then change to
diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
index 7b59ad71d3..ceaa95b73a 100644
--- a/src/cpu/aarch64/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -32,7 +32,9 @@ struct acl_post_ops_t {
     // init the acl_post_ops_t. Note that this function modifies the passed in
     // post ops by setting the preferred memory formats
     status_t init(engine_t *engine, post_ops_t &post_ops,
-            const memory_desc_t &dst_md) {
+            const memory_desc_t &dst_md, int post_op_start_index = 0) {
+
+        post_op_start_index_ = post_op_start_index;
 
         CHECK(post_ops.set_default_formats(&dst_md));
         dst_data_type = dst_md.data_type;
@@ -41,7 +43,7 @@ struct acl_post_ops_t {
         sum_index = -1;
         post_op_primitives = {};
 
-        for (int i = 0; i < post_ops.len(); i++) {
+        for (int i = post_op_start_index; i < post_ops.len(); i++) {
             auto &po = post_ops.entry_[i];
 
             if (po.is_sum()) {
@@ -135,7 +137,8 @@ struct acl_post_ops_t {
     // formats
     status_t init(engine_t *engine, post_ops_t &base_post_ops,
             const memory_desc_t &dst_md,
-            arm_compute::ActivationLayerInfo &act_info_to_fuse) {
+            arm_compute::ActivationLayerInfo &act_info_to_fuse,
+            int post_op_start_index = 0) {
 
         CHECK(base_post_ops.set_default_formats(&dst_md));
         dst_data_type = dst_md.data_type;
@@ -149,18 +152,11 @@ struct acl_post_ops_t {
                     "eltwise post op scale must be 1 (no scale)");
             CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));
 
-            // Copy all but the first, because it has been fused
-            post_ops_t post_ops;
-            for (int idx = 1; idx < base_post_ops.len(); ++idx) {
-                // Construct empty entry then copy, so that we can check for failure
-                post_ops.entry_.emplace_back();
-                post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
-            }
-            return init(engine, post_ops, dst_md);
-
+            // post_op_start_index + 1 to skip the fused eltwise
+            return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
         } else {
             // Nothing to fuse, just copy all post ops
-            return init(engine, base_post_ops, dst_md);
+            return init(engine, base_post_ops, dst_md, post_op_start_index);
         }
     }
 
@@ -179,6 +175,9 @@ struct acl_post_ops_t {
 private:
     // Index of the sum post op if there is one, < 0 means no sum
     int sum_index = -1;
+    // Index of the first post op this primitive executes. This is typically the
+    // number of post ops which were fused.
+    int post_op_start_index_ = 0;
     data_type_t dst_data_type;
     // Vector of primitives used to execute the post ops. They are constructed
     // in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
--
2.34.1
@@ -1,111 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 4a43b24c5..1a5cfe590 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -1,6 +1,7 @@
 /*******************************************************************************
 * Copyright 2018-2023 Intel Corporation
 * Copyright 2020-2023 FUJITSU LIMITED
+* Copyright 2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
     return cpu().isAtomicSupported();
 }
 
-inline bool isa_has_bf16(cpu_isa_t isa) {
-    return false;
+static inline bool mayiuse_bf16() {
+    using namespace Xbyak_aarch64::util;
+    return cpu().isBf16Supported();
 }
-
 } // namespace
 
 /* whatever is required to generate string literals... */
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 6bd259ec2..5541bb702 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -1,7 +1,7 @@
 /*******************************************************************************
 * Copyright 2018-2023 Intel Corporation
 * Copyright 2020-2023 FUJITSU LIMITED
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022-2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
 
         bool ok = true && p.ndims > 0
                 && utils::one_of(p.itype, f32, s32, data_type::s8, u8)
-                && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
+                && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
                 && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
                 && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
-                && simple_impl_desc_init(p, nullptr)
-                && prb_has_small_strides(p);
+                && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
+                && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
 
         return ok;
     }
@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
                     cvt_v_s32_u8(startIdx, regNum);
                 if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
                 break;
+            case bf16:
+                if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
+                break;
             default: assert(!"unreachable");
         }
     };
@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
         UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
     }
 
+    void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
+        UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
+    }
+
     void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
         cvt_z_b_s(startIdx, regNum);
         UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index ba5499ba9..d4e21d316 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020-2022 Intel Corporation
+* Copyright 2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
 
+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
             DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
             DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
             DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))
@@ -1,31 +0,0 @@
*******************************************************************************
Copyright 2024 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index f043fee4bc..0384cce757 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
 
     CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
 
-    // Indirect is slower than gemm for low thread counts, except for fast math
-    if (dnnl_get_max_threads() < 28 && !acp.fast_math)
-        return status::unimplemented;
-
     // If we do not need to pad input channels for fast math mode then it would
     // be faster to run convolution with im2row instead of using indirect kernel
     int block_by = arm_compute::block_by(acp.weights_info.weight_format());
third_party/mkl_dnn/onednn_acl_reorder.patch | 371 (vendored)
@@ -1,371 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
new file mode 100644
index 000000000..061751b55
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/aarch64/acl_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+    // Lock here is needed because resource_mapper does not support
+    // concurrent multithreaded access.
+    std::lock_guard<std::mutex> _lock {this->mtx};
+
+    auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
+    auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
+
+    // Retrieve primitive resource and configured Compute Library objects
+    auto *acl_resource
+            = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
+
+    acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
+
+    acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
+    acl_obj.dst_tensor.allocator()->import_memory(dst);
+
+    acl_obj.reorder.run();
+
+    acl_obj.src_tensor.allocator()->free();
+    acl_obj.dst_tensor.allocator()->free();
+
+    return status::success;
+}
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
new file mode 100644
index 0000000000..edbc38914d
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -0,0 +1,262 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
+
+#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/reorder/cpu_reorder_pd.hpp"
+#include "arm_compute/core/Types.h"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+struct acl_reorder_obj_t {
+    arm_compute::NEReorderLayer reorder;
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor dst_tensor;
+    arm_compute::WeightFormat src_wf;
+    arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_conf_t {
+    arm_compute::TensorInfo src_info;
+    arm_compute::TensorInfo dst_info;
+    arm_compute::WeightFormat src_wf;
+    arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_resource_t : public resource_t {
+    acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
+
+    status_t configure(const acl_reorder_conf_t &app) {
+        if (!acl_obj_) return status::out_of_memory;
+
+        // Init Compute Library tensors based on info from descriptor
+        acl_obj_->src_tensor.allocator()->init(app.src_info);
+        acl_obj_->dst_tensor.allocator()->init(app.dst_info);
+
+        // clang-format off
+        acl_obj_->reorder.configure(
+            &acl_obj_->src_tensor,
+            &acl_obj_->dst_tensor,
+            app.src_wf,
+            app.dst_wf
+        );
+        // clang-format on
+
+        return status::success;
+    }
+
+    acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
+    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
+
+private:
+    std::unique_ptr<acl_reorder_obj_t> acl_obj_;
+}; // acl_reorder_resource_t
+
+struct acl_reorder_fwd_t : public primitive_t {
+    using primitive_t::primitive_t;
+    struct pd_t : public cpu_reorder_pd_t {
+
+        using cpu_reorder_pd_t::cpu_reorder_pd_t;
+
+        DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
+
+        static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
+                const primitive_attr_t *attr, engine_t *src_engine,
+                const memory_desc_t *src_md, engine_t *dst_engine,
+                const memory_desc_t *dst_md) {
+
+            using namespace acl_utils;
+            // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
+
+            bool ok = src_md->data_type
+                            == dst_md->data_type // ACL only supports matching src/dst data types
+                    && utils::one_of(src_md->data_type,
+                            data_type::f32) // Only supports f32 for now
+                    && attr->has_default_values();
+            if (!ok) return status::unimplemented;
+
+            int mask = -1;
+            bool is_set = false;
+            // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
+            const memory_desc_wrapper input_d(src_md);
+            if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
+                return status::unimplemented;
+
+            // Create and check primitive descriptor
+            auto _pd = new pd_t(attr, src_engine->kind(), src_md,
+                    dst_engine->kind(), dst_md);
+            if (_pd == nullptr) return status::out_of_memory;
+            if (_pd->init(engine, src_engine, dst_engine) != status::success) {
+                delete _pd;
+                return status::unimplemented;
+            }
+
+            const memory_desc_wrapper src_d(*src_md);
+            const memory_desc_wrapper dst_d(*dst_md);
+
+            const int ndims = src_d.ndims();
+
+            auto src_tag = memory_desc_matches_one_of_tag(
+                    *src_md, format_tag::ba, format_tag::cdba);
+            ACL_CHECK_SUPPORT(
+                    utils::one_of(format_tag::undef, src_tag),
+                    "");
+
+            arm_compute::TensorShape acl_tensor_shape_in;
+            arm_compute::TensorShape acl_tensor_shape_out;
+            // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
+            int dim_0_rounded_up;
+
+            // Switch for 2 or 4 dim tensors
+            switch(ndims)
+            {
+                // Currently for Ab4a and Ab8a
+                // No format_tag for these, have to deduce from stride
+                case 2:
+                {
+                    if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+                        return status::unimplemented;
+                    }
+                    int dst_dim_1 = dst_md->dims[1];
+                    int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
+                    int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
+                    // Interleave of 4 or 8 that stride for dim 1
+                    if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
+                        return status::unimplemented;
+                    }
+                    // Check to ensure it's a blocking transpose
+                    if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
+                        return status::unimplemented;
+                    }
+                    if(dst_dim_1_stride == 4){
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 4);
+                    } else {
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 8);
+                    }
+                    acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
+                    acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
+
+                    break;
+                }
+                // Currently for Acdb4a and Acdb8a
+                case 4:
+                {
+
+                    auto dst_tag = memory_desc_matches_one_of_tag(
+                            *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
+                    ACL_CHECK_SUPPORT(
+                            utils::one_of(format_tag::undef, dst_tag),
+                            "");
+                    if(dst_tag == format_tag::Acdb4a){
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 4);
+                    }
+                    else{
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 8);
+                    }
+                    // Currently only supporting AxBx1x1 cases
+                    if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
+                        return status::unimplemented;
+                    }
+                    if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+                        return status::unimplemented;
+                    }
+                    acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
+                    acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
+                    break;
+                }
+                default:
+                    return status::unimplemented;
+            }
+
+            // Choose the data layout
+            // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
+            const auto acl_layout = arm_compute::DataLayout::NCHW;
+
+            // Set Source WeightFormat
+            _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
+
+            // Create ACL tensor infos
+            const data_type_t data_type = src_d.data_type();
+            const arm_compute::DataType acl_data_t
+                    = acl_utils::get_acl_data_t(data_type);
+            _pd->app_.src_info = arm_compute::TensorInfo(
+                    acl_tensor_shape_in, 1, acl_data_t, acl_layout);
+            _pd->app_.dst_info = arm_compute::TensorInfo(
+                    acl_tensor_shape_out, 1, acl_data_t, acl_layout);
+
+            // Init scratch memory, not used so 0 in this implementation
+            _pd->init_scratchpad_md();
+
+            return safe_ptr_assign(*reorder_pd, _pd);
+        } // create
+
+        friend dnnl::impl::impl_list_item_t;
+        acl_reorder_conf_t app_;
+
+    }; // pd_t
+
+    acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+    status_t create_resource(
+            engine_t *engine, resource_mapper_t &mapper) const override {
+        if (mapper.has_resource(this)) return status::success;
+
+        auto r = utils::make_unique<acl_reorder_resource_t>();
+        if (!r) return status::out_of_memory;
+
+        // Configure the resource based on information from primitive descriptor
+        CHECK(r->configure(pd()->app_));
+
+        mapper.add(this, std::move(r));
+        return status::success;
+    }
+
+    status_t execute(const exec_ctx_t &ctx) const override {
+        return execute_forward(ctx);
+    }
+
+private:
+    // To guard the const execute_forward, the mutex must be 'mutable'
+    mutable std::mutex mtx;
+    status_t execute_forward(const exec_ctx_t &ctx) const;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+
+
+}; // acl_reorder_fwd_t
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_REORDER_HPP
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a4150b619..f4d6b4de3 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -16,6 +16,7 @@
 *******************************************************************************/
 
 #include "cpu/reorder/cpu_reorder.hpp"
+#include "cpu/aarch64/acl_reorder.hpp"
 
 namespace dnnl {
 namespace impl {
@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
         // f32 -> f32
         {{f32, f32, 0}, {
             REG_FAST_DIRECT_COPY_F32_F32
+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
 
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
             nullptr,
         }},
         {{f32, f32, 4}, {
+
+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
             CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)
 
             REG_FAST_DIRECT_COPY_F32_F32
@@ -1,97 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..bd7bed837 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
 #endif
 
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_tp_scheduler() {
-    static std::once_flag flag_once;
-    // Create threadpool scheduler
-    std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
-            = std::make_unique<ThreadpoolScheduler>();
+void acl_set_tp_scheduler(int intra_threads = 0) {
+    static thread_local std::once_flag flag_once;
     // set CUSTOM scheduler in ACL
     std::call_once(flag_once,
-            [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+            [&]() {
+                // Create threadpool scheduler
+                std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+                        = std::make_unique<ThreadpoolScheduler>();
+                threadpool_scheduler->set_num_threads(intra_threads);
+
+                arm_compute::Scheduler::set(threadpool_scheduler); });
 }
 
 void acl_set_threadpool_num_threads() {
@@ -102,14 +105,6 @@ void set_acl_threading() {
         acl_set_benchmark_scheduler_default();
     }
 #endif
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-    if (verbose_has_profile_externals()) {
-        acl_set_tp_benchmark_scheduler();
-    } else {
-        acl_set_tp_scheduler();
-    }
-
-#endif
 }
 
 } // namespace acl_thread_utils
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index f073376e6..654a2aa5d 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
 
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 // Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_tp_scheduler();
+void acl_set_tp_scheduler(int intra_threads);
 void acl_set_threadpool_num_threads();
 // Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
 void acl_set_tp_benchmark_scheduler();
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 439ca862e..6656c37a5 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
 void ThreadpoolScheduler::run_workloads(
         std::vector<arm_compute::IScheduler::Workload> &workloads) {
 
-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
     const unsigned int num_threads
             = std::min(static_cast<unsigned int>(_num_threads),
                     static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 0bfec3871..7207b2b60 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
 #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
 status_t cpu_engine_t::create_stream(stream_t **stream,
         dnnl::threadpool_interop::threadpool_iface *threadpool) {
+    dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
     return safe_ptr_assign<stream_t>(
             *stream, new cpu_stream_t(this, threadpool));
 }
43 third_party/mkl_dnn/onednn_acl_threadcap.patch vendored
@@ -1,43 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..2d7c76d48 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -17,6 +17,8 @@
 #include "cpu/aarch64/acl_thread.hpp"
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 #include "cpu/aarch64/acl_threadpool_scheduler.hpp"
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+#include <thread>
 #endif
 #include "cpu/aarch64/acl_benchmark_scheduler.hpp"

@@ -30,9 +32,10 @@ namespace acl_thread_utils {
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
 void acl_thread_bind() {
     static std::once_flag flag_once;
-    // The threads in Compute Library are bound for the cores 0..max_threads-1
-    // dnnl_get_max_threads() returns OMP_NUM_THREADS
-    const int max_threads = dnnl_get_max_threads();
+    // Cap the number of threads to 90% of the total core count
+    // to ensure Compute Library doesn't use too much resource
+    int capped_threads = (int)std::floor(0.9*std::thread::hardware_concurrency());
+    const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
     // arm_compute::Scheduler does not support concurrent access thus a
     // workaround here restricts it to only one call
     std::call_once(flag_once, [&]() {
180 third_party/mkl_dnn/onednn_acl_threadpool_default_max.patch vendored Normal file
@@ -0,0 +1,180 @@
# *******************************************************************************
# Copyright 2025 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index 53175a05f9..89731cb356 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -83,17 +83,20 @@ void acl_set_threadpool_num_threads() {
 }
 // Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
 void acl_set_tp_benchmark_scheduler() {
-    static std::once_flag flag_once;
-    // Create threadpool scheduler
-    std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
-            = std::make_unique<ThreadpoolScheduler>();
-    arm_compute::IScheduler *_real_scheduler = nullptr;
-    _real_scheduler = threadpool_scheduler.release();
-    // Create benchmark scheduler and set TP as real scheduler
-    std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
-            = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
-    std::call_once(flag_once,
-            [&]() { arm_compute::Scheduler::set(benchmark_scheduler); });
+    static thread_local std::once_flag flag_once;
+    std::call_once(flag_once, [&]() {
+        // Create threadpool scheduler
+        std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
+                = std::make_unique<ThreadpoolScheduler>();
+        arm_compute::IScheduler *_real_scheduler = nullptr;
+        _real_scheduler = threadpool_scheduler.release();
+
+        // Create benchmark scheduler and set TP as real scheduler
+        std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+                = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+        arm_compute::Scheduler::set(benchmark_scheduler);
+    });
 }
 #endif

diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 30910398d9..34cf44b7e2 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,24 +18,17 @@

 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL

-#include "cpu/aarch64/acl_thread.hpp"
-
 #include "common/counting_barrier.hpp"
 #include "common/dnnl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"

 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/IScheduler.h"

-// BARRIER
 #include <atomic>
 #include <cassert>
-#include <chrono>
 #include <mutex>
-#include <thread>
-#include <condition_variable>

 namespace dnnl {
 namespace impl {
@@ -51,7 +44,7 @@ public:

     /// Function to check the next element in the range if there is one.
     bool get_next(unsigned int &next) {
-        next = atomic_fetch_add_explicit(
+        next = std::atomic_fetch_add_explicit(
                 &_atomic_counter, 1u, std::memory_order_relaxed);
         return next < _end;
     }
@@ -70,11 +63,8 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads,
     } while (feeder.get_next(workload_index));
 }

-ThreadpoolScheduler::ThreadpoolScheduler() {
-    using namespace dnnl::impl::threadpool_utils;
-    // Set number of threads to one when threadpool is not available.
-    _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint();
-}
+ThreadpoolScheduler::ThreadpoolScheduler()
+    : _num_threads(dnnl_get_max_threads()) {}

 ThreadpoolScheduler::~ThreadpoolScheduler() = default;

@@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const {
 }

 void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) {
-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-    _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
+    std::lock_guard<std::mutex> lock(this->_mtx);
+    _num_threads = num_threads == 0 ? dnnl_get_max_threads() : num_threads;
 }

 void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
@@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
 void ThreadpoolScheduler::run_workloads(
         std::vector<arm_compute::IScheduler::Workload> &workloads) {

-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
+    std::lock_guard<std::mutex> lock(this->_mtx);

     const unsigned int num_threads
             = std::min(static_cast<unsigned int>(_num_threads),
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
index e9ba21c803..384dfec1b9 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022, 2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,7 +22,8 @@
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL

 #include "arm_compute/runtime/IScheduler.h"
-#include "support/Mutex.h"
+
+#include <mutex>

 namespace dnnl {
 namespace impl {
@@ -32,7 +33,7 @@ namespace aarch64 {
 class ThreadpoolScheduler final : public arm_compute::IScheduler {
 public:
     ThreadpoolScheduler();
-    ~ThreadpoolScheduler();
+    ~ThreadpoolScheduler() override;

     /// Sets the number of threads the scheduler will use to run the kernels.
     void set_num_threads(unsigned int num_threads) override;
@@ -54,8 +55,8 @@ protected:
     void run_workloads(std::vector<Workload> &workloads) override;

 private:
-    uint _num_threads {};
-    arm_compute::Mutex _run_workloads_mutex {};
+    unsigned int _num_threads {};
+    std::mutex _mtx;
 };

 } // namespace aarch64
@@ -1,98 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f9..cf5e2bf4c 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -75,7 +75,7 @@ public:

 private:
     static Type _scheduler_type;
-    static std::shared_ptr<IScheduler> _custom_scheduler;
+    static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
     static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;

     Scheduler();
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a5b9eca56..d1ab19397 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
                                                    const ConvolutionInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
-    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
+    const CPUInfo     &ci          = CPUInfo::get();
+    const unsigned int num_threads = CPUInfo::get().get_cpu_num();
     _pImpl->is_prepared = false;
     _pImpl->are_weights_const = weights->are_values_constant();

diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee..03aef1632 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer

     if(run_optimised)
     {
-        const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-        const unsigned int num_threads = NEScheduler::get().num_threads();
+        const CPUInfo     &ci          = CPUInfo::get();
+        const unsigned int num_threads = CPUInfo::get().get_cpu_num();

         auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
         ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 9c8563140..f7771945a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
                      arm_gemm::Activation activation, const AsmGemmInfo &info)
 {
     Params p = extract_parameters(a, b, d, info);
-    const CPUInfo &ci = NEScheduler::get().cpu_info();
-    unsigned int num_threads = NEScheduler::get().num_threads();
+    const CPUInfo &ci = CPUInfo::get();
+    unsigned int num_threads = CPUInfo::get().get_cpu_num();

     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
     ARM_COMPUTE_UNUSED(c);
     arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
     Params p = extract_parameters(a, b, d, info);
-    const CPUInfo &ci = NEScheduler::get().cpu_info();
-    unsigned int num_threads = NEScheduler::get().num_threads();
+    const CPUInfo &ci = CPUInfo::get();
+    unsigned int num_threads = CPUInfo::get().get_cpu_num();
     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
     arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2a..f15ac2e22 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
 Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
 #endif /* ARM_COMPUTE_*_SCHEDULER */

-std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;

 namespace
 {
@@ -1,8 +1,8 @@
 diff --git a/src/BUILD.bazel b/src/BUILD.bazel
-index bf71e534e2..22377f1a32 100644
+index 547c98576..a31301230 100644
 --- a/src/BUILD.bazel
 +++ b/src/BUILD.bazel
-@@ -971,7 +971,6 @@ filegroup(
+@@ -1029,7 +1029,6 @@ filegroup(
         "runtime/NEON/functions/NETranspose.cpp",
         "runtime/NEON/functions/NEUnstack.cpp",
         "runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
         "runtime/OffsetLifetimeManager.cpp",
         "runtime/OffsetMemoryPool.cpp",
         "runtime/OperatorTensor.cpp",
-@@ -984,6 +983,10 @@ filegroup(
-        "runtime/Tensor.cpp",
-        "runtime/TensorAllocator.cpp",
-        "runtime/Utils.cpp"] +
+@@ -1058,6 +1057,10 @@ filegroup(
+        "runtime/experimental/operators/CpuSub.cpp",
+        "runtime/experimental/operators/CpuTranspose.cpp",
+        "runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
 +    select({
 +        "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
 +        "//conditions:default": [],
@@ -9,13 +6,6 @@ _DNNL_COPTS_THREADPOOL = [
    "-UUSE_CBLAS",
]

_DNNL_COPTS_OMP = [
    "-fopenmp",
    "-fexceptions",
    "-UUSE_MKL",
    "-UUSE_CBLAS",
]

_DNNL_RUNTIME_THREADPOOL = {
    "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
    "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
@@ -63,61 +56,23 @@ _DNNL_RUNTIME_THREADPOOL = {
    "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
    "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
    "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}

_DNNL_RUNTIME_OMP = {
    "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
    "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
    "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
    "#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
    "#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
    "#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
    "#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
    "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
    "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
    "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
    "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
    "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
    "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
    "#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
    "#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
    "#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
    "#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
    "#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
    "#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
    "#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
    "#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
    "#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
    "#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
    "#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
    "#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
    "#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
    "#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
    "#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
    "#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
    "#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
    "#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
    "#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
    "#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0",
    "#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
    "#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
    "#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
    "#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
    "#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
    "#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
    "#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
    "#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
    "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
    "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
    "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
    "#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
    "#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1",
    "#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
    "#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
    "#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
    "#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
    "#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL",
    "#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC",
    "#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
    "#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
    "#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
}

expand_template(
    name = "dnnl_config_h",
    out = "include/oneapi/dnnl/dnnl_config.h",
    substitutions = select({
        "@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP,
        "//conditions:default": _DNNL_RUNTIME_THREADPOOL,
    }),
    template = "include/oneapi/dnnl/dnnl_config.h.in",
@@ -128,13 +83,21 @@ expand_template(
    out = "include/oneapi/dnnl/dnnl_version.h",
    substitutions = {
        "@DNNL_VERSION_MAJOR@": "3",
        "@DNNL_VERSION_MINOR@": "2",
        "@DNNL_VERSION_PATCH@": "1",
        "@DNNL_VERSION_HASH@": "N/A",
        "@DNNL_VERSION_MINOR@": "7",
        "@DNNL_VERSION_PATCH@": "0",
    },
    template = "include/oneapi/dnnl/dnnl_version.h.in",
)

expand_template(
    name = "dnnl_version_hash_h",
    out = "include/oneapi/dnnl/dnnl_version_hash.h",
    substitutions = {
        "@DNNL_VERSION_HASH@": "N/A",
    },
    template = "include/oneapi/dnnl/dnnl_version_hash.h.in",
)

cc_library(
    name = "mkl_dnn_acl",
    srcs = glob(
@@ -146,10 +109,11 @@ cc_library(
        exclude = [
            "src/cpu/x64/**",
            "src/cpu/rv64/**",
            "src/cpu/sycl/**",
            "src/xpu/**",
        ],
    ),
    copts = select({
        "@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP,
        "//conditions:default": _DNNL_COPTS_THREADPOOL,
    }),
    defines = ["DNNL_AARCH64_USE_ACL=1"],
@@ -175,6 +139,7 @@ cc_library(
    ) + [
        ":dnnl_config_h",
        ":dnnl_version_h",
        ":dnnl_version_hash_h",
    ],
    visibility = ["//visibility:public"],
    deps = [
@@ -1,31 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
index 65b887ea21..eabdb827bd 100644
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
 #if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
             return true;
 #endif
+#elif DNNL_AARCH64_USE_ACL
+            return arm_compute::CPUInfo::get().has_bf16();
 #else
             return false;
 #endif
--
2.34.1
@@ -1,44 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index ab13efb9b2..ec261e156d 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
                     = utils::everyone_is(data_type::f16, src_md()->data_type,
                             weights_md()->data_type, dst_md()->data_type)
                     && platform::has_data_type_support(data_type::f16);
+            const bool is_fp32_bf16_ok
+                    = (utils::everyone_is(data_type::f32, src_md()->data_type,
+                               dst_md()->data_type, desc()->accum_data_type)
+                            && platform::has_data_type_support(data_type::f32)
+                            && utils::everyone_is(
+                                    data_type::bf16, weights_md()->data_type)
+                            && platform::has_data_type_support(
+                                    data_type::bf16));
+
             const bool is_weights_md_format_ok
                     = utils::one_of(weights_format_kind_received,
                             format_kind::any, format_kind::blocked);
             bool ok = is_dense_data()
-                    && utils::one_of(true, is_fp32_ok, is_fp16_ok)
+                    && utils::one_of(
+                            true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
                     && !has_zero_dim_memory() && is_weights_md_format_ok
                     && set_default_formats()
                     && attr()->has_default_values(
--
2.34.1
@@ -1,100 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 451cc78d52..ab13efb9b2 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {

         status_t init(engine_t *engine) {
             using smask_t = primitive_attr_t::skip_mask_t;
+            const format_kind_t weights_format_kind_received
+                    = weights_md_.format_kind;
             const bool is_fp32_ok
                     = utils::everyone_is(data_type::f32, src_md()->data_type,
                             weights_md()->data_type, dst_md()->data_type,
@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
                     = utils::everyone_is(data_type::f16, src_md()->data_type,
                             weights_md()->data_type, dst_md()->data_type)
                     && platform::has_data_type_support(data_type::f16);
+            const bool is_weights_md_format_ok
+                    = utils::one_of(weights_format_kind_received,
+                            format_kind::any, format_kind::blocked);
             bool ok = is_dense_data()
                     && utils::one_of(true, is_fp32_ok, is_fp16_ok)
-                    && !has_zero_dim_memory()
-                    && weights_md_.format_kind == format_kind::any
+                    && !has_zero_dim_memory() && is_weights_md_format_ok
                     && set_default_formats()
                     && attr()->has_default_values(
                             smask_t::oscale | smask_t::post_ops)
                     && attr_oscale_ok() && !has_runtime_dims_or_strides();
             if (!ok) return status::unimplemented;

-            CHECK(acl_matmul_utils::init_conf_matmul(
-                    amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
+            CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
+                    dst_md_, *desc(), *attr(), weights_format_kind_received));

             arm_compute::ActivationLayerInfo act_info;
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index a314d96384..027f915a8a 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -27,7 +27,8 @@ namespace acl_matmul_utils {

 status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md,
         memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
-        const primitive_attr_t &attr) {
+        const primitive_attr_t &attr,
+        format_kind_t weights_format_kind_received) {

     const memory_desc_wrapper src_d(&src_md);
     const memory_desc_wrapper wei_d(&wei_md);
@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md,
     for (dim_t i = K_dim - 1; i >= 0; --i)
         batch_dims.push_back(i);

+    const memory_desc_t weights_md_received = wei_md;
     acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
             expected_weight_format, K_dim, N_dim, {}, batch_dims);

+    ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
+                    && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
+            "specified blocked format not supported by ACL, use "
+            "format_kind_t::any to find a supported blocked format for "
+            "your platform");
+
     return status::success;
 }

diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 67bb2e78eb..5ba4241abc 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -52,7 +52,8 @@ namespace acl_matmul_utils {

 status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md,
         memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
-        const primitive_attr_t &attr);
+        const primitive_attr_t &attr,
+        format_kind_t weights_format_kind_received);

 } // namespace acl_matmul_utils

--
2.34.1
@@ -1,50 +0,0 @@
From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
From: Kentaro Kawakami <kawakami.k@fujitsu.com>
Date: Fri, 26 May 2023 10:58:36 +0900
Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
 Ubuntu 20.04

---
 .../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 743843bae50..3db37e972d1 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -39,6 +39,13 @@
 #include <asm/hwcap.h>
 #endif

+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
+#ifdef AT_HWCAP2
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1UL << 14)
+#endif
+#endif
+
 namespace Xbyak_aarch64 {
 namespace util {
 #define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
   }

   void setHwCap() {
-    unsigned long hwcap = getauxval(AT_HWCAP);
+    const unsigned long hwcap = getauxval(AT_HWCAP);
     if (hwcap & HWCAP_ATOMICS)
       type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;

@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
       type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
     if (hwcap & HWCAP_ASIMD)
       type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
-#ifdef HWCAP2_BF16
-    if (hwcap & HWCAP2_BF16)
+
+#ifdef AT_HWCAP2
+    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
+    if (hwcap2 & HWCAP2_BF16)
       type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
 #endif
@@ -1,96 +0,0 @@
/* Copyright 2024 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
index ea4bb200ec..3eb53b81bd 100644
--- a/src/cpu/aarch64/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -24,7 +24,7 @@ namespace aarch64 {

 status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {

-    int post_op_index = 0;
+    int post_op_index = post_op_start_index_;

     // As these are post ops, this src will also be our dst. If we have a sum
     // post op, the src/dst will start off in a temporary, then change to
diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
index 7b59ad71d3..ceaa95b73a 100644
--- a/src/cpu/aarch64/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -32,7 +32,9 @@ struct acl_post_ops_t {
     // init the acl_post_ops_t. Note that this function modifies the passed in
     // post ops by setting the preferred memory formats
     status_t init(engine_t *engine, post_ops_t &post_ops,
-            const memory_desc_t &dst_md) {
+            const memory_desc_t &dst_md, int post_op_start_index = 0) {
+
+        post_op_start_index_ = post_op_start_index;

         CHECK(post_ops.set_default_formats(&dst_md));
         dst_data_type = dst_md.data_type;
@@ -41,7 +43,7 @@ struct acl_post_ops_t {
         sum_index = -1;
         post_op_primitives = {};

-        for (int i = 0; i < post_ops.len(); i++) {
+        for (int i = post_op_start_index; i < post_ops.len(); i++) {
             auto &po = post_ops.entry_[i];

             if (po.is_sum()) {
@@ -135,7 +137,8 @@ struct acl_post_ops_t {
     // formats
     status_t init(engine_t *engine, post_ops_t &base_post_ops,
             const memory_desc_t &dst_md,
-            arm_compute::ActivationLayerInfo &act_info_to_fuse) {
+            arm_compute::ActivationLayerInfo &act_info_to_fuse,
+            int post_op_start_index = 0) {

         CHECK(base_post_ops.set_default_formats(&dst_md));
         dst_data_type = dst_md.data_type;
@@ -149,18 +152,11 @@ struct acl_post_ops_t {
                     "eltwise post op scale must be 1 (no scale)");
             CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));

-            // Copy all but the first, because it has been fused
-            post_ops_t post_ops;
-            for (int idx = 1; idx < base_post_ops.len(); ++idx) {
-                // Construct empty entry then copy, so that we can check for failure
-                post_ops.entry_.emplace_back();
-                post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
-            }
-            return init(engine, post_ops, dst_md);
-
+            // post_op_start_index + 1 to skip the fused eltwise
+            return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
         } else {
             // Nothing to fuse, just copy all post ops
-            return init(engine, base_post_ops, dst_md);
+            return init(engine, base_post_ops, dst_md, post_op_start_index);
         }
     }

@@ -179,6 +175,9 @@ struct acl_post_ops_t {
 private:
     // Index of the sum post op if there is one, < 0 means no sum
     int sum_index = -1;
+    // Index of the first post op this primitive executes. This is typically the
+    // number of post ops which were fused.
+    int post_op_start_index_ = 0;
     data_type_t dst_data_type;
     // Vector of primitives used to execute the post ops. They are constructed
     // in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
--
2.34.1
@@ -1,111 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 4a43b24c5..1a5cfe590 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -1,6 +1,7 @@
 /*******************************************************************************
 * Copyright 2018-2023 Intel Corporation
 * Copyright 2020-2023 FUJITSU LIMITED
+* Copyright 2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
     return cpu().isAtomicSupported();
 }

-inline bool isa_has_bf16(cpu_isa_t isa) {
-    return false;
+static inline bool mayiuse_bf16() {
+    using namespace Xbyak_aarch64::util;
+    return cpu().isBf16Supported();
 }
-
 } // namespace

 /* whatever is required to generate string literals... */
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 6bd259ec2..5541bb702 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -1,7 +1,7 @@
 /*******************************************************************************
 * Copyright 2018-2023 Intel Corporation
 * Copyright 2020-2023 FUJITSU LIMITED
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022-2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {

         bool ok = true && p.ndims > 0
                 && utils::one_of(p.itype, f32, s32, data_type::s8, u8)
-                && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
+                && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
                 && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
                 && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
-                && simple_impl_desc_init(p, nullptr)
-                && prb_has_small_strides(p);
+                && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
+                && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));

         return ok;
     }
@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
                 cvt_v_s32_u8(startIdx, regNum);
                 if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
                 break;
+            case bf16:
+                if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
+                break;
             default: assert(!"unreachable");
         }
     };
@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
         UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
     }

+    void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
+        UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
+    }
+
     void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
         cvt_z_b_s(startIdx, regNum);
         UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index ba5499ba9..d4e21d316 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020-2022 Intel Corporation
+* Copyright 2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
         DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
         DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))

+        DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
         DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
         DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
         DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))
@@ -1,31 +0,0 @@
*******************************************************************************
Copyright 2024 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index f043fee4bc..0384cce757 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,

     CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

-    // Indirect is slower than gemm for low thread counts, except for fast math
-    if (dnnl_get_max_threads() < 28 && !acp.fast_math)
-        return status::unimplemented;
-
     // If we do not need to pad input channels for fast math mode then it would
     // be faster to run convolution with im2row instead of using indirect kernel
     int block_by = arm_compute::block_by(acp.weights_info.weight_format());
@@ -1,371 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
new file mode 100644
index 000000000..061751b55
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/aarch64/acl_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+    // Lock here is needed because resource_mapper does not support
+    // concurrent multithreaded access.
+    std::lock_guard<std::mutex> _lock {this->mtx};
+
+    auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
+    auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
+
+    // Retrieve primitive resource and configured Compute Library objects
+    auto *acl_resource
+            = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
+
+    acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
+
+    acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
+    acl_obj.dst_tensor.allocator()->import_memory(dst);
+
+    acl_obj.reorder.run();
+
+    acl_obj.src_tensor.allocator()->free();
+    acl_obj.dst_tensor.allocator()->free();
+
+    return status::success;
+}
+
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
new file mode 100644
index 0000000000..edbc38914d
--- /dev/null
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -0,0 +1,262 @@
+/*******************************************************************************
+* Copyright 2023 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
+
+#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/reorder/cpu_reorder_pd.hpp"
+#include "arm_compute/core/Types.h"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+struct acl_reorder_obj_t {
+    arm_compute::NEReorderLayer reorder;
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor dst_tensor;
+    arm_compute::WeightFormat src_wf;
+    arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_conf_t {
+    arm_compute::TensorInfo src_info;
+    arm_compute::TensorInfo dst_info;
+    arm_compute::WeightFormat src_wf;
+    arm_compute::WeightFormat dst_wf;
+};
+
+struct acl_reorder_resource_t : public resource_t {
+    acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
+
+    status_t configure(const acl_reorder_conf_t &app) {
+        if (!acl_obj_) return status::out_of_memory;
+
+        // Init Compute Library tensors based on info from descriptor
+        acl_obj_->src_tensor.allocator()->init(app.src_info);
+        acl_obj_->dst_tensor.allocator()->init(app.dst_info);
+
+        // clang-format off
+        acl_obj_->reorder.configure(
+            &acl_obj_->src_tensor,
+            &acl_obj_->dst_tensor,
+            app.src_wf,
+            app.dst_wf
+        );
+        // clang-format on
+
+        return status::success;
+    }
+
+    acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
+    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
+
+private:
+    std::unique_ptr<acl_reorder_obj_t> acl_obj_;
+}; // acl_reorder_resource_t
+
+struct acl_reorder_fwd_t : public primitive_t {
+    using primitive_t::primitive_t;
+    struct pd_t : public cpu_reorder_pd_t {
+
+        using cpu_reorder_pd_t::cpu_reorder_pd_t;
+
+        DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
+
+        static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
+                const primitive_attr_t *attr, engine_t *src_engine,
+                const memory_desc_t *src_md, engine_t *dst_engine,
+                const memory_desc_t *dst_md) {
+
+            using namespace acl_utils;
+            // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
+
+            bool ok = src_md->data_type
+                            == dst_md->data_type // ACL only supports matching src/dst data types
+                    && utils::one_of(src_md->data_type,
+                            data_type::f32) // Only supports f32 for now
+                    && attr->has_default_values();
+            if (!ok) return status::unimplemented;
+
+            int mask = -1;
+            bool is_set = false;
+            // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
+            const memory_desc_wrapper input_d(src_md);
+            if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
+                return status::unimplemented;
+
+            // Create and check primitive descriptor
+            auto _pd = new pd_t(attr, src_engine->kind(), src_md,
+                    dst_engine->kind(), dst_md);
+            if (_pd == nullptr) return status::out_of_memory;
+            if (_pd->init(engine, src_engine, dst_engine) != status::success) {
+                delete _pd;
+                return status::unimplemented;
+            }
+
+            const memory_desc_wrapper src_d(*src_md);
+            const memory_desc_wrapper dst_d(*dst_md);
+
+            const int ndims = src_d.ndims();
+
+            auto src_tag = memory_desc_matches_one_of_tag(
+                    *src_md, format_tag::ba, format_tag::cdba);
+            ACL_CHECK_SUPPORT(
+                    utils::one_of(format_tag::undef, src_tag),
+                    "");
+
+            arm_compute::TensorShape acl_tensor_shape_in;
+            arm_compute::TensorShape acl_tensor_shape_out;
+            // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
+            int dim_0_rounded_up;
+
+            // Switch for 2 or 4 dim tensors
+            switch(ndims)
+            {
+                // Currently for Ab4a and Ab8a
+                // No format_tag for these, have to deduce from stride
+                case 2:
+                {
+                    if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+                        return status::unimplemented;
+                    }
+                    int dst_dim_1 = dst_md->dims[1];
+                    int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
+                    int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
+                    // Interleave of 4 or 8 that stride for dim 1
+                    if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
+                        return status::unimplemented;
+                    }
+                    // Check to ensure it's a blocking transpose
+                    if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
+                        return status::unimplemented;
+                    }
+                    if(dst_dim_1_stride == 4){
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 4);
+                    } else {
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 8);
+                    }
+                    acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
+                    acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
+
+                    break;
+                }
+                // Currently for Acdb4a and Acdb8a
+                case 4:
+                {
+
+                    auto dst_tag = memory_desc_matches_one_of_tag(
+                            *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
+                    ACL_CHECK_SUPPORT(
+                            utils::one_of(format_tag::undef, dst_tag),
+                            "");
+                    if(dst_tag == format_tag::Acdb4a){
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 4);
+                    }
+                    else{
+                        // Set Dest WeightFormat
+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
+                        dim_0_rounded_up
+                                = utils::rnd_up(src_md->dims[0], 8);
+                    }
+                    // Currently only supporting AxBx1x1 cases
+                    if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
+                        return status::unimplemented;
+                    }
+                    if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
+                        return status::unimplemented;
+                    }
+                    acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
+                    acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
+                    break;
+                }
+                default:
+                    return status::unimplemented;
+            }
+
+            // Choose the data layout
+            // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
+            const auto acl_layout = arm_compute::DataLayout::NCHW;
+
+            // Set Source WeightFormat
+            _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
+
+            // Create ACL tensor infos
+            const data_type_t data_type = src_d.data_type();
+            const arm_compute::DataType acl_data_t
+                    = acl_utils::get_acl_data_t(data_type);
+ _pd->app_.src_info = arm_compute::TensorInfo(
|
||||
+ acl_tensor_shape_in, 1, acl_data_t, acl_layout);
|
||||
+ _pd->app_.dst_info = arm_compute::TensorInfo(
|
||||
+ acl_tensor_shape_out, 1, acl_data_t, acl_layout);
|
||||
+
|
||||
+ // Init scratch memory, not used so 0 in this implementation
|
||||
+ _pd->init_scratchpad_md();
|
||||
+
|
||||
+ return safe_ptr_assign(*reorder_pd, _pd);
|
||||
+ } // create
|
||||
+
|
||||
+ friend dnnl::impl::impl_list_item_t;
|
||||
+ acl_reorder_conf_t app_;
|
||||
+
|
||||
+ }; // pd_t
|
||||
+
|
||||
+ acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
|
||||
+
|
||||
+ status_t create_resource(
|
||||
+ engine_t *engine, resource_mapper_t &mapper) const override {
|
||||
+ if (mapper.has_resource(this)) return status::success;
|
||||
+
|
||||
+ auto r = utils::make_unique<acl_reorder_resource_t>();
|
||||
+ if (!r) return status::out_of_memory;
|
||||
+
|
||||
+ // Configure the resource based on information from primitive descriptor
|
||||
+ CHECK(r->configure(pd()->app_));
|
||||
+
|
||||
+ mapper.add(this, std::move(r));
|
||||
+ return status::success;
|
||||
+ }
|
||||
+
|
||||
+ status_t execute(const exec_ctx_t &ctx) const override {
|
||||
+ return execute_forward(ctx);
|
||||
+ }
|
||||
+
|
||||
+private:
|
||||
+ // To guard the const execute_forward, the mutex must be 'mutable'
|
||||
+ mutable std::mutex mtx;
|
||||
+ status_t execute_forward(const exec_ctx_t &ctx) const;
|
||||
+ const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
|
||||
+
|
||||
+
|
||||
+}; // acl_reorder_fwd_t
|
||||
+
|
||||
+} // namespace aarch64
|
||||
+} // namespace cpu
|
||||
+} // namespace impl
|
||||
+} // namespace dnnl
|
||||
+
|
||||
+#endif // CPU_AARCH64_ACL_REORDER_HPP
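oneDNN has no format_tag for Ab4a/Ab8a, so the pd_t::create() above deduces the block size from the destination strides. A minimal standalone sketch of that check follows; it is not code from this commit, and the helper names (deduce_block_size, rnd_up) are illustrative only.

#include <cstdint>

// Returns the deduced block size (4 or 8), or 0 if the destination is not the
// kind of blocking transpose the ACL reorder above accepts. dims/strides
// mirror dst_md->dims and dst_md->format_desc.blocking.strides.
int deduce_block_size(const int64_t dims[2], const int64_t strides[2]) {
    if (dims[0] == 1 || dims[1] == 1) return 0;  // degenerate shapes
    int64_t interleave = strides[1];             // 4 or 8 for Ab4a/Ab8a
    if (interleave != 4 && interleave != 8) return 0;
    // Stepping one row in dim 0 must skip a whole interleaved panel,
    // i.e. dims[1] * interleave elements.
    if (dims[1] * interleave != strides[0]) return 0;
    return static_cast<int>(interleave);
}

// Round dim 0 up to a multiple of the block size, as rnd_up does above:
// e.g. 10 rows blocked by 8 pad to 16.
inline int64_t rnd_up(int64_t v, int64_t block) {
    return ((v + block - 1) / block) * block;
}

For example, a 10x6 f32 matrix whose destination strides are {48, 8} passes the check with block size 8, and dim 0 pads from 10 to 16.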
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a4150b619..f4d6b4de3 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -16,6 +16,7 @@
 *******************************************************************************/

 #include "cpu/reorder/cpu_reorder.hpp"
+#include "cpu/aarch64/acl_reorder.hpp"

 namespace dnnl {
 namespace impl {
@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
         // f32 -> f32
         {{f32, f32, 0}, {
             REG_FAST_DIRECT_COPY_F32_F32
+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))

             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
             nullptr,
         }},
         {{f32, f32, 4}, {
+
+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
             CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)

             REG_FAST_DIRECT_COPY_F32_F32
(deleted vendored patch file; name not rendered)
@@ -1,97 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..bd7bed837 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
 #endif

 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_tp_scheduler() {
-    static std::once_flag flag_once;
-    // Create threadpool scheduler
-    std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
-            = std::make_unique<ThreadpoolScheduler>();
+void acl_set_tp_scheduler(int intra_threads = 0) {
+    static thread_local std::once_flag flag_once;
     // set CUSTOM scheduler in ACL
     std::call_once(flag_once,
-            [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+            [&]() {
+                // Create threadpool scheduler
+                std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+                        = std::make_unique<ThreadpoolScheduler>();
+                threadpool_scheduler->set_num_threads(intra_threads);
+
+                arm_compute::Scheduler::set(threadpool_scheduler); });
 }

 void acl_set_threadpool_num_threads() {
@@ -102,14 +105,6 @@ void set_acl_threading() {
         acl_set_benchmark_scheduler_default();
     }
 #endif
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-    if (verbose_has_profile_externals()) {
-        acl_set_tp_benchmark_scheduler();
-    } else {
-        acl_set_tp_scheduler();
-    }
-
-#endif
 }

 } // namespace acl_thread_utils
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index f073376e6..654a2aa5d 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();

 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 // Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_tp_scheduler();
+void acl_set_tp_scheduler(int intra_threads);
 void acl_set_threadpool_num_threads();
 // Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
 void acl_set_tp_benchmark_scheduler();
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 439ca862e..6656c37a5 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
 void ThreadpoolScheduler::run_workloads(
         std::vector<arm_compute::IScheduler::Workload> &workloads) {

-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
     const unsigned int num_threads
             = std::min(static_cast<unsigned int>(_num_threads),
                     static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 0bfec3871..7207b2b60 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
 #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
 status_t cpu_engine_t::create_stream(stream_t **stream,
         dnnl::threadpool_interop::threadpool_iface *threadpool) {
+    dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
     return safe_ptr_assign<stream_t>(
             *stream, new cpu_stream_t(this, threadpool));
 }
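The deleted patch above relied on a per-thread std::once_flag so that each calling thread installs its own ACL scheduler exactly once. A minimal standalone sketch of that idiom follows (illustrative names, not code from this commit):

#include <memory>
#include <mutex>

struct Scheduler { /* stand-in for arm_compute::IScheduler */ };

void install_scheduler_once(int num_threads) {
    // thread_local: each thread gets its own flag, so the lambda runs once
    // per thread rather than once per process.
    static thread_local std::once_flag flag_once;
    std::call_once(flag_once, [&]() {
        auto scheduler = std::make_shared<Scheduler>();
        // ... configure scheduler with num_threads and register it ...
        (void)scheduler;
        (void)num_threads;
    });
}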
(deleted vendored patch file; name not rendered)
@@ -1,43 +0,0 @@
*******************************************************************************
Copyright 2023 Arm Limited and affiliates.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index fd2c76d01..2d7c76d48 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -17,6 +17,8 @@
 #include "cpu/aarch64/acl_thread.hpp"
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 #include "cpu/aarch64/acl_threadpool_scheduler.hpp"
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+#include <thread>
 #endif
 #include "cpu/aarch64/acl_benchmark_scheduler.hpp"

@@ -30,9 +32,10 @@ namespace acl_thread_utils {
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
 void acl_thread_bind() {
     static std::once_flag flag_once;
-    // The threads in Compute Library are bound for the cores 0..max_threads-1
-    // dnnl_get_max_threads() returns OMP_NUM_THREADS
-    const int max_threads = dnnl_get_max_threads();
+    // Cap the number of threads to 90% of the total core count
+    // to ensure Compute Library doesn't use too much resource
+    int capped_threads = (int)std::floor(0.9 * std::thread::hardware_concurrency());
+    const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
     // arm_compute::Scheduler does not support concurrent access thus a
     // workaround here restricts it to only one call
     std::call_once(flag_once, [&]() {
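For context on the capping logic in the patch just deleted: on a 64-core machine it limited ACL to floor(0.9 x 64) = 57 threads. A minimal standalone sketch of that arithmetic (illustrative only, not code from this commit):

#include <algorithm>
#include <cmath>
#include <thread>

// Cap a requested thread count at 90% of the hardware's logical cores,
// mirroring the arithmetic in the deleted thread-cap patch.
int capped_thread_count(int requested) {
    int cores = static_cast<int>(std::thread::hardware_concurrency());
    int cap = static_cast<int>(std::floor(0.9 * cores));  // e.g. 64 -> 57
    return std::min(cap, requested);
}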
third_party/xla/third_party/mkl_dnn/onednn_acl_threadpool_default_max.patch (vendored new file, 180 lines)
@@ -0,0 +1,180 @@
# *******************************************************************************
# Copyright 2025 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index 53175a05f9..89731cb356 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -83,17 +83,20 @@ void acl_set_threadpool_num_threads() {
 }
 // Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
 void acl_set_tp_benchmark_scheduler() {
-    static std::once_flag flag_once;
-    // Create threadpool scheduler
-    std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
-            = std::make_unique<ThreadpoolScheduler>();
-    arm_compute::IScheduler *_real_scheduler = nullptr;
-    _real_scheduler = threadpool_scheduler.release();
-    // Create benchmark scheduler and set TP as real scheduler
-    std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
-            = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
-    std::call_once(flag_once,
-            [&]() { arm_compute::Scheduler::set(benchmark_scheduler); });
+    static thread_local std::once_flag flag_once;
+    std::call_once(flag_once, [&]() {
+        // Create threadpool scheduler
+        std::unique_ptr<arm_compute::IScheduler> threadpool_scheduler
+                = std::make_unique<ThreadpoolScheduler>();
+        arm_compute::IScheduler *_real_scheduler = nullptr;
+        _real_scheduler = threadpool_scheduler.release();
+
+        // Create benchmark scheduler and set TP as real scheduler
+        std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+                = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+        arm_compute::Scheduler::set(benchmark_scheduler);
+    });
 }
 #endif

diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 30910398d9..34cf44b7e2 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022-2024 Arm Ltd. and affiliates
+* Copyright 2022-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,24 +18,17 @@

 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL

-#include "cpu/aarch64/acl_thread.hpp"
-
 #include "common/counting_barrier.hpp"
 #include "common/dnnl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"

 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/IScheduler.h"

-// BARRIER
 #include <atomic>
 #include <cassert>
-#include <chrono>
 #include <mutex>
-#include <thread>
-#include <condition_variable>

 namespace dnnl {
 namespace impl {
@@ -51,7 +44,7 @@ public:

     /// Function to check the next element in the range if there is one.
     bool get_next(unsigned int &next) {
-        next = atomic_fetch_add_explicit(
+        next = std::atomic_fetch_add_explicit(
                 &_atomic_counter, 1u, std::memory_order_relaxed);
         return next < _end;
     }
@@ -70,11 +63,8 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads,
     } while (feeder.get_next(workload_index));
 }

-ThreadpoolScheduler::ThreadpoolScheduler() {
-    using namespace dnnl::impl::threadpool_utils;
-    // Set number of threads to one when threadpool is not available.
-    _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint();
-}
+ThreadpoolScheduler::ThreadpoolScheduler()
+    : _num_threads(dnnl_get_max_threads()) {}

 ThreadpoolScheduler::~ThreadpoolScheduler() = default;

@@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const {
 }

 void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) {
-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-    _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
+    std::lock_guard<std::mutex> lock(this->_mtx);
+    _num_threads = num_threads == 0 ? dnnl_get_max_threads() : num_threads;
 }

 void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
@@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
 void ThreadpoolScheduler::run_workloads(
         std::vector<arm_compute::IScheduler::Workload> &workloads) {

-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
+    std::lock_guard<std::mutex> lock(this->_mtx);

     const unsigned int num_threads
             = std::min(static_cast<unsigned int>(_num_threads),
diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
index e9ba21c803..384dfec1b9 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022, 2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,7 +22,8 @@
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL

 #include "arm_compute/runtime/IScheduler.h"
-#include "support/Mutex.h"
+
+#include <mutex>

 namespace dnnl {
 namespace impl {
@@ -32,7 +33,7 @@ namespace aarch64 {
 class ThreadpoolScheduler final : public arm_compute::IScheduler {
 public:
     ThreadpoolScheduler();
-    ~ThreadpoolScheduler();
+    ~ThreadpoolScheduler() override;

     /// Sets the number of threads the scheduler will use to run the kernels.
     void set_num_threads(unsigned int num_threads) override;
@@ -54,8 +55,8 @@ protected:
     void run_workloads(std::vector<Workload> &workloads) override;

 private:
-    uint _num_threads {};
-    arm_compute::Mutex _run_workloads_mutex {};
+    unsigned int _num_threads {};
+    std::mutex _mtx;
 };

 } // namespace aarch64
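The net effect of the new patch above: ThreadpoolScheduler now starts at oneDNN's maximum thread count instead of 1, and set_num_threads(0) falls back to that maximum. A minimal standalone sketch of the fallback (illustrative names only; max_threads() stands in for dnnl_get_max_threads()):

#include <mutex>

// Stand-in for dnnl_get_max_threads(); assume it reports the runtime's
// maximum available threads.
int max_threads() { return 16; }

class Scheduler {
public:
    Scheduler() : num_threads_(max_threads()) {}  // default to the max

    void set_num_threads(unsigned int n) {
        std::lock_guard<std::mutex> lock(mtx_);
        // 0 means "use the default", which is now the max rather than 1.
        num_threads_ = (n == 0) ? static_cast<unsigned int>(max_threads()) : n;
    }

private:
    unsigned int num_threads_;
    std::mutex mtx_;
};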
third_party/xla/tsl_workspace2.bzl (vendored, 24 lines changed)
@@ -163,33 +163,23 @@ def _tf_repositories():
        name = "mkl_dnn_acl_compatible",
        build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
        patch_file = [
            "//third_party/mkl_dnn:onednn_acl_threadcap.patch",
            "//third_party/mkl_dnn:onednn_acl_reorder.patch",
            "//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
            "//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
            "//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
            "//third_party/mkl_dnn:onednn_acl_indirect_conv.patch",
            "//third_party/mkl_dnn:onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch",
            "//third_party/mkl_dnn:onednn_acl_fix_segfault_during_postop_execute.patch",
            "//third_party/mkl_dnn:onednn_acl_add_bf16_platform_support_check.patch",
            "//third_party/mkl_dnn:onednn_acl_add_sbgemm_matmul_primitive_definition.patch",
            "//third_party/mkl_dnn:onednn_acl_threadpool_default_max.patch",
        ],
        sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
        strip_prefix = "oneDNN-3.2.1",
        urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
        sha256 = "5792cbc07764c6e25c459ff68efb5cfcd7f4a0ba66dca6a4a2c681cd7a644596",
        strip_prefix = "oneDNN-3.7",
        urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.7.zip"),
    )

    tf_http_archive(
        name = "compute_library",
        patch_file = [
            "//third_party/compute_library:compute_library.patch",
            "//third_party/compute_library:acl_thread_local_scheduler.patch",
            "//third_party/compute_library:exclude_omp_scheduler.patch",
            "//third_party/compute_library:include_string.patch",
        ],
        sha256 = "c4ca329a78da380163b2d86e91ba728349b6f0ee97d66e260a694ef37f0b0d93",
        strip_prefix = "ComputeLibrary-23.05.1",
        urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v23.05.1.tar.gz"),
        sha256 = "8273f68cd0bb17e9231a11a6618d245eb6d623884ae681c00e7a4eabca2dad42",
        strip_prefix = "ComputeLibrary-24.12",
        urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/refs/tags/v24.12.tar.gz"),
    )

    tf_http_archive(
third_party/xla/xla/tsl/mkl/BUILD.bazel (vendored, 8 lines changed)
@@ -82,14 +82,6 @@ config_setting(
    },
)

config_setting(
    name = "build_with_mkl_aarch64_openmp",
    define_values = {
        "build_with_mkl_aarch64": "true",
        "build_with_openmp": "true",
    },
)

filegroup(
    name = "LICENSE",
    srcs = [
third_party/xla/xla/tsl/mkl/build_defs.bzl (vendored, 7 lines changed)
@@ -7,7 +7,6 @@ if_mkl_lnx_x64 is a conditional to check for MKL
if_enable_mkl is a conditional to check if building with MKL and MKL is enabled.
if_mkldnn_openmp checks if we are building x86 backend with OpenMP.
if_mkldnn_aarch64_acl checks if we are building with Arm Compute Library.
if_mkldnn_aarch64_acl_openmp checks if we are building ACL with OpenMP.

mkl_repository is a repository rule for creating MKL repository rule that can
be pointed to either a local folder, or download it from the internet.

@@ -146,12 +145,6 @@ def if_mkldnn_aarch64_acl(if_true, if_false = []):
        "//conditions:default": if_false,
    })

def if_mkldnn_aarch64_acl_openmp(if_true, if_false = []):
    return select({
        "@local_xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": if_true,
        "//conditions:default": if_false,
    })

# Temporarily disable Graph API on aarch64 until we change the aarch64 BUILD
# file to support Graph API.
def if_graph_api(if_true, if_false = []):
third_party/xla/xla/tsl/tsl.bzl (vendored, 2 lines changed)
@@ -10,7 +10,6 @@ load(
    "if_enable_mkl",
    "if_mkl",
    "if_mkldnn_aarch64_acl",
    "if_mkldnn_aarch64_acl_openmp",
    "if_mkldnn_openmp",
    "onednn_v3_define",
)

@@ -334,7 +333,6 @@ def tsl_copts(
        if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
        onednn_v3_define() +
        if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) +
        if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
        if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) +
        if_android_arm(["-mfpu=neon", "-fomit-frame-pointer"]) +
        if_linux_x86_64(["-msse3"]) +
@@ -154,9 +154,7 @@ class OneDnnThreadPool : public threadpool_iface {
  static void set_onednn_max_threads(int num_threads) {
#if DNNL_VERSION_MAJOR >= 3 || \
    (DNNL_VERSION_MAJOR == 2 && DNNL_VERSION_MINOR >= 7)
#ifndef DNNL_AARCH64_USE_ACL
    dnnl_threadpool_interop_set_max_concurrency(num_threads);
#endif  // DNNL_AARCH64_USE_ACL
#endif  // DNNL_VERSION_MAJOR >= 3 ||
        // (DNNL_VERSION_MAJOR == 2 && DNNL_VERSION_MINOR >= 7)
  }