diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 2412168c9d..62bddbb413 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -12,6 +12,7 @@ ocv_add_dispatched_file(mean SSE2 AVX2 LASX) ocv_add_dispatched_file(merge SSE2 AVX2 LASX) ocv_add_dispatched_file(split SSE2 AVX2 LASX) ocv_add_dispatched_file(sum SSE2 AVX2 LASX) +ocv_add_dispatched_file(norm SSE2 SSE4_1 AVX AVX2 NEON_DOTPROD LASX) # dispatching for accuracy tests ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX) diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.dispatch.cpp similarity index 92% rename from modules/core/src/norm.cpp rename to modules/core/src/norm.dispatch.cpp index 0452e40a55..e43e33c92e 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.dispatch.cpp @@ -7,6 +7,9 @@ #include "opencl_kernels_core.hpp" #include "stat.hpp" +#include "norm.simd.hpp" +#include "norm.simd_declarations.hpp" + /****************************************************************************************\ * norm * \****************************************************************************************/ @@ -215,72 +218,6 @@ int normL1_(const uchar* a, const uchar* b, int n) //================================================================================================== -template int -normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) -{ - ST result = *_result; - if( !mask ) - { - result = std::max(result, normInf(src, len*cn)); - } - else - { - for( int i = 0; i < len; i++, src += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - result = std::max(result, ST(cv_abs(src[k]))); - } - } - *_result = result; - return 0; -} - -template int -normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) -{ - ST result = *_result; - if( !mask ) - { - result += normL1(src, len*cn); - } - else - { - for( int i = 0; i < len; i++, src += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - result += cv_abs(src[k]); - } - } - *_result = result; - return 0; -} - -template int -normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) -{ - ST result = *_result; - if( !mask ) - { - result += normL2Sqr(src, len*cn); - } - else - { - for( int i = 0; i < len; i++, src += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - { - T v = src[k]; - result += (ST)v*v; - } - } - } - *_result = result; - return 0; -} - template int normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { @@ -347,51 +284,27 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le return 0; } -#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ - static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \ -{ return norm##L##_(src, mask, r, len, cn); } \ +#define CV_DEF_NORM_DIFF_FUNC(L, suffix, type, ntype) \ static int normDiff##L##_##suffix(const type* src1, const type* src2, \ const uchar* mask, ntype* r, int len, int cn) \ { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); } -#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \ - CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \ - CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \ - CV_DEF_NORM_FUNC(L2, suffix, type, l2type) - -CV_DEF_NORM_ALL(8u, uchar, int, int, int) -CV_DEF_NORM_ALL(8s, schar, int, int, int) -CV_DEF_NORM_ALL(16u, ushort, int, int, double) -CV_DEF_NORM_ALL(16s, short, int, int, double) 
-CV_DEF_NORM_ALL(32s, int, int, double, double) -CV_DEF_NORM_ALL(32f, float, float, double, double) -CV_DEF_NORM_ALL(64f, double, double, double, double) +#define CV_DEF_NORM_DIFF_ALL(suffix, type, inftype, l1type, l2type) \ + CV_DEF_NORM_DIFF_FUNC(Inf, suffix, type, inftype) \ + CV_DEF_NORM_DIFF_FUNC(L1, suffix, type, l1type) \ + CV_DEF_NORM_DIFF_FUNC(L2, suffix, type, l2type) +CV_DEF_NORM_DIFF_ALL(8u, uchar, int, int, int) +CV_DEF_NORM_DIFF_ALL(8s, schar, int, int, int) +CV_DEF_NORM_DIFF_ALL(16u, ushort, int, int, double) +CV_DEF_NORM_DIFF_ALL(16s, short, int, int, double) +CV_DEF_NORM_DIFF_ALL(32s, int, int, double, double) +CV_DEF_NORM_DIFF_ALL(32f, float, float, double, double) +CV_DEF_NORM_DIFF_ALL(64f, double, double, double, double) typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int); typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int); -static NormFunc getNormFunc(int normType, int depth) -{ - static NormFunc normTab[3][8] = - { - { - (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s), - (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0 - }, - { - (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s), - (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0 - }, - { - (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s), - (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0 - } - }; - - return normTab[normType][depth]; -} - static NormDiffFunc getNormDiffFunc(int normType, int depth) { static NormDiffFunc normDiffTab[3][8] = @@ -603,6 +516,11 @@ static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result) } // ipp_norm() #endif // HAVE_IPP +static NormFunc getNormFunc(int normType, int depth) { + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getNormFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL); +} + double norm( InputArray _src, int normType, InputArray _mask ) { CV_INSTRUMENT_REGION(); @@ -637,6 +555,9 @@ double norm( InputArray _src, int normType, InputArray _mask ) CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result); + NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth); + CV_Assert( func != 0 ); + if( src.isContinuous() && mask.empty() ) { size_t len = src.total()*cn; @@ -644,30 +565,18 @@ double norm( InputArray _src, int normType, InputArray _mask ) { if( depth == CV_32F ) { - const float* data = src.ptr(); + const uchar* data = src.ptr(); - if( normType == NORM_L2 ) + if( normType == NORM_L2 || normType == NORM_L2SQR || normType == NORM_L1 ) { double result = 0; - GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1); - return std::sqrt(result); - } - if( normType == NORM_L2SQR ) - { - double result = 0; - GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1); - return result; - } - if( normType == NORM_L1 ) - { - double result = 0; - GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1); - return result; + func(data, 0, (uchar*)&result, (int)len, 1); + return normType == NORM_L2 ? 
std::sqrt(result) : result; } if( normType == NORM_INF ) { float result = 0; - GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1); + func(data, 0, (uchar*)&result, (int)len, 1); return result; } } @@ -714,9 +623,6 @@ double norm( InputArray _src, int normType, InputArray _mask ) return result; } - NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth); - CV_Assert( func != 0 ); - const Mat* arrays[] = {&src, &mask, 0}; uchar* ptrs[2] = {}; union diff --git a/modules/core/src/norm.rvv1p0.hpp b/modules/core/src/norm.rvv1p0.hpp new file mode 100644 index 0000000000..3db05c50a4 --- /dev/null +++ b/modules/core/src/norm.rvv1p0.hpp @@ -0,0 +1,200 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. + +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { + +namespace { + +// [TODO] Drop this once rvv has dedicated intrinsics for abs on integers. +template inline ST __riscv_vabs(const T&); + +template<> inline +vuint8m1_t __riscv_vabs(const vint8m1_t& v) { + const int vle8m1 = __riscv_vsetvlmax_e8m1(); + vint8m1_t mask = __riscv_vsra_vx_i8m1(v, 7, vle8m1); + vint8m1_t v_xor = __riscv_vxor_vv_i8m1(v, mask, vle8m1); + return __riscv_vreinterpret_v_i8m1_u8m1( + __riscv_vsub_vv_i8m1(v_xor, mask, vle8m1) + ); +} + +template<> inline +vuint16m1_t __riscv_vabs(const vint16m1_t& v) { + const int vle16m1 = __riscv_vsetvlmax_e16m1(); + vint16m1_t mask = __riscv_vsra_vx_i16m1(v, 15, vle16m1); + vint16m1_t v_xor = __riscv_vxor_vv_i16m1(v, mask, vle16m1); + return __riscv_vreinterpret_v_i16m1_u16m1( + __riscv_vsub_vv_i16m1(v_xor, mask, vle16m1) + ); +} + +template<> inline +vuint32m1_t __riscv_vabs(const vint32m1_t& v) { + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vint32m1_t mask = __riscv_vsra_vx_i32m1(v, 31, vle32m1); + vint32m1_t v_xor = __riscv_vxor_vv_i32m1(v, mask, vle32m1); + return __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vsub_vv_i32m1(v_xor, mask, vle32m1) + ); +} +} + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +template inline +ST normInf_rvv(const T* src, int n, int& j); + +template<> inline +int normInf_rvv(const int* src, int n, int& j) { + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); + vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); + for (; j <= n - 2 * vle32m1; j += 2 * vle32m1) { + vuint32m1_t v0 = __riscv_vabs(__riscv_vle32_v_i32m1(src + j, vle32m1)); + r0 = __riscv_vmaxu(r0, v0, vle32m1); + + vuint32m1_t v1 = __riscv_vabs(__riscv_vle32_v_i32m1(src + j + vle32m1, vle32m1)); + r1 = __riscv_vmaxu(r1, v1, vle32m1); + } + r0 = __riscv_vmaxu(r0, r1, vle32m1); + return (int)__riscv_vmv_x(__riscv_vredmaxu(r0, __riscv_vmv_v_x_u32m1(0, vle32m1), vle32m1)); + } + +template inline +ST normL1_rvv(const T* src, int n, int& j); + +template<> inline +int normL1_rvv(const schar* src, int n, int& j) { + const int vle8m1 = __riscv_vsetvlmax_e8m1(); + const int vle16m1 = __riscv_vsetvlmax_e16m1(); + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); + vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vle16m1); + for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) { + vuint8m1_t v0 = __riscv_vabs(__riscv_vle8_v_i8m1(src + j, vle8m1)); + vuint16m1_t u0 = __riscv_vwredsumu_tu(zero, v0, zero, vle8m1); + r0 =
__riscv_vwredsumu(u0, r0, vle16m1); + + vuint8m1_t v1 = __riscv_vabs(__riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1)); + vuint16m1_t u1 = __riscv_vwredsumu_tu(zero, v1, zero, vle8m1); + r1 = __riscv_vwredsumu(u1, r1, vle16m1); + } + return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); +} + +template<> inline +int normL1_rvv(const ushort* src, int n, int& j) { + const int vle16m1 = __riscv_vsetvlmax_e16m1(); + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); + vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); + for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) { + vuint16m1_t v0 = __riscv_vle16_v_u16m1(src + j, vle16m1); + r0 = __riscv_vwredsumu(v0, r0, vle16m1); + + vuint16m1_t v1 = __riscv_vle16_v_u16m1(src + j + vle16m1, vle16m1); + r1 = __riscv_vwredsumu(v1, r1, vle16m1); + } + return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); +} + +template<> inline +int normL1_rvv(const short* src, int n, int& j) { + const int vle16m1 = __riscv_vsetvlmax_e16m1(); + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); + vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); + for (; j<= n - 2 * vle16m1; j += 2 * vle16m1) { + vuint16m1_t v0 = __riscv_vabs(__riscv_vle16_v_i16m1(src + j, vle16m1)); + r0 = __riscv_vwredsumu(v0, r0, vle16m1); + + vuint16m1_t v1 = __riscv_vabs(__riscv_vle16_v_i16m1(src + j + vle16m1, vle16m1)); + r1 = __riscv_vwredsumu(v1, r1, vle16m1); + } + return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); +} + +template<> inline +double normL1_rvv(const double* src, int n, int& j) { + const int vle64m1 = __riscv_vsetvlmax_e64m1(); + vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); + vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); + for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) { + vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1); + v0 = __riscv_vfabs(v0, vle64m1); + r0 = __riscv_vfadd(r0, v0, vle64m1); + + vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1); + v1 = __riscv_vfabs(v1, vle64m1); + r1 = __riscv_vfadd(r1, v1, vle64m1); + } + r0 = __riscv_vfadd(r0, r1, vle64m1); + return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1)); +} + +template inline +ST normL2_rvv(const T* src, int n, int& j); + +template<> inline +int normL2_rvv(const uchar* src, int n, int& j) { + const int vle8m1 = __riscv_vsetvlmax_e8m1(); + const int vle16m1 = __riscv_vsetvlmax_e16m1(); + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); + vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); + for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) { + vuint8m1_t v0 = __riscv_vle8_v_u8m1(src + j, vle8m1); + vuint16m2_t u0 = __riscv_vwmulu(v0, v0, vle8m1); + r0 = __riscv_vwredsumu(u0, r0, vle16m1 * 2); + + vuint8m1_t v1 = __riscv_vle8_v_u8m1(src + j + vle8m1, vle8m1); + vuint16m2_t u1 = __riscv_vwmulu(v1, v1, vle8m1); + r1 = __riscv_vwredsumu(u1, r1, vle16m1 * 2); + } + return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); +} + +template<> inline +int normL2_rvv(const schar* src, int n, int& j) { + const int vle8m1 = __riscv_vsetvlmax_e8m1(); + const int vle16m1 = __riscv_vsetvlmax_e16m1(); + const int vle32m1 = __riscv_vsetvlmax_e32m1(); + vint32m1_t r0 = __riscv_vmv_v_x_i32m1(0, vle32m1); + vint32m1_t r1 = __riscv_vmv_v_x_i32m1(0, vle32m1); + for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) { + vint8m1_t v0 = __riscv_vle8_v_i8m1(src + j, vle8m1); + vint16m2_t u0 = __riscv_vwmul(v0, v0, 
vle8m1); + r0 = __riscv_vwredsum(u0, r0, vle16m1 * 2); + + vint8m1_t v1 = __riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1); + vint16m2_t u1 = __riscv_vwmul(v1, v1, vle8m1); + r1 = __riscv_vwredsum(u1, r1, vle16m1 * 2); + } + return __riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); +} + +template<> inline +double normL2_rvv(const double* src, int n, int& j) { + const int vle64m1 = __riscv_vsetvlmax_e64m1(); + vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); + vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); + for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) { + vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1); + r0 = __riscv_vfmacc(r0, v0, v0, vle64m1); + + vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1); + r1 = __riscv_vfmacc(r1, v1, v1, vle64m1); + } + r0 = __riscv_vfadd(r0, r1, vle64m1); + return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1)); +} + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +} // cv:: diff --git a/modules/core/src/norm.simd.hpp b/modules/core/src/norm.simd.hpp new file mode 100644 index 0000000000..fd7b658ba1 --- /dev/null +++ b/modules/core/src/norm.simd.hpp @@ -0,0 +1,676 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" + +#if CV_RVV +#include "norm.rvv1p0.hpp" +#endif + +namespace cv { + +using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int); + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +NormFunc getNormFunc(int normType, int depth); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +template +struct NormInf_SIMD { + inline ST operator() (const T* src, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + s = std::max(s, (ST)cv_abs(src[i])); + } + return s; + } +}; + +template +struct NormL1_SIMD { + inline ST operator() (const T* src, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + s += cv_abs(src[i]); + } + return s; + } +}; + +template +struct NormL2_SIMD { + inline ST operator() (const T* src, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + ST v = src[i]; + s += v * v; + } + return s; + } +}; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + +template<> +struct NormInf_SIMD { + int operator() (const uchar* src, int n) const { + int j = 0; + int s = 0; + v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8(); + v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + r0 = v_max(r0, vx_load(src + j )); + r1 = v_max(r1, vx_load(src + j + VTraits::vlanes())); + r2 = v_max(r2, vx_load(src + j + 2 * VTraits::vlanes())); + r3 = v_max(r3, vx_load(src + j + 3 * VTraits::vlanes())); + } + r0 = v_max(r0, v_max(r1, v_max(r2, r3))); + for (; j < n; j++) { + s = std::max(s, (int)src[j]); + } + return std::max(s, (int)v_reduce_max(r0)); + } +}; + +template<> +struct NormInf_SIMD { + int operator() (const schar* src, int n) const { + int j = 0; + int s = 0; + v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8(); + v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + r0 = v_max(r0, v_abs(vx_load(src + j ))); + r1 = v_max(r1, v_abs(vx_load(src + j + VTraits::vlanes()))); + r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits::vlanes()))); + r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits::vlanes()))); + } + r0 = v_max(r0, v_max(r1, v_max(r2, r3))); + for (; j < n; j++) { + s = 
std::max(s, cv_abs(src[j])); + } + return std::max(s, saturate_cast(v_reduce_max(r0))); + } +}; + +template<> +struct NormInf_SIMD { + int operator() (const ushort* src, int n) const { + int j = 0; + int s = 0; + v_uint16 d0 = vx_setzero_u16(), d1 = vx_setzero_u16(); + v_uint16 d2 = vx_setzero_u16(), d3 = vx_setzero_u16(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + d0 = v_max(d0, vx_load(src + j )); + d1 = v_max(d1, vx_load(src + j + VTraits::vlanes())); + d2 = v_max(d2, vx_load(src + j + 2 * VTraits::vlanes())); + d3 = v_max(d3, vx_load(src + j + 3 * VTraits::vlanes())); + } + d0 = v_max(d0, v_max(d1, v_max(d2, d3))); + for (; j < n; j++) { + s = std::max(s, (int)src[j]); + } + return std::max(s, (int)v_reduce_max(d0)); + } +}; + +template<> +struct NormInf_SIMD { + int operator() (const short* src, int n) const { + int j = 0; + int s = 0; + v_uint16 d0 = vx_setzero_u16(), d1 = vx_setzero_u16(); + v_uint16 d2 = vx_setzero_u16(), d3 = vx_setzero_u16(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + d0 = v_max(d0, v_abs(vx_load(src + j ))); + d1 = v_max(d1, v_abs(vx_load(src + j + VTraits::vlanes()))); + d2 = v_max(d2, v_abs(vx_load(src + j + 2 * VTraits::vlanes()))); + d3 = v_max(d3, v_abs(vx_load(src + j + 3 * VTraits::vlanes()))); + } + d0 = v_max(d0, v_max(d1, v_max(d2, d3))); + for (; j < n; j++) { + s = std::max(s, saturate_cast(cv_abs(src[j]))); + } + return std::max(s, saturate_cast(v_reduce_max(d0))); + } +}; + +template<> +struct NormInf_SIMD { + int operator() (const int* src, int n) const { + int j = 0; + int s = 0; +#if CV_RVV + s = normInf_rvv(src, n, j); +#else + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + r0 = v_max(r0, v_abs(vx_load(src + j ))); + r1 = v_max(r1, v_abs(vx_load(src + j + VTraits::vlanes()))); + r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits::vlanes()))); + r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits::vlanes()))); + } + r0 = v_max(r0, v_max(r1, v_max(r2, r3))); + s = std::max(s, saturate_cast(v_reduce_max(r0))); +#endif + for (; j < n; j++) { + s = std::max(s, cv_abs(src[j])); + } + return s; + } +}; + +template<> +struct NormInf_SIMD { + float operator() (const float* src, int n) const { + int j = 0; + float s = 0.f; + v_float32 r0 = vx_setzero_f32(), r1 = vx_setzero_f32(); + v_float32 r2 = vx_setzero_f32(), r3 = vx_setzero_f32(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + r0 = v_max(r0, v_abs(vx_load(src + j ))); + r1 = v_max(r1, v_abs(vx_load(src + j + VTraits::vlanes()))); + r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits::vlanes()))); + r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits::vlanes()))); + } + r0 = v_max(r0, v_max(r1, v_max(r2, r3))); + for (; j < n; j++) { + s = std::max(s, cv_abs(src[j])); + } + return std::max(s, v_reduce_max(r0)); + } +}; + +template<> +struct NormL1_SIMD { + int operator() (const uchar* src, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint8 one = vx_setall_u8(1); + for (; j<= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint8 v0 = vx_load(src + j); + r0 = v_dotprod_expand_fast(v0, one, r0); + + v_uint8 v1 = vx_load(src + j + VTraits::vlanes()); + r1 = v_dotprod_expand_fast(v1, one, r1); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + s += src[j]; + } + return s; + } +}; + +template<> +struct 
NormL1_SIMD { + int operator() (const schar* src, int n) const { + int j = 0; + int s = 0; +#if CV_RVV + s = normL1_rvv(src, n, j); +#else + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint8 one = vx_setall_u8(1); + for (; j<= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint8 v0 = v_abs(vx_load(src + j)); + r0 = v_dotprod_expand_fast(v0, one, r0); + + v_uint8 v1 = v_abs(vx_load(src + j + VTraits::vlanes())); + r1 = v_dotprod_expand_fast(v1, one, r1); + } + s += v_reduce_sum(v_add(r0, r1)); +#endif + for (; j < n; j++) { + s += saturate_cast(cv_abs(src[j])); + } + return s; + } +}; + +template<> +struct NormL1_SIMD { + int operator() (const ushort* src, int n) const { + int j = 0; + int s = 0; +#if CV_RVV + s = normL1_rvv(src, n, j); +#else + v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32(); + v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32(); + for (; j<= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint16 v0 = vx_load(src + j); + v_uint32 v00, v01; + v_expand(v0, v00, v01); + r00 = v_add(r00, v00); + r01 = v_add(r01, v01); + + v_uint16 v1 = vx_load(src + j + VTraits::vlanes()); + v_uint32 v10, v11; + v_expand(v1, v10, v11); + r10 = v_add(r10, v10); + r11 = v_add(r11, v11); + } + s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); +#endif + for (; j < n; j++) { + s += src[j]; + } + return s; + } +}; + +template<> +struct NormL1_SIMD { + int operator() (const short* src, int n) const { + int j = 0; + int s = 0; +#if CV_RVV + s = normL1_rvv(src, n, j); +#else + v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32(); + v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32(); + for (; j<= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint16 v0 = v_abs(vx_load(src + j)); + v_uint32 v00, v01; + v_expand(v0, v00, v01); + r00 = v_add(r00, v00); + r01 = v_add(r01, v01); + + v_uint16 v1 = v_abs(vx_load(src + j + VTraits::vlanes())); + v_uint32 v10, v11; + v_expand(v1, v10, v11); + r10 = v_add(r10, v10); + r11 = v_add(r11, v11); + } + s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); +#endif + for (; j < n; j++) { + s += saturate_cast(cv_abs(src[j])); + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + int operator() (const uchar* src, int n) const { + int j = 0; + int s = 0; +#if CV_RVV + s = normL2_rvv(src, n, j); +#else + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint8 v0 = vx_load(src + j); + r0 = v_dotprod_expand_fast(v0, v0, r0); + + v_uint8 v1 = vx_load(src + j + VTraits::vlanes()); + r1 = v_dotprod_expand_fast(v1, v1, r1); + } + s += v_reduce_sum(v_add(r0, r1)); +#endif + for (; j < n; j++) { + int v = saturate_cast(src[j]); + s += v * v; + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + int operator() (const schar* src, int n) const { + int j = 0; + int s = 0; +#if CV_RVV + s = normL2_rvv(src, n, j); +#else + v_int32 r0 = vx_setzero_s32(), r1 = vx_setzero_s32(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int8 v0 = vx_load(src + j); + r0 = v_dotprod_expand_fast(v0, v0, r0); + v_int8 v1 = vx_load(src + j + VTraits::vlanes()); + r1 = v_dotprod_expand_fast(v1, v1, r1); + } + s += v_reduce_sum(v_add(r0, r1)); +#endif + for (; j < n; j++) { + int v = saturate_cast(src[j]); + s += v * v; + } + return s; + } +}; + +#endif + +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + +template<> +struct NormInf_SIMD { + double operator() (const double* src, int n) const { 
+ int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + r0 = v_max(r0, v_abs(vx_load(src + j ))); + r1 = v_max(r1, v_abs(vx_load(src + j + VTraits::vlanes()))); + r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits::vlanes()))); + r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits::vlanes()))); + } + r0 = v_max(r0, v_max(r1, v_max(r2, r3))); + for (; j < n; j++) { + s = std::max(s, cv_abs(src[j])); + } + // [TODO]: use v_reduce_max when it supports float64 + double t[VTraits::max_nlanes]; + vx_store(t, r0); + for (int i = 0; i < VTraits::vlanes(); i++) { + s = std::max(s, cv_abs(t[i])); + } + return s; + } +}; + +template<> +struct NormL1_SIMD { + double operator() (const int* src, int n) const { + int j = 0; + double s = 0.f; + v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64(); + v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float32 v0 = v_abs(v_cvt_f32(vx_load(src + j))), v1 = v_abs(v_cvt_f32(vx_load(src + j + VTraits::vlanes()))); + r00 = v_add(r00, v_cvt_f64(v0)); r01 = v_add(r01, v_cvt_f64_high(v0)); + r10 = v_add(r10, v_cvt_f64(v1)); r11 = v_add(r11, v_cvt_f64_high(v1)); + } + s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); + for (; j < n; j++) { + s += cv_abs(src[j]); + } + return s; + } +}; + +template<> +struct NormL1_SIMD { + double operator() (const float* src, int n) const { + int j = 0; + double s = 0.f; + v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64(); + v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64(); + v_float64 r20 = vx_setzero_f64(), r21 = vx_setzero_f64(); + v_float64 r30 = vx_setzero_f64(), r31 = vx_setzero_f64(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_float32 v0 = v_abs(vx_load(src + j)), v1 = v_abs(vx_load(src + j + VTraits::vlanes())); + r00 = v_add(r00, v_cvt_f64(v0)); r01 = v_add(r01, v_cvt_f64_high(v0)); + r10 = v_add(r10, v_cvt_f64(v1)); r11 = v_add(r11, v_cvt_f64_high(v1)); + + v_float32 v2 = v_abs(vx_load(src + j + 2 * VTraits::vlanes())), v3 = v_abs(vx_load(src + j + 3 * VTraits::vlanes())); + r20 = v_add(r20, v_cvt_f64(v2)); r21 = v_add(r21, v_cvt_f64_high(v2)); + r30 = v_add(r30, v_cvt_f64(v3)); r31 = v_add(r31, v_cvt_f64_high(v3)); + } + s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); + s += v_reduce_sum(v_add(v_add(v_add(r20, r21), r30), r31)); + for (; j < n; j++) { + s += cv_abs(src[j]); + } + return s; + } +}; + +template<> +struct NormL1_SIMD { + double operator() (const double* src, int n) const { + int j = 0; + double s = 0.f; +#if CV_RVV // This is introduced to workaround the accuracy issue on ci + s = normL1_rvv(src, n, j); +#else + v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64(); + v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + r00 = v_add(r00, v_abs(vx_load(src + j ))); + r01 = v_add(r01, v_abs(vx_load(src + j + VTraits::vlanes()))); + r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits::vlanes()))); + r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits::vlanes()))); + } + s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); +#endif + for (; j < n; j++) { + s += cv_abs(src[j]); + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + double operator() (const ushort* src, int n) const { + int j = 0; + double s = 
0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint16 v0 = vx_load(src + j); + v_uint64 u0 = v_dotprod_expand_fast(v0, v0); + r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0))); + + v_uint16 v1 = vx_load(src + j + VTraits::vlanes()); + v_uint64 u1 = v_dotprod_expand_fast(v1, v1); + r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1))); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + double v = saturate_cast(src[j]); + s += v * v; + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + double operator() (const short* src, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int16 v0 = vx_load(src + j); + r0 = v_add(r0, v_cvt_f64(v_dotprod_expand_fast(v0, v0))); + + v_int16 v1 = vx_load(src + j + VTraits::vlanes()); + r1 = v_add(r1, v_cvt_f64(v_dotprod_expand_fast(v1, v1))); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + double v = saturate_cast(src[j]); + s += v * v; + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + double operator() (const int* src, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int32 v0 = vx_load(src + j); + r0 = v_dotprod_expand_fast(v0, v0, r0); + + v_int32 v1 = vx_load(src + j + VTraits::vlanes()); + r1 = v_dotprod_expand_fast(v1, v1, r1); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + double v = src[j]; + s += v * v; + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + double operator() (const float* src, int n) const { + int j = 0; + double s = 0.f; + v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64(); + v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float32 v0 = vx_load(src + j), v1 = vx_load(src + j + VTraits::vlanes()); + v_float64 v00 = v_cvt_f64(v0), v01 = v_cvt_f64_high(v0); + v_float64 v10 = v_cvt_f64(v1), v11 = v_cvt_f64_high(v1); + r00 = v_fma(v00, v00, r00); r01 = v_fma(v01, v01, r01); + r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11); + } + s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); + for (; j < n; j++) { + double v = src[j]; + s += v * v; + } + return s; + } +}; + +template<> +struct NormL2_SIMD { + double operator() (const double* src, int n) const { + int j = 0; + double s = 0.f; +#if CV_RVV // This is introduced to workaround the accuracy issue on ci + s = normL2_rvv(src, n, j); +#else + v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64(); + v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_float64 v00 = vx_load(src + j ); + v_float64 v01 = vx_load(src + j + VTraits::vlanes()); + v_float64 v10 = vx_load(src + j + 2 * VTraits::vlanes()); + v_float64 v11 = vx_load(src + j + 3 * VTraits::vlanes()); + r00 = v_fma(v00, v00, r00); r01 = v_fma(v01, v01, r01); + r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11); + } + s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11)); +#endif + for (; j < n; j++) { + double v = src[j]; + s += v * v; + } + return s; + } +}; + +#endif + +template int +normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormInf_SIMD op; + result = 
std::max(result, op(src, len*cn)); + } else { + for( int i = 0; i < len; i++, src += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + result = std::max(result, ST(cv_abs(src[k]))); + } + } + } + } + *_result = result; + return 0; +} + +template int +normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormL1_SIMD op; + result += op(src, len*cn); + } else { + for( int i = 0; i < len; i++, src += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + result += cv_abs(src[k]); + } + } + } + } + *_result = result; + return 0; +} + +template int +normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormL2_SIMD op; + result += op(src, len*cn); + } else { + for( int i = 0; i < len; i++, src += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + T v = src[k]; + result += (ST)v*v; + } + } + } + } + *_result = result; + return 0; +} + +#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ + static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \ +{ CV_INSTRUMENT_REGION(); return norm##L##_(src, mask, r, len, cn); } \ + +#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \ + CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \ + CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \ + CV_DEF_NORM_FUNC(L2, suffix, type, l2type) + +CV_DEF_NORM_ALL(8u, uchar, int, int, int) +CV_DEF_NORM_ALL(8s, schar, int, int, int) +CV_DEF_NORM_ALL(16u, ushort, int, int, double) +CV_DEF_NORM_ALL(16s, short, int, int, double) +CV_DEF_NORM_ALL(32s, int, int, double, double) +CV_DEF_NORM_ALL(32f, float, float, double, double) +CV_DEF_NORM_ALL(64f, double, double, double, double) + +NormFunc getNormFunc(int normType, int depth) +{ + CV_INSTRUMENT_REGION(); + static NormFunc normTab[3][8] = + { + { + (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s), + (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0 + }, + { + (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s), + (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0 + }, + { + (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s), + (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0 + } + }; + + return normTab[normType][depth]; +} + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +} // cv::
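A short usage sketch (not part of the patch): the kernels above stay behind the existing cv::norm() entry point, so callers see no API change; NORM_INF, NORM_L1 and NORM_L2 select rows 0, 1 and 2 of the new function table via normType >> 1. The hypothetical standalone program below exercises that path on a small continuous CV_32F matrix, which is the fast path reworked in norm.dispatch.cpp.

#include <opencv2/core.hpp>
#include <cstdio>

int main() {
    // Continuous, unmasked CV_32F data: cv::norm() calls the dispatched
    // NormFunc directly with (data, mask = 0, &result, len, cn = 1).
    cv::Mat m = (cv::Mat_<float>(1, 4) << 1.f, -2.f, 3.f, -4.f);
    double l1  = cv::norm(m, cv::NORM_L1);   // |1| + |-2| + |3| + |-4| = 10
    double l2  = cv::norm(m, cv::NORM_L2);   // sqrt(1 + 4 + 9 + 16) ~= 5.477
    double inf = cv::norm(m, cv::NORM_INF);  // max(|x|) = 4
    std::printf("L1=%.1f L2=%.3f INF=%.1f\n", l1, l2, inf);
    return 0;
}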