diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp
new file mode 100644
index 0000000000..8db03267e1
--- /dev/null
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp
@@ -0,0 +1,30 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
+#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv { namespace custom_intrin {
+
+#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
+    inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
+        _Tpvs mask = __riscv_vsra(v, shift, vl); \
+        _Tpvs v_xor = __riscv_vxor(v, mask, vl); \
+        return __riscv_vreinterpret_##suffix( \
+            __riscv_vsub(v_xor, mask, vl) \
+        ); \
+    }
+
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2)
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8)
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4)
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)
+
+}}} // cv::cv_hal_rvv::custom_intrin
+
+#endif
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp
index 260978f6ee..1e583f29da 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp
@@ -1,520 +1,1094 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
-
+//
 // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
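For reference, the CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS macro above is the vector form of the classic branchless two's-complement absolute value: an arithmetic right shift spreads the sign bit into an all-zeros or all-ones mask, and (v ^ mask) - mask then gives |v|, reinterpreted as the unsigned element type. A minimal scalar sketch of the same identity (illustrative only, not part of the patch; the function name is made up here):

#include <cstdint>
#include <cstdio>

// Branchless |x|: x >> 31 is 0 for x >= 0 and -1 (all ones) for x < 0, so
// (x ^ mask) - mask equals x or -x. Doing the xor/sub in unsigned arithmetic
// lets INT32_MIN wrap to 0x80000000, matching the vector code, which
// reinterprets the signed result as an unsigned vector.
static inline uint32_t abs_branchless_i32(int32_t x)
{
    uint32_t mask = (uint32_t)(x >> 31);
    return ((uint32_t)x ^ mask) - mask;
}

int main()
{
    std::printf("%u %u %u\n",
                abs_branchless_i32(7),          // 7
                abs_branchless_i32(-5),         // 5
                abs_branchless_i32(INT32_MIN)); // 2147483648
    return 0;
}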
#ifndef OPENCV_HAL_RVV_NORM_HPP_INCLUDED #define OPENCV_HAL_RVV_NORM_HPP_INCLUDED -#include +#include "common.hpp" namespace cv { namespace cv_hal_rvv { namespace norm { #undef cv_hal_norm #define cv_hal_norm cv::cv_hal_rvv::norm::norm -inline int normInf_8UC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m8(); - auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); +namespace { - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m8(width - j); - auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m8(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - vec_max = __riscv_vmaxu_tumu(bool_mask, vec_max, vec_max, vec_src, vl); +template +struct NormInf_RVV { + inline ST operator() (const T* src, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + s = std::max(s, (ST)cv_abs(src[i])); + } + return s; + } +}; + +template +struct NormL1_RVV { + inline ST operator() (const T* src, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + s += cv_abs(src[i]); + } + return s; + } +}; + +template +struct NormL2_RVV { + inline ST operator() (const T* src, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + ST v = src[i]; + s += v * v; + } + return s; + } +}; + +template<> +struct NormInf_RVV { + int operator() (const uchar* src, int n) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v = __riscv_vle8_v_u8m8(src + i, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct NormInf_RVV { + int operator() (const schar* src, int n) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v = __riscv_vle8_v_i8m8(src + i, vl); + s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct NormInf_RVV { + int operator() (const ushort* src, int n) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v = __riscv_vle16_v_u16m8(src + i, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct NormInf_RVV { + int operator() (const short* src, int n) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v = __riscv_vle16_v_i16m8(src + i, vl); + s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct NormInf_RVV { + int operator() (const int* src, int n) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vmv_v_x_u32m8(0, vlmax); + 
int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m8(n - i); + auto v = __riscv_vle32_v_i32m8(src + i, vl); + s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct NormInf_RVV { + float operator() (const float* src, int n) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vfmv_v_f_f32m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m8(n - i); + auto v = __riscv_vle32_v_f32m8(src + i, vl); + s = __riscv_vfmax_tu(s, s, __riscv_vfabs(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct NormInf_RVV { + double operator() (const double* src, int n) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e64m8(n - i); + auto v = __riscv_vle64_v_f64m8(src + i, vl); + s = __riscv_vfmax_tu(s, s, __riscv_vfabs(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL1_RVV { + int operator() (const uchar* src, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v = __riscv_vle8_v_u8m8(src + i, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormL1_RVV { + int operator() (const schar* src, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormL1_RVV { + int operator() (const ushort* src, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v = __riscv_vle16_v_u16m8(src + i, vl); + s = __riscv_vwredsumu(v, s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormL1_RVV { + int operator() (const short* src, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v = custom_intrin::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); + s = __riscv_vwredsumu(v, s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormL1_RVV { + double operator() (const int* src, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v = custom_intrin::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL1_RVV { + double operator() (const float* src, int n) 
const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v = __riscv_vfabs(__riscv_vle32_v_f32m4(src + i, vl), vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL1_RVV { + double operator() (const double* src, int n) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e64m8(n - i); + auto v = __riscv_vle64_v_f64m8(src + i, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfabs(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL2_RVV { + int operator() (const uchar* src, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m4(n - i); + auto v = __riscv_vle8_v_u8m4(src + i, vl); + s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormL2_RVV { + int operator() (const schar* src, int n) const { + auto s = __riscv_vmv_v_x_i32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m4(n - i); + auto v = __riscv_vle8_v_i8m4(src + i, vl); + s = __riscv_vwredsum(__riscv_vwmul(v, v, vl), s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormL2_RVV { + double operator() (const ushort* src, int n) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + auto v = __riscv_vle16_v_u16m2(src + i, vl); + auto v_mul = __riscv_vwmulu(v, v, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL2_RVV { + double operator() (const short* src, int n) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + auto v = __riscv_vle16_v_i16m2(src + i, vl); + auto v_mul = __riscv_vwmul(v, v, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct NormL2_RVV { + double operator() (const int* src, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v = __riscv_vle32_v_i32m4(src + i, vl); + auto v_mul = __riscv_vwmul(v, v, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfcvt_f(v_mul, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL2_RVV { + double operator() (const float* src, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v = __riscv_vle32_v_f32m4(src + i, vl); + auto v_mul = 
__riscv_vfwmul(v, v, vl); + s = __riscv_vfadd_tu(s, s, v_mul, vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormL2_RVV { + double operator() (const double* src, int n) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e64m8(n - i); + auto v = __riscv_vle64_v_f64m8(src + i, vl); + auto v_mul = __riscv_vfmul(v, v, vl); + s = __riscv_vfadd_tu(s, s, v_mul, vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +// Norm with mask + +template +struct MaskedNormInf_RVV { + inline ST operator() (const T* src, const uchar* mask, int len, int cn) const { + ST s = 0; + for( int i = 0; i < len; i++, src += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + s = std::max(s, ST(cv_abs(src[k]))); + } } } + return s; } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m8(width - j); - auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl); - vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); +}; + +template +struct MaskedNormL1_RVV { + inline ST operator() (const T* src, const uchar* mask, int len, int cn) const { + ST s = 0; + for( int i = 0; i < len; i++, src += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + s += cv_abs(src[k]); + } } } + return s; } - auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax); - sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); - *result = __riscv_vmv_x(sc_max); +}; - return CV_HAL_ERROR_OK; +template +struct MaskedNormL2_RVV { + inline ST operator() (const T* src, const uchar* mask, int len, int cn) const { + ST s = 0; + for( int i = 0; i < len; i++, src += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + T v = src[k]; + s += (ST)v*v; + } + } + } + return s; + } +}; + +template<> +struct MaskedNormInf_RVV { + int operator() (const uchar* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v = __riscv_vle8_v_u8m8(src + i, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } else if (cn == 4) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m2(len - i); + auto v = __riscv_vle8_v_u8m8(src + i * 4, vl * 4); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); + s = __riscv_vmaxu_tumu(b, s, s, v, vl * 4); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v = __riscv_vlse8_v_u8m8(src + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + int operator() (const uchar* src, const uchar* mask, int len, int cn) const { + auto s = 
__riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v = __riscv_vle8_v_u8m8(src + i, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + } else if (cn == 4) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m2(len - i); + auto v = __riscv_vle8_v_u8m8(src + i * 4, vl * 4); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl * 4), s, __riscv_vsetvlmax_e16m1()); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v = __riscv_vlse8_v_u8m8(src + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormL2_RVV { + int operator() (const uchar* src, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v = __riscv_vle8_v_u8m4(src + i, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); + } + } else if (cn == 4) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m1(len - i); + auto v = __riscv_vle8_v_u8m4(src + i * 4, vl * 4); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(__riscv_vreinterpret_u8m4(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl * 4), s, vl * 4); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v = __riscv_vlse8_v_u8m4(src + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); + } + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormInf_RVV { + int operator() (const schar* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v = __riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + int operator() (const schar* src, const uchar* mask, int len, int cn) const { + auto s = 
__riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v = custom_intrin::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormL2_RVV { + int operator() (const schar* src, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_i32m1(0, __riscv_vsetvlmax_e32m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v = __riscv_vlse8_v_i8m4(src + cn * i + cn_index, sizeof(schar) * cn, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsum(b, __riscv_vwmul(b, v, v, vl), s, vl); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormInf_RVV { + int operator() (const ushort* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m8(len - i); + auto v = __riscv_vlse16_v_u16m8(src + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + int operator() (const ushort* src, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v = __riscv_vlse16_v_u16m8(src + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu_tum(b, s, v, s, vl); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormL2_RVV { + double operator() (const ushort* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m2(len - i); + auto v = __riscv_vlse16_v_u16m2(src + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vwmulu(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v_mul, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormInf_RVV { + int operator() (const short* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m8(len - i); 
+ auto v = __riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + int operator() (const short* src, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v = custom_intrin::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu_tum(b, s, v, s, vl); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormL2_RVV { + double operator() (const short* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m2(len - i); + auto v = __riscv_vlse16_v_i16m2(src + cn * i + cn_index, sizeof(short) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vwmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v_mul, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct MaskedNormInf_RVV { + int operator() (const int* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vmv_v_x_u32m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m8(len - i); + auto v = __riscv_vlse32_v_i32m8(src + cn * i + cn_index, sizeof(int) * cn, vl); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + double operator() (const int* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v = __riscv_vlse32_v_i32m4(src + cn * i + cn_index, sizeof(int) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, custom_intrin::__riscv_vabs(v, vl), vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL2_RVV { + double operator() (const int* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m2(len - i); + auto v = __riscv_vlse32_v_i32m4(src 
+ cn * i + cn_index, sizeof(int) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vwmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfcvt_f(b, v_mul, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormInf_RVV { + float operator() (const float* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vfmv_v_f_f32m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m8(len - i); + auto v = __riscv_vle32_v_f32m8(src + i, vl); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfmax_tumu(b, s, s, __riscv_vfabs(v, vl), vl); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m8(len - i); + auto v = __riscv_vlse32_v_f32m8(src + cn * i + cn_index, sizeof(float) * cn, vl); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfmax_tumu(b, s, s, __riscv_vfabs(v, vl), vl); + } + } + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + double operator() (const float* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v = __riscv_vle32_v_f32m4(src + i, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, __riscv_vfabs(v, vl), vl), vl); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v = __riscv_vlse32_v_f32m4(src + cn * i + cn_index, sizeof(float) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, __riscv_vfabs(v, vl), vl), vl); + } + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL2_RVV { + double operator() (const float* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v = __riscv_vle32_v_f32m4(src + i, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vfwmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, v_mul, vl); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v = __riscv_vlse32_v_f32m4(src + cn * i + cn_index, sizeof(float) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vfwmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, v_mul, vl); + } + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormInf_RVV { + 
double operator() (const double* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e64m8(len - i); + auto v = __riscv_vlse64_v_f64m8(src + cn * i + cn_index, sizeof(double) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfmax_tumu(b, s, s, __riscv_vfabs(v, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL1_RVV { + double operator() (const double* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e64m8(len - i); + auto v = __riscv_vlse64_v_f64m8(src + cn * i + cn_index, sizeof(double) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfabs(v, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormL2_RVV { + double operator() (const double* src, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e64m8(len - i); + auto v = __riscv_vlse64_v_f64m8(src + cn * i + cn_index, sizeof(double) * cn, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vfmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, v_mul, vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template int +normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormInf_RVV op; + result = std::max(result, op(src, len*cn)); + } else { + MaskedNormInf_RVV op; + result = std::max(result, op(src, mask, len, cn)); + } + *_result = result; + return 0; } -inline int normL1_8UC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask, vec_src, vl); - vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl); - } - } +template int +normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormL1_RVV op; + result += op(src, len*cn); + } else { + MaskedNormL1_RVV op; + result += op(src, mask, len, cn); } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - int vl; - for 
(int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_zext = __riscv_vzext_vf4(vec_src, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - auto sc_sum = __riscv_vmv_s_x_u32m1(0, vlmax); - sc_sum = __riscv_vredsum(vec_sum, sc_sum, vlmax); - *result = __riscv_vmv_x(sc_sum); - - return CV_HAL_ERROR_OK; + *_result = result; + return 0; } -inline int normL2Sqr_8UC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - int cnt = 0; - auto reduce = [&](int vl) { - if ((cnt += vl) < (1 << 16)) - return; - cnt = vl; - for (int i = 0; i < vlmax; i++) - { - *result += __riscv_vmv_x(vec_sum); - vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); - } - vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - }; - - *result = 0; - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - reduce(vl); - - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask, vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask, vec_mul, vl); - vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl); - } - } +template int +normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormL2_RVV op; + result += op(src, len*cn); + } else { + MaskedNormL2_RVV op; + result += op(src, mask, len, cn); } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - reduce(vl); - - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2(vec_mul, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - reduce(1 << 16); - - return CV_HAL_ERROR_OK; + *_result = result; + return 0; } -inline int normInf_8UC4(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m8(); - auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); +#define CV_HAL_RVV_DEF_NORM_FUNC(L, suffix, type, ntype) \ + static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \ + { return norm##L##_(src, mask, r, len, cn); } - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - const uchar* mask_row = mask + i * mask_step; - int vl, vlm; - for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) - { - vl = __riscv_vsetvl_e8m8(width * 4 - j); - vlm = __riscv_vsetvl_e8m2(width - jm); - auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + jm, vlm); - auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); - auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m8(vec_mask_ext), 0, vl); - vec_max = __riscv_vmaxu_tumu(bool_mask_ext, vec_max, vec_max, vec_src, vl); - } - } - } - else - { - for (int 
i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - int vl; - for (int j = 0; j < width * 4; j += vl) - { - vl = __riscv_vsetvl_e8m8(width * 4 - j); - auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl); - vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); - } - } - } - auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax); - sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); - *result = __riscv_vmv_x(sc_max); +#define CV_HAL_RVV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \ + CV_HAL_RVV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \ + CV_HAL_RVV_DEF_NORM_FUNC(L1, suffix, type, l1type) \ + CV_HAL_RVV_DEF_NORM_FUNC(L2, suffix, type, l2type) + +CV_HAL_RVV_DEF_NORM_ALL(8u, uchar, int, int, int) +CV_HAL_RVV_DEF_NORM_ALL(8s, schar, int, int, int) +CV_HAL_RVV_DEF_NORM_ALL(16u, ushort, int, int, double) +CV_HAL_RVV_DEF_NORM_ALL(16s, short, int, int, double) +CV_HAL_RVV_DEF_NORM_ALL(32s, int, int, double, double) +CV_HAL_RVV_DEF_NORM_ALL(32f, float, float, double, double) +CV_HAL_RVV_DEF_NORM_ALL(64f, double, double, double, double) - return CV_HAL_ERROR_OK; -} - -inline int normL1_8UC4(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - const uchar* mask_row = mask + i * mask_step; - int vl, vlm; - for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - vlm = __riscv_vsetvl_e8mf2(width - jm); - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm); - auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); - auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl); - auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask_ext, vec_src, vl); - vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - int vl; - for (int j = 0; j < width * 4; j += vl) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_zext = __riscv_vzext_vf4(vec_src, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - auto sc_sum = __riscv_vmv_s_x_u32m1(0, vlmax); - sc_sum = __riscv_vredsum(vec_sum, sc_sum, vlmax); - *result = __riscv_vmv_x(sc_sum); - - return CV_HAL_ERROR_OK; -} - -inline int normL2Sqr_8UC4(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - int cnt = 0; - auto reduce = [&](int vl) { - if ((cnt += vl) < (1 << 16)) - return; - cnt = vl; - for (int i = 0; i < vlmax; i++) - { - *result += __riscv_vmv_x(vec_sum); - vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); - } - vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - }; - - *result = 0; - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - const uchar* mask_row = mask + i * mask_step; - int vl, vlm; - for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - vlm = __riscv_vsetvl_e8mf2(width - jm); - reduce(vl); - - auto vec_src = 
__riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm); - auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); - auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl); - auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask_ext, vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask_ext, vec_mul, vl); - vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src_row = src + i * src_step; - int vl; - for (int j = 0; j < width * 4; j += vl) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - reduce(vl); - - auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl); - auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2(vec_mul, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - reduce(1 << 16); - - return CV_HAL_ERROR_OK; -} - -inline int normInf_32FC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e32m8(); - auto vec_max = __riscv_vfmv_v_f_f32m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const float* src_row = reinterpret_cast(src + i * src_step); - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m8(width - j); - auto vec_src = __riscv_vle32_v_f32m8(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_abs = __riscv_vfabs_v_f32m8_m(bool_mask, vec_src, vl); - vec_max = __riscv_vfmax_tumu(bool_mask, vec_max, vec_max, vec_abs, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const float* src_row = reinterpret_cast(src + i * src_step); - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m8(width - j); - auto vec_src = __riscv_vle32_v_f32m8(src_row + j, vl); - auto vec_abs = __riscv_vfabs(vec_src, vl); - vec_max = __riscv_vfmax_tu(vec_max, vec_max, vec_abs, vl); - } - } - } - auto sc_max = __riscv_vfmv_s_f_f32m1(0, vlmax); - sc_max = __riscv_vfredmax(vec_max, sc_max, vlmax); - *result = __riscv_vfmv_f(sc_max); - - return CV_HAL_ERROR_OK; -} - -inline int normL1_32FC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e32m4(); - auto vec_sum = __riscv_vfmv_v_f_f64m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const float* src_row = reinterpret_cast(src + i * src_step); - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m1(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_abs = __riscv_vfabs_v_f32m4_m(bool_mask, vec_src, vl); - auto vec_fext = __riscv_vfwcvt_f_f_v_f64m8_m(bool_mask, vec_abs, vl); - vec_sum = __riscv_vfadd_tumu(bool_mask, vec_sum, vec_sum, vec_fext, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const float* src_row = reinterpret_cast(src + i * src_step); - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl); - auto vec_abs = 
__riscv_vfabs(vec_src, vl); - auto vec_fext = __riscv_vfwcvt_f_f_v_f64m8(vec_abs, vl); - vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, vec_fext, vl); - } - } - } - auto sc_sum = __riscv_vfmv_s_f_f64m1(0, vlmax); - sc_sum = __riscv_vfredosum(vec_sum, sc_sum, vlmax); - *result = __riscv_vfmv_f(sc_sum); - - return CV_HAL_ERROR_OK; -} - -inline int normL2Sqr_32FC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e32m4(); - auto vec_sum = __riscv_vfmv_v_f_f64m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const float* src_row = reinterpret_cast(src + i * src_step); - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m1(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_mul = __riscv_vfwmul_vv_f64m8_m(bool_mask, vec_src, vec_src, vl); - vec_sum = __riscv_vfadd_tumu(bool_mask, vec_sum, vec_sum, vec_mul, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const float* src_row = reinterpret_cast(src + i * src_step); - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl); - auto vec_mul = __riscv_vfwmul(vec_src, vec_src, vl); - vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, vec_mul, vl); - } - } - } - auto sc_sum = __riscv_vfmv_s_f_f64m1(0, vlmax); - sc_sum = __riscv_vfredosum(vec_sum, sc_sum, vlmax); - *result = __riscv_vfmv_f(sc_sum); - - return CV_HAL_ERROR_OK; } +using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int); inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, - int height, int type, int norm_type, double* result) -{ - if (!result) - return CV_HAL_ERROR_OK; + int height, int type, int norm_type, double* result) { + int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - switch (type) - { - case CV_8UC1: - switch (norm_type) - { - case NORM_INF: - return normInf_8UC1(src, src_step, mask, mask_step, width, height, result); - case NORM_L1: - return normL1_8UC1(src, src_step, mask, mask_step, width, height, result); - case NORM_L2SQR: - return normL2Sqr_8UC1(src, src_step, mask, mask_step, width, height, result); - case NORM_L2: - int ret = normL2Sqr_8UC1(src, src_step, mask, mask_step, width, height, result); - *result = std::sqrt(*result); - return ret; - } - return CV_HAL_ERROR_NOT_IMPLEMENTED; - case CV_8UC4: - switch (norm_type) - { - case NORM_INF: - return normInf_8UC4(src, src_step, mask, mask_step, width, height, result); - case NORM_L1: - return normL1_8UC4(src, src_step, mask, mask_step, width, height, result); - case NORM_L2SQR: - return normL2Sqr_8UC4(src, src_step, mask, mask_step, width, height, result); - case NORM_L2: - int ret = normL2Sqr_8UC4(src, src_step, mask, mask_step, width, height, result); - *result = std::sqrt(*result); - return ret; - } - return CV_HAL_ERROR_NOT_IMPLEMENTED; - case CV_32FC1: - switch (norm_type) - { - case NORM_INF: - return normInf_32FC1(src, src_step, mask, mask_step, width, height, result); - case NORM_L1: - return normL1_32FC1(src, src_step, mask, mask_step, width, height, result); - case NORM_L2SQR: - return normL2Sqr_32FC1(src, src_step, mask, mask_step, width, height, result); - case NORM_L2: - int ret = normL2Sqr_32FC1(src, src_step, mask, 
mask_step, width, height, result); - *result = std::sqrt(*result); - return ret; - } + if (result == nullptr || depth == CV_16F || norm_type > NORM_L2SQR) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } - return CV_HAL_ERROR_NOT_IMPLEMENTED; + // [FIXME] append 0's when merging to 5.x + static NormFunc norm_tab[3][CV_DEPTH_MAX] = { + { + (NormFunc)(normInf_8u), (NormFunc)(normInf_8s), + (NormFunc)(normInf_16u), (NormFunc)(normInf_16s), + (NormFunc)(normInf_32s), (NormFunc)(normInf_32f), + (NormFunc)(normInf_64f), 0, + }, + { + (NormFunc)(normL1_8u), (NormFunc)(normL1_8s), + (NormFunc)(normL1_16u), (NormFunc)(normL1_16s), + (NormFunc)(normL1_32s), (NormFunc)(normL1_32f), + (NormFunc)(normL1_64f), 0, + }, + { + (NormFunc)(normL2_8u), (NormFunc)(normL2_8s), + (NormFunc)(normL2_16u), (NormFunc)(normL2_16s), + (NormFunc)(normL2_32s), (NormFunc)(normL2_32f), + (NormFunc)(normL2_64f), 0, + }, + }; + + static const size_t elem_size_tab[CV_DEPTH_MAX] = { + sizeof(uchar), sizeof(schar), + sizeof(ushort), sizeof(short), + sizeof(int), sizeof(float), + sizeof(int64_t), 0, + }; + + bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1)); + bool mask_continuous = (mask_step == width); + size_t nplanes = 1; + size_t size = width * height; + if ((mask && (!src_continuous || !mask_continuous)) || !src_continuous) { + nplanes = height; + size = width; + } + + NormFunc func = norm_tab[norm_type >> 1][depth]; + if (func == nullptr) { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + // Handle overflow + union { + double d; + int i; + float f; + } res; + res.d = 0; + if ((norm_type == NORM_L1 && depth <= CV_16S) || + ((norm_type == NORM_L2 || norm_type == NORM_L2SQR) && depth <= CV_8S)) { + const size_t esz = elem_size_tab[depth] * cn; + const int total = (int)size; + const int intSumBlockSize = (norm_type == NORM_L1 && depth <= CV_8S ? 
(1 << 23) : (1 << 15))/cn; + const int blockSize = std::min(total, intSumBlockSize); + int isum = 0; + int count = 0; + auto _src = src; + auto _mask = mask; + for (size_t i = 0; i < nplanes; i++) { + if ((mask && (!src_continuous || !mask_continuous)) || !src_continuous) { + _src = src + src_step * i; + _mask = mask + mask_step * i; + } + for (int j = 0; j < total; j += blockSize) { + int bsz = std::min(total - j, blockSize); + func(_src, _mask, (uchar*)&isum, bsz, cn); + count += bsz; + if (count + blockSize >= intSumBlockSize || (i + 1 >= nplanes && j + bsz >= total)) { + res.d += isum; + isum = 0; + count = 0; + } + _src += bsz * esz; + if (mask) { + _mask += bsz; + } + } + } + } else { + auto _src = src; + auto _mask = mask; + for (size_t i = 0; i < nplanes; i++) { + if ((mask && (!src_continuous || !mask_continuous)) || !src_continuous) { + _src = src + src_step * i; + _mask = mask + mask_step * i; + } + func(_src, _mask, (uchar*)&res, (int)size, cn); + } + } + + if (norm_type == NORM_INF) { + if (depth == CV_64F) { + *result = res.d; + } else if (depth == CV_32F) { + *result = res.f; + } else { + *result = res.i; + } + } else if (norm_type == NORM_L2) { + *result = std::sqrt(res.d); + } else { + *result = res.d; + } + + return CV_HAL_ERROR_OK; } -}}} +}}} // cv::cv_hal_rvv::norm #endif diff --git a/modules/core/perf/perf_norm.cpp b/modules/core/perf/perf_norm.cpp index 8bcf9ea224..c47398f8fc 100644 --- a/modules/core/perf/perf_norm.cpp +++ b/modules/core/perf/perf_norm.cpp @@ -14,7 +14,7 @@ typedef perf::TestBaseWithParam Size_MatType_NormType; PERF_TEST_P(Size_MatType_NormType, norm, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(TYPICAL_MAT_TYPES), + testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1), testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2) ) ) @@ -36,7 +36,7 @@ PERF_TEST_P(Size_MatType_NormType, norm, PERF_TEST_P(Size_MatType_NormType, norm_mask, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(TYPICAL_MAT_TYPES), + testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1), testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2) ) ) diff --git a/modules/core/src/norm.rvv1p0.hpp b/modules/core/src/norm.rvv1p0.hpp deleted file mode 100644 index 3db05c50a4..0000000000 --- a/modules/core/src/norm.rvv1p0.hpp +++ /dev/null @@ -1,200 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copytright (C) 2025, SpaceMIT Inc., all rights reserved. - -#include "opencv2/core/hal/intrin.hpp" - -namespace cv { - -namespace { - -// [TODO] Drop this until rvv has dedicated intrinsics for abs on integers. 
-template inline ST __riscv_vabs(const T&); - -template<> inline -vuint8m1_t __riscv_vabs(const vint8m1_t& v) { - const int vle8m1 = __riscv_vsetvlmax_e8m1(); - vint8m1_t mask = __riscv_vsra_vx_i8m1(v, 7, vle8m1); - vint8m1_t v_xor = __riscv_vxor_vv_i8m1(v, mask, vle8m1); - return __riscv_vreinterpret_v_i8m1_u8m1( - __riscv_vsub_vv_i8m1(v_xor, mask, vle8m1) - ); -} - -template<> inline -vuint16m1_t __riscv_vabs(const vint16m1_t& v) { - const int vle16m1 = __riscv_vsetvlmax_e16m1(); - vint16m1_t mask = __riscv_vsra_vx_i16m1(v, 15, vle16m1); - vint16m1_t v_xor = __riscv_vxor_vv_i16m1(v, mask, vle16m1); - return __riscv_vreinterpret_v_i16m1_u16m1( - __riscv_vsub_vv_i16m1(v_xor, mask, vle16m1) - ); -} - -template<> inline -vuint32m1_t __riscv_vabs(const vint32m1_t& v) { - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vint32m1_t mask = __riscv_vsra_vx_i32m1(v, 31, vle32m1); - vint32m1_t v_xor = __riscv_vxor_vv_i32m1(v, mask, vle32m1); - return __riscv_vreinterpret_v_i32m1_u32m1( - __riscv_vsub_vv_i32m1(v_xor, mask, vle32m1) - ); -} -} - -CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN - -template inline -ST normInf_rvv(const T* src, int n, int& j); - -template<> inline -int normInf_rvv(const int* src, int n, int& j) { - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); - vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); - for (; j <= n - 2 * vle32m1; j += 2 * vle32m1) { - vuint32m1_t v0 = __riscv_vabs(__riscv_vle32_v_i32m1(src + j, vle32m1)); - r0 = __riscv_vmaxu(r0, v0, vle32m1); - - vuint32m1_t v1 = __riscv_vabs(__riscv_vle32_v_i32m1(src + j + vle32m1, vle32m1)); - r1 = __riscv_vmaxu(r1, v1, vle32m1); - } - r0 = __riscv_vmaxu(r0, r1, vle32m1); - return (int)__riscv_vmv_x(__riscv_vredmaxu(r0, __riscv_vmv_v_x_u32m1(0, vle32m1), vle32m1)); -} - -template inline -ST normL1_rvv(const T* src, int n, int& j); - -template<> inline -int normL1_rvv(const schar* src, int n, int& j) { - const int vle8m1 = __riscv_vsetvlmax_e8m1(); - const int vle16m1 = __riscv_vsetvlmax_e16m1(); - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); - vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); - vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vle16m1); - for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) { - vuint8m1_t v0 = __riscv_vabs(__riscv_vle8_v_i8m1(src + j, vle8m1)); - vuint16m1_t u0 = __riscv_vwredsumu_tu(zero, v0, zero, vle8m1); - r0 = __riscv_vwredsumu(u0, r0, vle16m1); - - vuint8m1_t v1 = __riscv_vabs(__riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1)); - vuint16m1_t u1 = __riscv_vwredsumu_tu(zero, v1, zero, vle8m1); - r1 = __riscv_vwredsumu(u1, r1, vle16m1); - } - return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); -} - -template<> inline -int normL1_rvv(const ushort* src, int n, int& j) { - const int vle16m1 = __riscv_vsetvlmax_e16m1(); - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); - vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); - for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) { - vuint16m1_t v0 = __riscv_vle16_v_u16m1(src + j, vle16m1); - r0 = __riscv_vwredsumu(v0, r0, vle16m1); - - vuint16m1_t v1 = __riscv_vle16_v_u16m1(src + j + vle16m1, vle16m1); - r1 = __riscv_vwredsumu(v1, r1, vle16m1); - } - return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); -} - -template<> inline -int normL1_rvv(const short* src, int n, int& j) { - const int vle16m1 = __riscv_vsetvlmax_e16m1(); - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vuint32m1_t r0 = 
__riscv_vmv_v_x_u32m1(0, vle32m1); - vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); - for (; j<= n - 2 * vle16m1; j += 2 * vle16m1) { - vuint16m1_t v0 = __riscv_vabs(__riscv_vle16_v_i16m1(src + j, vle16m1)); - r0 = __riscv_vwredsumu(v0, r0, vle16m1); - - vuint16m1_t v1 = __riscv_vabs(__riscv_vle16_v_i16m1(src + j + vle16m1, vle16m1)); - r1 = __riscv_vwredsumu(v1, r1, vle16m1); - } - return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); -} - -template<> inline -double normL1_rvv(const double* src, int n, int& j) { - const int vle64m1 = __riscv_vsetvlmax_e64m1(); - vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); - vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); - for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) { - vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1); - v0 = __riscv_vfabs(v0, vle64m1); - r0 = __riscv_vfadd(r0, v0, vle64m1); - - vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1); - v1 = __riscv_vfabs(v1, vle64m1); - r1 = __riscv_vfadd(r1, v1, vle64m1); - } - r0 = __riscv_vfadd(r0, r1, vle64m1); - return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1)); -} - -template inline -ST normL2_rvv(const T* src, int n, int& j); - -template<> inline -int normL2_rvv(const uchar* src, int n, int& j) { - const int vle8m1 = __riscv_vsetvlmax_e8m1(); - const int vle16m1 = __riscv_vsetvlmax_e16m1(); - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1); - vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1); - for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) { - vuint8m1_t v0 = __riscv_vle8_v_u8m1(src + j, vle8m1); - vuint16m2_t u0 = __riscv_vwmulu(v0, v0, vle8m1); - r0 = __riscv_vwredsumu(u0, r0, vle16m1 * 2); - - vuint8m1_t v1 = __riscv_vle8_v_u8m1(src + j + vle8m1, vle8m1); - vuint16m2_t u1 = __riscv_vwmulu(v1, v1, vle8m1); - r1 = __riscv_vwredsumu(u1, r1, vle16m1 * 2); - } - return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); -} - -template<> inline -int normL2_rvv(const schar* src, int n, int& j) { - const int vle8m1 = __riscv_vsetvlmax_e8m1(); - const int vle16m1 = __riscv_vsetvlmax_e16m1(); - const int vle32m1 = __riscv_vsetvlmax_e32m1(); - vint32m1_t r0 = __riscv_vmv_v_x_i32m1(0, vle32m1); - vint32m1_t r1 = __riscv_vmv_v_x_i32m1(0, vle32m1); - for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) { - vint8m1_t v0 = __riscv_vle8_v_i8m1(src + j, vle8m1); - vint16m2_t u0 = __riscv_vwmul(v0, v0, vle8m1); - r0 = __riscv_vwredsum(u0, r0, vle16m1 * 2); - - vint8m1_t v1 = __riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1); - vint16m2_t u1 = __riscv_vwmul(v1, v1, vle8m1); - r1 = __riscv_vwredsum(u1, r1, vle16m1 * 2); - } - return __riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1)); -} - -template<> inline -double normL2_rvv(const double* src, int n, int& j) { - const int vle64m1 = __riscv_vsetvlmax_e64m1(); - vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); - vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1); - for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) { - vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1); - r0 = __riscv_vfmacc(r0, v0, v0, vle64m1); - - vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1); - r1 = __riscv_vfmacc(r1, v1, v1, vle64m1); - } - r0 = __riscv_vfadd(r0, r1, vle64m1); - return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1)); -} - -CV_CPU_OPTIMIZATION_NAMESPACE_END - -} // cv:: diff --git a/modules/core/src/norm.simd.hpp b/modules/core/src/norm.simd.hpp index c2b72d2e13..0c3cd5d995 100644 
--- a/modules/core/src/norm.simd.hpp
+++ b/modules/core/src/norm.simd.hpp
@@ -4,10 +4,6 @@
 
 #include "precomp.hpp"
 
-#if CV_RVV
-#include "norm.rvv1p0.hpp"
-#endif
-
 namespace cv {
 
 using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);
@@ -181,9 +177,6 @@ struct NormInf_SIMD<int, int> {
     int operator() (const int* src, int n) const {
         int j = 0;
         int s = 0;
-#if CV_RVV
-        s = normInf_rvv(src, n, j);
-#else
         v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
         v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
         for (; j <= n - 4 * VTraits<v_int32>::vlanes(); j += 4 * VTraits<v_int32>::vlanes()) {
@@ -194,7 +187,6 @@ struct NormInf_SIMD<int, int> {
         }
         r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
         s = std::max(s, saturate_cast<int>(v_reduce_max(r0)));
-#endif
         for (; j < n; j++) {
             s = std::max(s, cv_abs(src[j]));
         }
@@ -250,9 +242,6 @@ struct NormL1_SIMD<schar, int> {
     int operator() (const schar* src, int n) const {
         int j = 0;
         int s = 0;
-#if CV_RVV
-        s = normL1_rvv(src, n, j);
-#else
         v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
         v_uint8 one = vx_setall_u8(1);
         for (; j<= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
@@ -263,7 +252,6 @@ struct NormL1_SIMD<schar, int> {
             r1 = v_dotprod_expand_fast(v1, one, r1);
         }
         s += v_reduce_sum(v_add(r0, r1));
-#endif
         for (; j < n; j++) {
             s += saturate_cast<int>(cv_abs(src[j]));
         }
@@ -276,9 +264,6 @@ struct NormL1_SIMD<ushort, int> {
     int operator() (const ushort* src, int n) const {
         int j = 0;
         int s = 0;
-#if CV_RVV
-        s = normL1_rvv(src, n, j);
-#else
         v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
         v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
         for (; j<= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
@@ -295,7 +280,6 @@ struct NormL1_SIMD<ushort, int> {
             r11 = v_add(r11, v11);
         }
         s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
-#endif
         for (; j < n; j++) {
             s += src[j];
         }
@@ -308,9 +292,6 @@ struct NormL1_SIMD<short, int> {
     int operator() (const short* src, int n) const {
         int j = 0;
         int s = 0;
-#if CV_RVV
-        s = normL1_rvv(src, n, j);
-#else
         v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
         v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
         for (; j<= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
@@ -327,7 +308,6 @@ struct NormL1_SIMD<short, int> {
             r11 = v_add(r11, v11);
         }
         s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
-#endif
         for (; j < n; j++) {
             s += saturate_cast<int>(cv_abs(src[j]));
         }
@@ -340,9 +320,6 @@ struct NormL2_SIMD<uchar, int> {
     int operator() (const uchar* src, int n) const {
         int j = 0;
         int s = 0;
-#if CV_RVV
-        s = normL2_rvv(src, n, j);
-#else
         v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
         for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
             v_uint8 v0 = vx_load(src + j);
@@ -352,7 +329,6 @@ struct NormL2_SIMD<uchar, int> {
             r1 = v_dotprod_expand_fast(v1, v1, r1);
         }
         s += v_reduce_sum(v_add(r0, r1));
-#endif
         for (; j < n; j++) {
             int v = saturate_cast<int>(src[j]);
             s += v * v;
@@ -366,9 +342,6 @@ struct NormL2_SIMD<schar, int> {
     int operator() (const schar* src, int n) const {
         int j = 0;
         int s = 0;
-#if CV_RVV
-        s = normL2_rvv(src, n, j);
-#else
         v_int32 r0 = vx_setzero_s32(), r1 = vx_setzero_s32();
         for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
             v_int8 v0 = vx_load(src + j);
@@ -377,7 +350,6 @@ struct NormL2_SIMD<schar, int> {
             r1 = v_dotprod_expand_fast(v1, v1, r1);
         }
         s += v_reduce_sum(v_add(r0, r1));
-#endif
         for (; j < n; j++) {
             int v = saturate_cast<int>(src[j]);
             s += v * v;
@@ -825,31 +797,6 @@ struct NormL1_SIMD {
     }
 };
 
-template<>
-struct NormL1_SIMD<double, double> {
-    double operator() (const double* src, int n) const {
-        int j = 0;
-        double s = 0.f;
-#if CV_RVV // This is introduced to workaround the accuracy issue on ci
-        s = normL1_rvv(src, n, j);
-#else
-        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
-        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
-        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
-            r00 = v_add(r00, v_abs(vx_load(src + j                                   )));
-            r01 = v_add(r01, v_abs(vx_load(src + j +     VTraits<v_float64>::vlanes())));
-            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
-            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
-        }
-        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
-#endif
-        for (; j < n; j++) {
-            s += cv_abs(src[j]);
-        }
-        return s;
-    }
-};
-
 template<>
 struct NormL2_SIMD<ushort, double> {
     double operator() (const ushort* src, int n) const {
@@ -941,14 +888,36 @@ struct NormL2_SIMD {
     }
 };
 
+#endif
+
+#if CV_SIMD_64F // CV_SIMD_SCALABLE_64F has accuracy problem with the following kernels on ci
+
+template<>
+struct NormL1_SIMD<double, double> {
+    double operator() (const double* src, int n) const {
+        int j = 0;
+        double s = 0.f;
+        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
+        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
+        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
+            r00 = v_add(r00, v_abs(vx_load(src + j                                   )));
+            r01 = v_add(r01, v_abs(vx_load(src + j +     VTraits<v_float64>::vlanes())));
+            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
+            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
+        }
+        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
+        for (; j < n; j++) {
+            s += cv_abs(src[j]);
+        }
+        return s;
+    }
+};
+
 template<>
 struct NormL2_SIMD<double, double> {
     double operator() (const double* src, int n) const {
         int j = 0;
         double s = 0.f;
-#if CV_RVV // This is introduced to workaround the accuracy issue on ci
-        s = normL2_rvv(src, n, j);
-#else
         v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
         v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
         for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
@@ -960,7 +929,6 @@ struct NormL2_SIMD<double, double> {
             r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
         }
         s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
-#endif
         for (; j < n; j++) {
             double v = src[j];
             s += v * v;
@@ -1362,7 +1330,9 @@ CV_DEF_NORM_ALL(64f, double, double, double, double)
 NormFunc getNormFunc(int normType, int depth)
 {
     CV_INSTRUMENT_REGION();
-    static NormFunc normTab[3][8] =
+
+    // [FIXME] append 0's when merging to 5.x
+    static NormFunc normTab[3][CV_DEPTH_MAX] =
     {
         {
             (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),