diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a7ad2799..150a018f8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -996,9 +996,9 @@ if(WITH_NDSRVP) endif() if(WITH_HAL_RVV) - ocv_debug_message(STATUS "Enable HAL RVV acceleration") - if(NOT ";${OpenCV_HAL};" MATCHES ";halrvv;") - set(OpenCV_HAL "halrvv;${OpenCV_HAL}") + ocv_debug_message(STATUS "Enable RVV HAL acceleration") + if(NOT ";${OpenCV_HAL};" MATCHES ";rvvhal;") + set(OpenCV_HAL "rvvhal;${OpenCV_HAL}") endif() endif() @@ -1031,13 +1031,13 @@ foreach(hal ${OpenCV_HAL}) else() message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...") endif() - elseif(hal STREQUAL "halrvv") + elseif(hal STREQUAL "rvvhal") if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;") add_subdirectory(hal/riscv-rvv) ocv_hal_register(RVV_HAL_LIBRARIES RVV_HAL_HEADERS RVV_HAL_INCLUDE_DIRS) - list(APPEND OpenCV_USED_HAL "HAL RVV (ver ${RVV_HAL_VERSION})") + list(APPEND OpenCV_USED_HAL "RVV HAL (ver ${RVV_HAL_VERSION})") else() - message(STATUS "HAL RVV: RVV is not available, disabling halrvv...") + message(STATUS "RVV HAL: RVV is not available, disabling RVV HAL...") endif() elseif(hal STREQUAL "ipp") add_subdirectory(hal/ipp) diff --git a/hal/riscv-rvv/CMakeLists.txt b/hal/riscv-rvv/CMakeLists.txt index 8c19800053..a0c9e628b3 100644 --- a/hal/riscv-rvv/CMakeLists.txt +++ b/hal/riscv-rvv/CMakeLists.txt @@ -1,9 +1,26 @@ cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR) -set(HAL_LIB_NAME "") +set(RVV_HAL_INCLUDE_DIR include) +set(RVV_HAL_SOURCE_DIR src) + +file(GLOB rvv_hal_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_INCLUDE_DIR}/*.hpp") +file(GLOB rvv_hal_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_SOURCE_DIR}/**/*.cpp") + +set(HAL_LIB_NAME "rvv_hal") +add_library(${HAL_LIB_NAME} STATIC) +target_sources(${HAL_LIB_NAME} PRIVATE ${rvv_hal_headers} ${rvv_hal_sources}) + +set_target_properties(${HAL_LIB_NAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY 
${3P_LIBRARY_OUTPUT_PATH}) +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(${HAL_LIB_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() +target_include_directories(${HAL_LIB_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/modules/core/include + ${CMAKE_SOURCE_DIR}/modules/imgproc/include) # ${CMAKE_SOURCE_DIR}/modules/features2d/include set(RVV_HAL_FOUND TRUE CACHE INTERNAL "") set(RVV_HAL_VERSION "0.0.1" CACHE INTERNAL "") set(RVV_HAL_LIBRARIES ${HAL_LIB_NAME} CACHE INTERNAL "") -set(RVV_HAL_HEADERS "hal_rvv.hpp" CACHE INTERNAL "") -set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_SOURCE_DIR}/modules/imgproc/include" CACHE INTERNAL "") +set(RVV_HAL_HEADERS "rvv_hal.hpp" CACHE INTERNAL "") +set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "") diff --git a/hal/riscv-rvv/hal_rvv.hpp b/hal/riscv-rvv/hal_rvv.hpp deleted file mode 100644 index 8fe78bd8b9..0000000000 --- a/hal/riscv-rvv/hal_rvv.hpp +++ /dev/null @@ -1,65 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. 
- -#ifndef OPENCV_HAL_RVV_HPP_INCLUDED -#define OPENCV_HAL_RVV_HPP_INCLUDED - -#include "opencv2/core/base.hpp" -#include "opencv2/core/hal/interface.h" -#include "opencv2/imgproc/hal/interface.h" - -#ifndef CV_HAL_RVV_071_ENABLED -# if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) && defined(__riscv_v) && __riscv_v == 7000 -# define CV_HAL_RVV_071_ENABLED 1 -# else -# define CV_HAL_RVV_071_ENABLED 0 -# endif -#endif - -#if CV_HAL_RVV_071_ENABLED -#include "version/hal_rvv_071.hpp" -#endif - -#if defined(__riscv_v) && __riscv_v == 1000000 -#include "hal_rvv_1p0/types.hpp" -#include "hal_rvv_1p0/merge.hpp" // core -#include "hal_rvv_1p0/mean.hpp" // core -#include "hal_rvv_1p0/dxt.hpp" // core -#include "hal_rvv_1p0/norm.hpp" // core -#include "hal_rvv_1p0/norm_diff.hpp" // core -#include "hal_rvv_1p0/norm_hamming.hpp" // core -#include "hal_rvv_1p0/convert_scale.hpp" // core -#include "hal_rvv_1p0/minmax.hpp" // core -#include "hal_rvv_1p0/atan.hpp" // core -#include "hal_rvv_1p0/split.hpp" // core -#include "hal_rvv_1p0/magnitude.hpp" // core -#include "hal_rvv_1p0/cart_to_polar.hpp" // core -#include "hal_rvv_1p0/polar_to_cart.hpp" // core -#include "hal_rvv_1p0/flip.hpp" // core -#include "hal_rvv_1p0/lut.hpp" // core -#include "hal_rvv_1p0/exp.hpp" // core -#include "hal_rvv_1p0/log.hpp" // core -#include "hal_rvv_1p0/lu.hpp" // core -#include "hal_rvv_1p0/cholesky.hpp" // core -#include "hal_rvv_1p0/qr.hpp" // core -#include "hal_rvv_1p0/svd.hpp" // core -#include "hal_rvv_1p0/sqrt.hpp" // core -#include "hal_rvv_1p0/copy_mask.hpp" // core -#include "hal_rvv_1p0/div.hpp" // core -#include "hal_rvv_1p0/dotprod.hpp" // core -#include "hal_rvv_1p0/compare.hpp" // core -#include "hal_rvv_1p0/transpose.hpp" // core - -#include "hal_rvv_1p0/moments.hpp" // imgproc -#include "hal_rvv_1p0/filter.hpp" // imgproc -#include "hal_rvv_1p0/pyramids.hpp" // imgproc -#include "hal_rvv_1p0/color.hpp" // imgproc -#include 
"hal_rvv_1p0/warp.hpp" // imgproc -#include "hal_rvv_1p0/thresh.hpp" // imgproc -#include "hal_rvv_1p0/histogram.hpp" // imgproc -#include "hal_rvv_1p0/resize.hpp" // imgproc -#include "hal_rvv_1p0/integral.hpp" // imgproc -#endif - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/atan.hpp b/hal/riscv-rvv/hal_rvv_1p0/atan.hpp deleted file mode 100644 index b864fea2c1..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/atan.hpp +++ /dev/null @@ -1,128 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_ATAN_HPP_INCLUDED -#define OPENCV_HAL_RVV_ATAN_HPP_INCLUDED - -#undef cv_hal_fastAtan32f -#define cv_hal_fastAtan32f cv::cv_hal_rvv::fast_atan_32 - -#undef cv_hal_fastAtan64f -#define cv_hal_fastAtan64f cv::cv_hal_rvv::fast_atan_64 - -#include - -#include - -namespace cv { namespace cv_hal_rvv { - -namespace detail { -// ref: mathfuncs_core.simd.hpp -static constexpr float pi = CV_PI; - -struct AtanParams -{ - float p1, p3, p5, p7, angle_90; -}; - -static constexpr AtanParams atan_params_rad { - 0.9997878412794807F, - -0.3258083974640975F, - 0.1555786518463281F, - -0.04432655554792128F, - 90.F * (pi / 180.F)}; -static constexpr AtanParams atan_params_deg { - atan_params_rad.p1 * (180 / pi), - atan_params_rad.p3 * (180 / pi), - atan_params_rad.p5 * (180 / pi), - atan_params_rad.p7 * (180 / pi), - 90.F}; - -template -__attribute__((always_inline)) inline VEC_T - rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params) -{ - const auto ax = __riscv_vfabs(vx, vl); - const auto ay = __riscv_vfabs(vy, vl); - // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. 
- const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), - __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), - vl); - const auto c2 = __riscv_vfmul(c, c, vl); - - // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional - // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased - // from 5.952ms to 5.805ms on Muse Pi) - // Additionally, when registers are nearly fully utilized (though not yet exhausted), the - // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in - // cv::cv_hal_rvv::fast_atan_64). - // Saving registers can also make this function more reusable in other contexts. - // Therefore, vfmadd is not used here. - auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); - a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); - a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); - a = __riscv_vfmul(a, c, vl); - - a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); - - return a; -} - -} // namespace detail - -inline int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) -{ - auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad; - - for (size_t vl = 0; n > 0; n -= vl) - { - vl = __riscv_vsetvl_e32m4(n); - - auto vy = __riscv_vle32_v_f32m4(y, vl); - auto vx = __riscv_vle32_v_f32m4(x, vl); - - auto a = detail::rvv_atan(vy, vx, vl, atan_params); - - __riscv_vse32(dst, a, vl); - - x += vl; - y += vl; - dst += vl; - } - - return CV_HAL_ERROR_OK; -} - -inline int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) -{ - // this also uses float32 version, ref: mathfuncs_core.simd.hpp - - auto atan_params = angle_in_deg ? 
detail::atan_params_deg : detail::atan_params_rad; - - for (size_t vl = 0; n > 0; n -= vl) - { - vl = __riscv_vsetvl_e64m8(n); - - auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); - auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); - - auto a = detail::rvv_atan(vy, vx, vl, atan_params); - - __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); - - x += vl; - y += vl; - dst += vl; - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif //OPENCV_HAL_RVV_ATAN_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/common.hpp b/hal/riscv-rvv/hal_rvv_1p0/common.hpp deleted file mode 100644 index 9fc01d2897..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/common.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -// Third party copyrights are property of their respective owners. 
- -#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED -#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { namespace custom_intrin { - -#define CV_HAL_RVV_NOOP(a) (a) - -#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ - inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ - _Tpvs mask = __riscv_vsra(v, shift, vl); \ - _Tpvs v_xor = __riscv_vxor(v, mask, vl); \ - return __riscv_vreinterpret_##suffix( \ - __riscv_vsub(v_xor, mask, vl) \ - ); \ - } - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) - -#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ - inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ - return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ - } - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) 
-CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) - -}}} // cv::cv_hal_rvv::custom_intrin - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/div.hpp b/hal/riscv-rvv/hal_rvv_1p0/div.hpp deleted file mode 100644 index ccbeb6403d..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/div.hpp +++ /dev/null @@ -1,268 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -// Third party copyrights are property of their respective owners. - -#ifndef OPENCV_HAL_RVV_DIV_HPP_INCLUDED -#define OPENCV_HAL_RVV_DIV_HPP_INCLUDED - -#include -#include - -namespace cv { namespace cv_hal_rvv { namespace div { - -namespace { - - inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); } - - inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); } - inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); } - inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); } - inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); } - inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); } - inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); } - - inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } - inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } - inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } - inline void vse(int16_t 
*p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } - inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } - inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } - - inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } - inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } - inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } - inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } - - inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } - inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } - inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } - inline vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } - - template inline - VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) { - return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl); - } - template <> inline - vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) { - auto f1 = __riscv_vfcvt_f(v1, vl); - auto f2 = __riscv_vfcvt_f(v2, vl); - auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); - return __riscv_vfcvt_x(res, vl); - } - template <> inline - vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) { - auto f1 = __riscv_vfcvt_f(v1, vl); - auto f2 = __riscv_vfcvt_f(v2, vl); - auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); - return __riscv_vfcvt_xu(res, vl); - } - - template inline - VT recip_sat(const VT &v, const float scale, const int vl) { - return nclip(recip_sat(ext(v, vl), scale, vl), 
vl); - } - template <> inline - vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) { - auto f = __riscv_vfcvt_f(v, vl); - auto res = __riscv_vfrdiv(f, scale, vl); - return __riscv_vfcvt_x(res, vl); - } - template <> inline - vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) { - auto f = __riscv_vfcvt_f(v, vl); - auto res = __riscv_vfrdiv(f, scale, vl); - return __riscv_vfcvt_xu(res, vl); - } - -} // anonymous - -#undef cv_hal_div8u -#define cv_hal_div8u cv::cv_hal_rvv::div::div -#undef cv_hal_div8s -#define cv_hal_div8s cv::cv_hal_rvv::div::div -#undef cv_hal_div16u -#define cv_hal_div16u cv::cv_hal_rvv::div::div -#undef cv_hal_div16s -#define cv_hal_div16s cv::cv_hal_rvv::div::div -#undef cv_hal_div32s -#define cv_hal_div32s cv::cv_hal_rvv::div::div -#undef cv_hal_div32f -#define cv_hal_div32f cv::cv_hal_rvv::div::div -// #undef cv_hal_div64f -// #define cv_hal_div64f cv::cv_hal_rvv::div::div - -template inline -int div(const ST *src1, size_t step1, const ST *src2, size_t step2, - ST *dst, size_t step, int width, int height, float scale) { - if (scale == 0.f || - (scale * static_cast(std::numeric_limits::max())) < 1.f && - (scale * static_cast(std::numeric_limits::max())) > -1.f) { - for (int h = 0; h < height; h++) { - ST *dst_h = reinterpret_cast((uchar*)dst + h * step); - std::memset(dst_h, 0, sizeof(ST) * width); - } - return CV_HAL_ERROR_OK; - } - - for (int h = 0; h < height; h++) { - const ST *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const ST *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - ST *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - auto mask = __riscv_vmseq(v2, 0, vl); - vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -template <> inline -int 
div(const float *src1, size_t step1, const float *src2, size_t step2, - float *dst, size_t step, int width, int height, float scale) { - if (scale == 0.f) { - for (int h = 0; h < height; h++) { - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - std::memset(dst_h, 0, sizeof(float) * width); - } - return CV_HAL_ERROR_OK; - } - - if (std::fabs(scale - 1.f) < FLT_EPSILON) { - for (int h = 0; h < height; h++) { - const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl); - } - } - } else { - for (int h = 0; h < height; h++) { - const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -#undef cv_hal_recip8u -#define cv_hal_recip8u cv::cv_hal_rvv::div::recip -#undef cv_hal_recip8s -#define cv_hal_recip8s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip16u -#define cv_hal_recip16u cv::cv_hal_rvv::div::recip -#undef cv_hal_recip16s -#define cv_hal_recip16s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip32s -#define cv_hal_recip32s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip32f -#define cv_hal_recip32f cv::cv_hal_rvv::div::recip -// #undef cv_hal_recip64f -// #define cv_hal_recip64f cv::cv_hal_rvv::div::recip - -template inline -int recip(const ST *src_data, size_t src_step, ST *dst_data, 
size_t dst_step, - int width, int height, float scale) { - if (scale == 0.f || scale < 1.f && scale > -1.f) { - for (int h = 0; h < height; h++) { - ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - std::memset(dst_h, 0, sizeof(ST) * width); - } - return CV_HAL_ERROR_OK; - } - - for (int h = 0; h < height; h++) { - const ST *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - auto mask = __riscv_vmseq(v, 0, vl); - vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -template <> inline -int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, - int width, int height, float scale) { - if (scale == 0.f) { - for (int h = 0; h < height; h++) { - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - std::memset(dst_h, 0, sizeof(float) * width); - } - return CV_HAL_ERROR_OK; - } - - if (std::fabs(scale - 1.f) < FLT_EPSILON) { - for (int h = 0; h < height; h++) { - const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl); - } - } - } else { - for (int h = 0; h < height; h++) { - const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -}}} // cv::cv_hal_rvv::div - -#endif // OPENCV_HAL_RVV_DIV_HPP_INCLUDED diff --git 
a/hal/riscv-rvv/hal_rvv_1p0/filter.hpp b/hal/riscv-rvv/hal_rvv_1p0/filter.hpp deleted file mode 100644 index 85949137e3..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/filter.hpp +++ /dev/null @@ -1,2553 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_FILTER_HPP_INCLUDED -#define OPENCV_HAL_RVV_FILTER_HPP_INCLUDED - -#include - -struct cvhalFilter2D; - -namespace cv { namespace cv_hal_rvv { - -namespace filter { -#undef cv_hal_filterInit -#undef cv_hal_filter -#undef cv_hal_filterFree -#define cv_hal_filterInit cv::cv_hal_rvv::filter::filterInit -#define cv_hal_filter cv::cv_hal_rvv::filter::filter -#define cv_hal_filterFree cv::cv_hal_rvv::filter::filterFree - -class FilterInvoker : public ParallelLoopBody -{ -public: - template - FilterInvoker(std::function _func, Args&&... args) - { - func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); - } - - virtual void operator()(const Range& range) const override - { - func(range.start, range.end); - } - -private: - std::function func; -}; - -template -static inline int invoke(int height, std::function func, Args&&... args) -{ - cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); - return func(0, 1, std::forward(args)...); -} - -static inline int borderInterpolate( int p, int len, int borderType ) -{ - if ((unsigned)p < (unsigned)len) - ; - else if (borderType == BORDER_REPLICATE) - p = p < 0 ? 
0 : len - 1; - else if (borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101) - { - int delta = borderType == BORDER_REFLECT_101; - if (len == 1) - return 0; - do - { - if (p < 0) - p = -p - 1 + delta; - else - p = len - 1 - (p - len) - delta; - } - while( (unsigned)p >= (unsigned)len ); - } - else if (borderType == BORDER_WRAP) - { - if (p < 0) - p -= ((p-len+1)/len)*len; - if (p >= len) - p %= len; - } - else if (borderType == BORDER_CONSTANT) - p = -1; - return p; -} - -struct Filter2D -{ - const uchar* kernel_data; - size_t kernel_step; - int kernel_type; - int kernel_width; - int kernel_height; - int src_type; - int dst_type; - int borderType; - double delta; - int anchor_x; - int anchor_y; -}; - -inline int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/) -{ - if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != kernel_height) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != 3 && kernel_width != 5) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; - *context = reinterpret_cast(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); - return CV_HAL_ERROR_OK; -} - -static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst) -{ - int vl; - for (int i = left; i < right; i += vl) - { - vl = __riscv_vsetvl_e8m1(right - i); - auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); - - auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { - a = __riscv_vfmacc(a, k0, b, vl); - b = __riscv_vfslide1down(b, r1, vl); - a = __riscv_vfmacc(a, k1, b, vl); - b = __riscv_vfslide1down(b, r2, vl); - return __riscv_vfmacc(a, k2, b, vl); - }; - auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { - if (!row) return; - - const uchar* extra = row + (i - anchor) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); - auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); - auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); - auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); - - extra += vl * 4; - s0 = addshift(s0, v0, k0, k1, k2, extra[0], extra[4]); - s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]); - s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]); - s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - vuint8m1x4_t val{}; - val = 
__riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst + i * 4, val, vl); - } -} - -static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst) -{ - int vl; - for (int i = left; i < right; i += vl) - { - vl = __riscv_vsetvl_e8m1(right - i); - auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); - - auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { - a = __riscv_vfmacc(a, k0, b, vl); - b = __riscv_vfslide1down(b, r1, vl); - a = __riscv_vfmacc(a, k1, b, vl); - b = __riscv_vfslide1down(b, r2, vl); - a = __riscv_vfmacc(a, k2, b, vl); - b = __riscv_vfslide1down(b, r3, vl); - a = __riscv_vfmacc(a, k3, b, vl); - b = __riscv_vfslide1down(b, r4, vl); - return __riscv_vfmacc(a, k4, b, vl); - }; - auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { - if (!row) return; - - const uchar* extra = row + (i - anchor) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); - auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); - auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); - auto v3 = 
__riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); - - extra += vl * 4; - s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]); - s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]); - s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]); - s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]); - }; - - loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]); - loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]); - loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]); - loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]); - loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]); - vuint8m1x4_t val{}; - val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst + i * 4, val, vl); - } -} - -// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp, -// in the function void CAROTENE_NS::convolution -template -static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - float kernel[ksize * ksize]; - for (int i = 0; i < ksize * ksize; i++) - { - kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; - } - - constexpr int noval = std::numeric_limits::max(); - auto access = [&](int x, int y) { - int pi, pj; - if 
(data->borderType & BORDER_ISOLATED) - { - pi = borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pj = borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - pj = pj < 0 ? noval : pj; - } - else - { - pi = borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pj = borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - pj = pj < 0 ? noval : pj - offset_x; - } - return std::make_pair(pi, pj); - }; - - auto process = [&](int x, int y) { - float sum0, sum1, sum2, sum3; - sum0 = sum1 = sum2 = sum3 = data->delta; - for (int i = 0; i < ksize * ksize; i++) - { - auto p = access(x + i / ksize, y + i % ksize); - if (p.first != noval && p.second != noval) - { - sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; - sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; - sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2]; - sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3]; - } - } - dst_data[(x * width + y) * 4 ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits::max())); - }; - - const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); - for (int i = start; i < end; i++) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - const uchar* row0 = access(i , 0).first == noval ? 
nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; - if (ksize == 3) - { - process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); - } - else - { - const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; - const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step; - process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - Filter2D* data = reinterpret_cast(context); - std::vector dst(width * height * 4); - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->kernel_width) - { - case 3: - res = invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); - break; - case 5: - res = invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); - break; - } - - for (int i = 0; i < height; i++) - memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4); - return res; -} - -inline int filterFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::filter - -namespace sepFilter { -#undef cv_hal_sepFilterInit -#undef cv_hal_sepFilter -#undef cv_hal_sepFilterFree -#define cv_hal_sepFilterInit cv::cv_hal_rvv::sepFilter::sepFilterInit -#define cv_hal_sepFilter cv::cv_hal_rvv::sepFilter::sepFilter 
-#define cv_hal_sepFilterFree cv::cv_hal_rvv::sepFilter::sepFilterFree - -struct sepFilter2D -{ - int src_type; - int dst_type; - int kernel_type; - const uchar* kernelx_data; - int kernelx_length; - const uchar* kernely_data; - int kernely_length; - int anchor_x; - int anchor_y; - double delta; - int borderType; -}; - -inline int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) -{ - if (kernel_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_type != CV_8UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (dst_type != CV_16SC1 && dst_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((kernelx_length != 3 && kernelx_length != 5) || kernelx_length != kernely_length) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; - anchor_y = anchor_y < 0 ? 
kernely_length / 2 : anchor_y; - *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp, -// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic -template -static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi; - if (data->borderType & BORDER_ISOLATED) - { - pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - } - else - { - pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - } - return pi; - }; - auto accessY = [&](int y) { - int pj; - if (data->borderType & BORDER_ISOLATED) - { - pj = filter::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); - pj = pj < 0 ? noval : pj; - } - else - { - pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); - pj = pj < 0 ? 
noval : pj - offset_x; - } - return pj; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - const float* kx = reinterpret_cast(data->kernelx_data); - const float* ky = reinterpret_cast(data->kernely_data); - std::vector res(width * ksize); - auto process = [&](int x, int y) { - float sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += kx[i] * reinterpret_cast(src_data + x * src_step)[p]; - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); - for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m2(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - data->anchor_x; - vfloat32m8_t src; - if (std::is_same::value) - { - src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast(extra), vl), vl), vl); - } - else if (std::is_same::value) - { - src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast(extra), vl), vl); - } - else - { - src = __riscv_vle32_v_f32m8(reinterpret_cast(extra), vl); - } - - extra += vl; - auto sum = __riscv_vfmul(src, kx[0], vl); - src = __riscv_vfslide1down(src, extra[0], vl); - sum = __riscv_vfmacc(sum, kx[1], src, vl); - src = __riscv_vfslide1down(src, extra[1], vl); - sum = __riscv_vfmacc(sum, kx[2], src, vl); - if (ksize == 5) - { - src = __riscv_vfslide1down(src, extra[2], vl); - sum = __riscv_vfmacc(sum, kx[3], src, vl); - src = __riscv_vfslide1down(src, extra[3], vl); - sum = __riscv_vfmacc(sum, kx[4], src, vl); - } - __riscv_vse32(res.data() + p2idx(i, j), sum, vl); - } - } - 
} - - int cur = i - (ksize - 1 - data->anchor_y); - if (cur >= start) - { - const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const float* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); - - if (ksize == 5) - { - auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v4 = row4 ? 
__riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); - } - - if (data->dst_type == CV_16SC1) - { - __riscv_vse16(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); - } - else - { - __riscv_vse32(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - sepFilter2D* data = reinterpret_cast(context); - - uchar* _dst_data = dst_data; - size_t _dst_step = dst_step; - const size_t size = CV_ELEM_SIZE(data->dst_type); - std::vector dst; - if (src_data == _dst_data) - { - dst = std::vector(width * height * size); - dst_data = dst.data(); - dst_step = width * size; - } - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->kernelx_length*100 + data->src_type) - { - case 300 + CV_8UC1: - res = filter::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_8UC1: - res = filter::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 300 + CV_16SC1: - res = filter::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_16SC1: - res = filter::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 300 + CV_32FC1: - res = filter::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - 
break; - case 500 + CV_32FC1: - res = filter::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - } - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - if (src_data == _dst_data) - { - for (int i = 0; i < height; i++) - memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); - } - - return res; -} - -inline int sepFilterFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::sepFilter - -namespace morph { -#undef cv_hal_morphInit -#undef cv_hal_morph -#undef cv_hal_morphFree -#define cv_hal_morphInit cv::cv_hal_rvv::morph::morphInit -#define cv_hal_morph cv::cv_hal_rvv::morph::morph -#define cv_hal_morphFree cv::cv_hal_rvv::morph::morphFree - -struct Morph2D -{ - int operation; - int src_type; - int dst_type; - int kernel_type; - uchar* kernel_data; - size_t kernel_step; - int kernel_width; - int kernel_height; - int anchor_x; - int anchor_y; - int borderType; - const uchar* borderValue; -}; - -inline int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/) -{ - if (kernel_type != CV_8UC1 || src_type != dst_type) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_type != CV_8UC1 && src_type != CV_8UC4) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != kernel_height || kernel_width != 3) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (iterations != 1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - 
return CV_HAL_ERROR_NOT_IMPLEMENTED; - - uchar* borderV; - if (src_type == CV_8UC1) - { - borderV = new uchar{static_cast(borderValue[0])}; - if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) - borderV[0] = 0; - } - else - { - borderV = new uchar[4]{static_cast(borderValue[0]), static_cast(borderValue[1]), static_cast(borderValue[2]), static_cast(borderValue[3])}; - if (operation == CV_HAL_MORPH_DILATE) - { - if (borderValue[0] == DBL_MAX) - borderV[0] = 0; - if (borderValue[1] == DBL_MAX) - borderV[1] = 0; - if (borderValue[2] == DBL_MAX) - borderV[2] = 0; - if (borderValue[3] == DBL_MAX) - borderV[3] = 0; - } - } - - anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? kernel_height / 2 : anchor_y; - *context = reinterpret_cast(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV}); - return CV_HAL_ERROR_OK; -} - -template struct rvv; -template<> struct rvv -{ - static inline uchar init() { return std::numeric_limits::max(); } - static inline uchar mop(uchar a, uchar b) { return a < b ? a : b; } - static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vminu(a, b, c); } - static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vminu(a, b, c); } -}; -template<> struct rvv -{ - static inline uchar init() { return std::numeric_limits::min(); } - static inline uchar mop(uchar a, uchar b) { return a > b ? 
a : b; } - static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vmaxu(a, b, c); } - static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vmaxu(a, b, c); } -}; - -// the algorithm is copied from 3rdparty/carotene/src/morph.cpp, -// in the function template void morph3x3 -template -static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - bool kernel[9]; - for (int i = 0; i < 9; i++) - { - kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0; - } - - constexpr int noval = std::numeric_limits::max(); - auto access = [&](int x, int y) { - int pi, pj; - if (data->borderType & BORDER_ISOLATED) - { - pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pj = filter::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - pj = pj < 0 ? noval : pj; - } - else - { - pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - pj = pj < 0 ? 
noval : pj - offset_x; - } - return std::make_pair(pi, pj); - }; - - auto process = [&](int x, int y) { - if (data->src_type == CV_8UC1) - { - uchar val = rvv::init(); - for (int i = 0; i < 9; i++) - { - if (kernel[i]) - { - auto p = access(x + i / 3, y + i % 3); - if (p.first != noval && p.second != noval) - { - val = rvv::mop(val, src_data[p.first * src_step + p.second]); - } - else - { - val = rvv::mop(val, data->borderValue[0]); - } - } - } - dst_data[x * width + y] = val; - } - else - { - uchar val0, val1, val2, val3; - val0 = val1 = val2 = val3 = rvv::init(); - for (int i = 0; i < 9; i++) - { - if (kernel[i]) - { - auto p = access(x + i / 3, y + i % 3); - if (p.first != noval && p.second != noval) - { - val0 = rvv::mop(val0, src_data[p.first * src_step + p.second * 4 ]); - val1 = rvv::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]); - val2 = rvv::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]); - val3 = rvv::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]); - } - else - { - val0 = rvv::mop(val0, data->borderValue[0]); - val1 = rvv::mop(val1, data->borderValue[1]); - val2 = rvv::mop(val2, data->borderValue[2]); - val3 = rvv::mop(val3, data->borderValue[3]); - } - } - } - dst_data[(x * width + y) * 4 ] = val0; - dst_data[(x * width + y) * 4 + 1] = val1; - dst_data[(x * width + y) * 4 + 2] = val2; - dst_data[(x * width + y) * 4 + 3] = val3; - } - }; - - const int left = data->anchor_x, right = width - (2 - data->anchor_x); - for (int i = start; i < end; i++) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first == noval ? 
nullptr : src_data + access(i + 2, 0).first * src_step; - if (data->src_type == CV_8UC1) - { - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m4(right - j); - auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { - if (!row) - { - m0 = rvv::vop(m0, data->borderValue[0], vl); - return; - } - - const uchar* extra = row + j - data->anchor_x; - auto v0 = __riscv_vle8_v_u8m4(extra, vl); - - if (k0) m0 = rvv::vop(m0, v0, vl); - v0 = __riscv_vslide1down(v0, extra[vl], vl); - if (k1) m0 = rvv::vop(m0, v0, vl); - if (!k2) return; - v0 = __riscv_vslide1down(v0, extra[vl + 1], vl); - m0 = rvv::vop(m0, v0, vl); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - __riscv_vse8(dst_data + i * width + j, m0, vl); - } - } - else - { - int vl, vl0, vl1; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m4(right - j); - vl0 = std::min(vl, (int)__riscv_vlenb() * 2); - vl1 = vl - vl0; - auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m1 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m2 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m3 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - - auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { - if (k0) a = rvv::vop(a, b, vl); - b = __riscv_vslide1down(b, r1, vl); - if (k1) a = rvv::vop(a, b, vl); - if (!k2) return a; - b = __riscv_vslide1down(b, r2, vl); - return rvv::vop(a, b, vl); - }; - auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { - if (!row) - { - m0 = rvv::vop(m0, data->borderValue[0], vl); - m1 = rvv::vop(m1, data->borderValue[1], vl); - m2 = rvv::vop(m2, data->borderValue[2], vl); - m3 = rvv::vop(m3, data->borderValue[3], vl); - return; - } - - vuint8m4_t v0{}, v1{}, v2{}, v3{}; - const uchar* extra = row + (j - data->anchor_x) * 4; - auto src = 
__riscv_vlseg4e8_v_u8m2x4(extra, vl0); - v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0)); - v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1)); - v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2)); - v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3)); - src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); - v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0)); - v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1)); - v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2)); - v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3)); - - extra += vl * 4; - m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]); - m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]); - m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]); - m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - vuint8m2x4_t val{}; - val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0)); - __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0); - val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 1)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 1)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1)); - __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, 
uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/) -{ - Morph2D* data = reinterpret_cast(context); - int cn = data->src_type == CV_8UC1 ? 1 : 4; - std::vector dst(width * height * cn); - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->operation) - { - case CV_HAL_MORPH_ERODE: - res = filter::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); - break; - case CV_HAL_MORPH_DILATE: - res = filter::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); - break; - } - - for (int i = 0; i < height; i++) - memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn); - return res; -} - -inline int morphFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context)->borderValue; - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::morph - -namespace gaussianBlurBinomial { -#undef cv_hal_gaussianBlurBinomial -#define cv_hal_gaussianBlurBinomial cv::cv_hal_rvv::gaussianBlurBinomial::gaussianBlurBinomial - -// the algorithm is same as cv_hal_sepFilter -template -static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) -{ - using T = typename helperT::ElemType; - using WT = typename helperWT::ElemType; - - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); - return pi < 0 ? 
noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; - std::vector res(width * ksize); - auto process = [&](int x, int y) { - WT sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += kernel[ksize == 5][i] * static_cast(reinterpret_cast(src_data + x * src_step)[p]); - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = ksize / 2, right = width - ksize / 2; - for (int i = start - ksize / 2; i < end + ksize / 2; i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = helperT::setvl(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - ksize / 2; - auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl); - - extra += vl; - auto sum = src; - if (ksize == 3) - { - src = __riscv_vslide1down(src, extra[0], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl); - src = __riscv_vslide1down(src, extra[1], vl); - sum = __riscv_vadd(sum, src, vl); - } - else - { - src = __riscv_vslide1down(src, extra[0], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); - src = __riscv_vslide1down(src, extra[1], vl); - sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl); - src = __riscv_vslide1down(src, extra[2], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); - src = __riscv_vslide1down(src, extra[3], vl); - sum = __riscv_vadd(sum, src, vl); - } - helperWT::vstore(res.data() + p2idx(i, j), 
sum, vl); - } - } - } - - int cur = i - ksize / 2; - if (cur >= start) - { - const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const WT* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = helperWT::setvl(width - j); - auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); - auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl); - auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl); - typename helperWT::VecType sum; - if (ksize == 3) - { - sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl); - } - else - { - sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl); - auto v3 = row3 ? helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl); - sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl); - auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl); - sum = __riscv_vadd(sum, v4, vl); - } - helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 
8 : 4, __RISCV_VXRM_RNU, vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; }; - - constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; - std::vector res(width * ksize * 4); - auto process = [&](int x, int y) { - ushort sum0, sum1, sum2, sum3; - sum0 = sum1 = sum2 = sum3 = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum0 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 ]); - sum1 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 1]); - sum2 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 2]); - sum3 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 3]); - } - } - res[p2idx(x, y) ] = sum0; - res[p2idx(x, y) + 1] = sum1; - res[p2idx(x, y) + 2] = sum2; - res[p2idx(x, y) + 3] = sum3; - }; - - const int left = ksize / 2, right = width - ksize / 2; - for (int i = start - ksize / 2; i < end + ksize / 2; i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = 
__riscv_vsetvl_e8m1(right - j); - const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl); - auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl); - auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl); - auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl); - - extra += vl * 4; - auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3; - if (ksize == 3) - { - src0 = __riscv_vslide1down(src0, extra[0], vl); - src1 = __riscv_vslide1down(src1, extra[1], vl); - src2 = __riscv_vslide1down(src2, extra[2], vl); - src3 = __riscv_vslide1down(src3, extra[3], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl); - src0 = __riscv_vslide1down(src0, extra[4], vl); - src1 = __riscv_vslide1down(src1, extra[5], vl); - src2 = __riscv_vslide1down(src2, extra[6], vl); - src3 = __riscv_vslide1down(src3, extra[7], vl); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - else - { - src0 = __riscv_vslide1down(src0, extra[0], vl); - src1 = __riscv_vslide1down(src1, extra[1], vl); - src2 = __riscv_vslide1down(src2, extra[2], vl); - src3 = __riscv_vslide1down(src3, extra[3], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - src0 = __riscv_vslide1down(src0, extra[4], vl); - src1 = __riscv_vslide1down(src1, extra[5], vl); - src2 = __riscv_vslide1down(src2, extra[6], vl); - src3 = __riscv_vslide1down(src3, extra[7], vl); - 
sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl); - src0 = __riscv_vslide1down(src0, extra[ 8], vl); - src1 = __riscv_vslide1down(src1, extra[ 9], vl); - src2 = __riscv_vslide1down(src2, extra[10], vl); - src3 = __riscv_vslide1down(src3, extra[11], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - src0 = __riscv_vslide1down(src0, extra[12], vl); - src1 = __riscv_vslide1down(src1, extra[13], vl); - src2 = __riscv_vslide1down(src2, extra[14], vl); - src3 = __riscv_vslide1down(src3, extra[15], vl); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - - vuint16m2x4_t dst{}; - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3); - __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl); - } - } - } - - int cur = i - ksize / 2; - if (cur >= start) - { - const ushort* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const ushort* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e16m2(width - j); - vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{}; - sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl); - - auto loadres = [&](const ushort* row) { - auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl); - src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0); - src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1); - src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2); - src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3); - }; - if (row0) - { - loadres(row0); - sum0 = src0; - sum1 = src1; - sum2 = src2; - sum3 = src3; - } - if (row1) - { - loadres(row1); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 
2 : 1, vl), vl); - } - if (row2) - { - loadres(row2); - if (ksize == 5) - { - src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl); - src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl); - src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl); - src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl); - } - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - if (row3) - { - loadres(row3); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - } - if (row4) - { - loadres(row4); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - - vuint8m1x4_t dst{}; - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 
8 : 4, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) -{ - const int type = CV_MAKETYPE(depth, cn); - if ((type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1) || src_data == dst_data) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((ksize != 3 && ksize != 5) || border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - switch (ksize*100 + type) - { - case 300 + CV_8UC1: - return filter::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_8UC1: - return filter::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 300 + CV_16UC1: - return filter::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_16UC1: - return filter::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 300 + CV_8UC4: - return filter::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, 
border_type); - case 500 + CV_8UC4: - return filter::invoke(height, {gaussianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::gaussianBlurBinomial - -namespace medianBlur { -#undef cv_hal_medianBlur -#define cv_hal_medianBlur cv::cv_hal_rvv::medianBlur::medianBlur - -// the algorithm is copied from imgproc/src/median_blur.simd.cpp -// in the function template static void medianBlur_SortNet -template -static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - using T = typename helper::ElemType; - using VT = typename helper::VecType; - - for (int i = start; i < end; i++) - { - const T* row0 = reinterpret_cast(src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step); - const T* row1 = reinterpret_cast(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step); - const T* row2 = reinterpret_cast(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step); - const T* row3 = reinterpret_cast(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step); - const T* row4 = reinterpret_cast(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step); - int vl; - auto vop = [&vl](VT& a, VT& b) { - auto t = a; - a = helper::vmin(a, b, vl); - b = helper::vmax(t, b, vl); - }; - - for (int j = 0; j < width; j += vl) - { - vl = helper::setvl(width - j); - if (ksize == 3) - { - VT p0, p1, p2; - VT p3, p4, p5; - VT p6, p7, p8; - if (j != 0) - { - p0 = helper::vload(row0 + j - 1, vl); - p3 = helper::vload(row1 + j - 1, vl); - p6 = helper::vload(row2 + j - 1, vl); - } - else - { - p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); - p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); - p6 = 
helper::vslide1up(helper::vload(row2, vl), row2[0], vl); - } - p1 = helper::vslide1down(p0, row0[j + vl - 1], vl); - p4 = helper::vslide1down(p3, row1[j + vl - 1], vl); - p7 = helper::vslide1down(p6, row2[j + vl - 1], vl); - p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl); - p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl); - p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p4, vl); - } - else - { - VT p0, p1, p2, p3, p4; - VT p5, p6, p7, p8, p9; - VT p10, p11, p12, p13, p14; - VT p15, p16, p17, p18, p19; - VT p20, p21, p22, p23, p24; - if (j >= 2) - { - p0 = helper::vload(row0 + j - 2, vl); - p5 = helper::vload(row1 + j - 2, vl); - p10 = helper::vload(row2 + j - 2, vl); - p15 = helper::vload(row3 + j - 2, vl); - p20 = helper::vload(row4 + j - 2, vl); - } - else - { - p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); - p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); - p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); - p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl); - p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl); - if (j == 0) - { - p0 = helper::vslide1up(p0, row0[0], vl); - p5 = helper::vslide1up(p5, row1[0], vl); - p10 = helper::vslide1up(p10, row2[0], vl); - p15 = helper::vslide1up(p15, row3[0], vl); - p20 = helper::vslide1up(p20, row4[0], vl); - } - } - p1 = helper::vslide1down(p0, row0[j + vl - 2], vl); - p6 = helper::vslide1down(p5, row1[j + vl - 2], vl); - p11 = helper::vslide1down(p10, row2[j + vl - 2], vl); - p16 = helper::vslide1down(p15, row3[j + vl - 2], vl); - p21 = helper::vslide1down(p20, row4[j + vl - 2], 
vl); - p2 = helper::vslide1down(p1, row0[j + vl - 1], vl); - p7 = helper::vslide1down(p6, row1[j + vl - 1], vl); - p12 = helper::vslide1down(p11, row2[j + vl - 1], vl); - p17 = helper::vslide1down(p16, row3[j + vl - 1], vl); - p22 = helper::vslide1down(p21, row4[j + vl - 1], vl); - p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl); - p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl); - p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl); - p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl); - p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl); - p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl); - p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl); - p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl); - p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl); - p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl); - - vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); - vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); - vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); - vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); - vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); - vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); - vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); - vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); - vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); - vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); - vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); - vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); - vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); - vop(p22, p24); vop(p19, p20); vop(p21, 
p22); vop(p23, p24); vop(p12, p18); - vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16); - vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); - vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); - vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); - vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); - vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); - vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); - vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); - vop(p7, p11); vop(p11, p13); vop(p11, p12); - helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p12, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - for (int i = start; i < end; i++) - { - const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step; - const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step; - const uchar* row2 = src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step; - const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step; - const uchar* row4 = src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step; - int vl; - for (int j = 0; j < width; j += vl) - { - if (ksize == 3) - { - vl = __riscv_vsetvl_e8m1(width - j); - vuint8m1_t p00, p01, p02; - vuint8m1_t p03, p04, p05; - vuint8m1_t p06, p07, p08; - vuint8m1_t p10, p11, p12; - vuint8m1_t p13, p14, p15; - vuint8m1_t p16, p17, p18; - vuint8m1_t p20, p21, p22; - vuint8m1_t p23, p24, p25; - vuint8m1_t p26, p27, p28; - vuint8m1_t p30, p31, p32; - vuint8m1_t p33, p34, p35; - vuint8m1_t p36, p37, p38; - auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, 
vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) { - auto src = __riscv_vlseg4e8_v_u8m1x4(row, vl); - p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0); - p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1); - p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2); - p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3); - }; - if (j != 0) - { - loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30); - loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33); - loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36); - } - else - { - loadsrc(row0, p00, p10, p20, p30); - loadsrc(row1, p03, p13, p23, p33); - loadsrc(row2, p06, p16, p26, p36); - p00 = __riscv_vslide1up(p00, row0[0], vl); - p10 = __riscv_vslide1up(p10, row0[1], vl); - p20 = __riscv_vslide1up(p20, row0[2], vl); - p30 = __riscv_vslide1up(p30, row0[3], vl); - p03 = __riscv_vslide1up(p03, row1[0], vl); - p13 = __riscv_vslide1up(p13, row1[1], vl); - p23 = __riscv_vslide1up(p23, row1[2], vl); - p33 = __riscv_vslide1up(p33, row1[3], vl); - p06 = __riscv_vslide1up(p06, row2[0], vl); - p16 = __riscv_vslide1up(p16, row2[1], vl); - p26 = __riscv_vslide1up(p26, row2[2], vl); - p36 = __riscv_vslide1up(p36, row2[3], vl); - } - p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl); - p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl); - p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl); - p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl); - p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl); - p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl); - p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl); - p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl); - p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl); - p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl); - p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl); - p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl); - p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl); - p12 = 
__riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl); - p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl); - p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl); - p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl); - p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl); - p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl); - p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl); - p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl); - p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl); - p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], vl); - p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl); - - auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) { - auto t = a; - a = __riscv_vminu(a, b, vl); - b = __riscv_vmaxu(t, b, vl); - }; - vuint8m1x4_t dst{}; - vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01); - vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05); - vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07); - vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07); - vop(p04, p02); vop(p06, p04); vop(p04, p02); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04); - vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11); - vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15); - vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17); - vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17); - vop(p14, p12); vop(p16, p14); vop(p14, p12); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14); - vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21); - vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25); - vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27); - vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27); - vop(p24, p22); vop(p26, 
p24); vop(p24, p22); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24); - vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31); - vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35); - vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37); - vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37); - vop(p34, p32); vop(p36, p34); vop(p34, p32); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34); - __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); - } - else - { - vl = __riscv_vsetvl_e8m2(width - j); - vuint8m2_t p00, p01, p02, p03, p04; - vuint8m2_t p05, p06, p07, p08, p09; - vuint8m2_t p010, p011, p012, p013, p014; - vuint8m2_t p015, p016, p017, p018, p019; - vuint8m2_t p020, p021, p022, p023, p024; - vuint8m2_t p10, p11, p12, p13, p14; - vuint8m2_t p15, p16, p17, p18, p19; - vuint8m2_t p110, p111, p112, p113, p114; - vuint8m2_t p115, p116, p117, p118, p119; - vuint8m2_t p120, p121, p122, p123, p124; - vuint8m2_t p20, p21, p22, p23, p24; - vuint8m2_t p25, p26, p27, p28, p29; - vuint8m2_t p210, p211, p212, p213, p214; - vuint8m2_t p215, p216, p217, p218, p219; - vuint8m2_t p220, p221, p222, p223, p224; - vuint8m2_t p30, p31, p32, p33, p34; - vuint8m2_t p35, p36, p37, p38, p39; - vuint8m2_t p310, p311, p312, p313, p314; - vuint8m2_t p315, p316, p317, p318, p319; - vuint8m2_t p320, p321, p322, p323, p324; - auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) { - auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl); - p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0); - p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1); - p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2); - p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3); - }; - if (j >= 2) - { - loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30); - loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35); - loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310); - loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315); - loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320); - } - else - { - loadsrc(row0, 
p00, p10, p20, p30); - loadsrc(row1, p05, p15, p25, p35); - loadsrc(row2, p010, p110, p210, p310); - loadsrc(row3, p015, p115, p215, p315); - loadsrc(row4, p020, p120, p220, p320); - auto slideup = [&] { - p00 = __riscv_vslide1up(p00, row0[0], vl); - p10 = __riscv_vslide1up(p10, row0[1], vl); - p20 = __riscv_vslide1up(p20, row0[2], vl); - p30 = __riscv_vslide1up(p30, row0[3], vl); - p05 = __riscv_vslide1up(p05, row1[0], vl); - p15 = __riscv_vslide1up(p15, row1[1], vl); - p25 = __riscv_vslide1up(p25, row1[2], vl); - p35 = __riscv_vslide1up(p35, row1[3], vl); - p010 = __riscv_vslide1up(p010, row2[0], vl); - p110 = __riscv_vslide1up(p110, row2[1], vl); - p210 = __riscv_vslide1up(p210, row2[2], vl); - p310 = __riscv_vslide1up(p310, row2[3], vl); - p015 = __riscv_vslide1up(p015, row3[0], vl); - p115 = __riscv_vslide1up(p115, row3[1], vl); - p215 = __riscv_vslide1up(p215, row3[2], vl); - p315 = __riscv_vslide1up(p315, row3[3], vl); - p020 = __riscv_vslide1up(p020, row4[0], vl); - p120 = __riscv_vslide1up(p120, row4[1], vl); - p220 = __riscv_vslide1up(p220, row4[2], vl); - p320 = __riscv_vslide1up(p320, row4[3], vl); - }; - slideup(); - if (j == 0) - { - slideup(); - } - } - p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl); - p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl); - p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl); - p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl); - p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl); - p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl); - p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl); - p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl); - p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl); - p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl); - p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl); - p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl); - p016 = 
__riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl); - p116 = __riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl); - p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl); - p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl); - p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl); - p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl); - p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl); - p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl); - p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl); - p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl); - p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl); - p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl); - p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl); - p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl); - p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl); - p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl); - p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl); - p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl); - p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl); - p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl); - p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl); - p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl); - p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl); - p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl); - p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl); - p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl); - p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl); - p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl); - p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, j + vl) * 4 ], vl); - p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + 
vl) * 4 + 1], vl); - p23 = __riscv_vslide1down(p22, row0[std::min(width - 1, j + vl) * 4 + 2], vl); - p33 = __riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl); - p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl); - p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl); - p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl); - p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl); - p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl); - p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl); - p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl); - p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl); - p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl); - p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl); - p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl); - p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl); - p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl); - p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl); - p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl); - p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl); - p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl); - p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl); - p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p29 
= __riscv_vslide1down(p28, row1[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl); - p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl); - p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl); - p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - - auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) { - auto t = a; - a = __riscv_vminu(a, b, vl); - b = __riscv_vmaxu(t, b, vl); - }; - vuint8m2x4_t dst{}; - vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04); - vop(p04, p05); vop(p00, p03); vop(p02, p05); vop(p02, p03); vop(p01, p04); - vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08); - vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, p011); - vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06); - vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08); - vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05); - vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08); - vop(p09, p010); 
vop(p013, p014); vop(p012, p013); vop(p013, p014); vop(p016, p017); - vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015); - vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019); - vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024); - vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022); - vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018); - vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016); - vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019); - vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016); - vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012); - vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016); - vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010); - vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017); - vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019); - vop(p07, p011); vop(p011, p013); vop(p011, p012); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012); - vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14); - vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14); - vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18); - vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111); - vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16); - vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18); - vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15); - vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18); - vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117); 
- vop(p115, p116); vop(p116, p117); vop(p112, p115); vop(p114, p117); vop(p114, p115); - vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119); - vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124); - vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122); - vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118); - vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116); - vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119); - vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116); - vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112); - vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116); - vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110); - vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, p117); - vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119); - vop(p17, p111); vop(p111, p113); vop(p111, p112); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, p112); - vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24); - vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24); - vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28); - vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211); - vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26); - vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28); - vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25); - vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28); - vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217); - vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, 
p217); vop(p214, p215); - vop(p213, p216); vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219); - vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224); - vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222); - vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218); - vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216); - vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219); - vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216); - vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212); - vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216); - vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210); - vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217); - vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219); - vop(p27, p211); vop(p211, p213); vop(p211, p212); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212); - vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34); - vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34); - vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38); - vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311); - vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36); - vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38); - vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35); - vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38); - vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317); - vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315); - vop(p313, p316); vop(p313, p314); 
vop(p315, p316); vop(p319, p320); vop(p318, p319); - vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324); - vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322); - vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318); - vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316); - vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319); - vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316); - vop(p317, p318); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312); - vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316); - vop(p32, p314); vop(p310, p322); vop(p310, p314); vop(p36, p318); vop(p36, p310); - vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317); - vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319); - vop(p37, p311); vop(p311, p313); vop(p311, p312); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312); - __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize) -{ - const int type = CV_MAKETYPE(depth, cn); - if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((ksize != 3 && ksize != 5) || src_data == dst_data) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - switch (ksize*100 + type) - { - case 300 + CV_8UC1: - return filter::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_16UC1: - return filter::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_16SC1: - return 
filter::invoke(height, {medianBlurC1<3, RVV_I16M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_32FC1: - return filter::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_8UC1: - return filter::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_16UC1: - return filter::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_16SC1: - return filter::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_32FC1: - return filter::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height); - - case 300 + CV_8UC4: - return filter::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_8UC4: - return filter::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height); - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::medianBlur - -namespace boxFilter { -#undef cv_hal_boxFilter -#define cv_hal_boxFilter cv::cv_hal_rvv::boxFilter::boxFilter - -template struct rvv; -template<> struct rvv -{ - static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); } - static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); } - static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); } - static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); } - static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, 
c); } -}; -template<> struct rvv -{ - static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; } - static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { return a; } - static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } - static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } - static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); } -}; - -// the algorithm is same as cv_hal_sepFilter -template -static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - using T = typename helperT::ElemType; - using WT = typename helperWT::ElemType; - - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); - return pj < 0 ? 
noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - std::vector res(width * ksize); - auto process = [&](int x, int y) { - WT sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += reinterpret_cast(src_data + x * src_step)[p]; - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = anchor_x, right = width - (ksize - 1 - anchor_x); - for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = helperT::setvl(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - anchor_x; - auto src = rvv::vcvt0(helperT::vload(extra, vl), vl); - - extra += vl; - auto sum = src; - src = helperWT::vslide1down(src, extra[0], vl); - sum = helperWT::vadd(sum, src, vl); - src = helperWT::vslide1down(src, extra[1], vl); - sum = helperWT::vadd(sum, src, vl); - if (ksize == 5) - { - src = helperWT::vslide1down(src, extra[2], vl); - sum = helperWT::vadd(sum, src, vl); - src = helperWT::vslide1down(src, extra[3], vl); - sum = helperWT::vadd(sum, src, vl); - } - helperWT::vstore(res.data() + p2idx(i, j), sum, vl); - } - } - } - - int cur = i - (ksize - 1 - anchor_y); - if (cur >= start) - { - const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const WT* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = helperWT::setvl(width - j); - auto sum = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); - if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl); - if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl); - if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl); - if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl); - if (normalize) sum = rvv::vdiv(sum, ksize * ksize, vl); - - if (cast) - { - helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, rvv::vcvt1(sum, vl), vl); - } - else - { - helperWT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); - return pj < 0 ? 
noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; }; - - std::vector res(width * ksize * 3); - auto process = [&](int x, int y) { - float sum0, sum1, sum2; - sum0 = sum1 = sum2 = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum0 += reinterpret_cast(src_data + x * src_step)[p * 3 ]; - sum1 += reinterpret_cast(src_data + x * src_step)[p * 3 + 1]; - sum2 += reinterpret_cast(src_data + x * src_step)[p * 3 + 2]; - } - } - res[p2idx(x, y) ] = sum0; - res[p2idx(x, y) + 1] = sum1; - res[p2idx(x, y) + 2] = sum2; - }; - - const int left = anchor_x, right = width - (ksize - 1 - anchor_x); - for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e32m2(right - j); - const float* extra = reinterpret_cast(src_data + i * src_step) + (j - anchor_x) * 3; - auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - extra += vl * 3; - auto sum0 = src0, sum1 = src1, sum2 = src2; - src0 = __riscv_vfslide1down(src0, extra[0], vl); - src1 = __riscv_vfslide1down(src1, extra[1], vl); - src2 = __riscv_vfslide1down(src2, extra[2], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - src0 = __riscv_vfslide1down(src0, extra[3], vl); - src1 = __riscv_vfslide1down(src1, extra[4], vl); - src2 = __riscv_vfslide1down(src2, extra[5], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = 
__riscv_vfadd(sum2, src2, vl); - if (ksize == 5) - { - src0 = __riscv_vfslide1down(src0, extra[6], vl); - src1 = __riscv_vfslide1down(src1, extra[7], vl); - src2 = __riscv_vfslide1down(src2, extra[8], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - src0 = __riscv_vfslide1down(src0, extra[ 9], vl); - src1 = __riscv_vfslide1down(src1, extra[10], vl); - src2 = __riscv_vfslide1down(src2, extra[11], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - } - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); - __riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl); - } - } - } - - int cur = i - (ksize - 1 - anchor_y); - if (cur >= start) - { - const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const float* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - vfloat32m2_t sum0, sum1, sum2; - sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl); - auto loadres = [&](const float* row) { - if (!row) return; - auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl); - sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl); - sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl); - sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl); - }; - loadres(row0); - loadres(row1); - loadres(row2); - loadres(row3); - loadres(row4); - if (normalize) - { - sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl); - sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl); - sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl); - } - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); - __riscv_vsseg3e32(reinterpret_cast(dst_data + cur * dst_step) + j * 3, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn); - if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5)) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - uchar* _dst_data = dst_data; - size_t _dst_step = dst_step; - const size_t size = CV_ELEM_SIZE(dst_type); - std::vector dst; - if (src_data == _dst_data) - { - dst = std::vector(width * height * size); - 
dst_data = dst.data(); - dst_step = width * size; - } - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? ksize_height / 2 : anchor_y; - if (src_type != dst_type) - { - if (src_type == CV_8UC1 && dst_type == CV_16UC1) - { - if (ksize_width == 3) - { - res = filter::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - } - if (ksize_width == 5) - { - res = filter::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - } - } - } - else - { - switch (ksize_width*100 + src_type) - { - case 300 + CV_8UC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_8UC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_16SC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_16SC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, 
margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32SC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32SC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32FC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32FC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32FC3: - res = filter::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32FC3: - res = filter::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - } - } - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - if (src_data == 
_dst_data) - { - for (int i = 0; i < height; i++) - memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); - } - - return res; -} -} // cv::cv_hal_rvv::boxFilter - -namespace bilateralFilter { -#undef cv_hal_bilateralFilter -#define cv_hal_bilateralFilter cv::cv_hal_rvv::bilateralFilter::bilateralFilter - -// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp -// in the functor BilateralFilter_8u_Invoker -static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) -{ - constexpr int align = 31; - std::vector _sum(width + align), _wsum(width + align); - float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const uchar* sptr = src_data + (i+radius) * src_step + radius; - memset(sum, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto src = __riscv_vle8_v_u8m2(sptr + j, vl); - auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl); - auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl); - auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto dst = 
__riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) -{ - constexpr int align = 31; - std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); - float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); - float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); - float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const uchar* sptr = src_data + (i+radius) * src_step + radius*3; - memset(sum_b, 0, sizeof(float) * width); - memset(sum_g, 0, sizeof(float) * width); - memset(sum_r, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); - auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); - auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); - src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl); - auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); - auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); - auto ksrc2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); - - auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl); - auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl); - auto diff2 = 
__riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl); - auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl); - __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl); - __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl); - vuint8m2x3_t dst{}; - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl)); - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl)); - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl)); - __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp -// in the functor BilateralFilter_32f_Invoker -static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) -{ - constexpr 
int align = 31; - std::vector _sum(width + align), _wsum(width + align); - float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius; - memset(sum, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto src = __riscv_vle32_v_f32m4(sptr + j, vl); - auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl); - auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl); - auto idx = __riscv_vfcvt_rtz_x(diff, vl); - auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); - - auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl); - auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m4(wsum + j, vl), vl), vl); - __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto src = __riscv_vle32_v_f32m4(sptr + j, vl); - auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl); - __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int 
width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) -{ - constexpr int align = 31; - std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); - float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); - float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); - float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius*3; - memset(sum_b, 0, sizeof(float) * width); - memset(sum_g, 0, sizeof(float) * width); - memset(sum_r, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl); - auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl); - auto idx = __riscv_vfcvt_rtz_x(diff, vl); - auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); - - auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl); - auto w = __riscv_vfmadd(alpha, 
__riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl); - __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl); - __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + j, vl), vl), vl); - __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl); - auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl)); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl)); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl)); - __riscv_vsseg3e32(reinterpret_cast(dst_data + i * dst_step) + j * 3, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp -// in the function static void bilateralFilter_8u and bilateralFilter_32f -inline int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type) -{ - const int type = CV_MAKETYPE(depth, cn); - if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && 
type != CV_32FC3) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (type == CV_32FC1 && width * height > 1 << 20) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_data == dst_data || border_type & BORDER_ISOLATED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - sigma_color = sigma_color <= 0 ? 1 : sigma_color; - sigma_space = sigma_space <= 0 ? 1 : sigma_space; - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2; - radius = std::max(radius, 1); - d = radius*2 + 1; - - const int size = depth == CV_32F ? cn * sizeof(float) : cn; - const int temp_step = (width + radius * 2) * size; - std::vector _temp((width + radius * 2) * (height + radius * 2) * size, 0); - uchar* temp = _temp.data(); - std::vector width_interpolate(radius * 2); - for (int j = 0; j < radius; j++) - { - width_interpolate[j] = filter::borderInterpolate(j - radius, width, border_type); - width_interpolate[j + radius] = filter::borderInterpolate(width + j, width, border_type); - } - for (int i = 0; i < height + radius * 2; i++) - { - int x = filter::borderInterpolate(i - radius, height, border_type); - if (x != -1) - { - for (int j = 0; j < radius; j++) - { - int y = width_interpolate[j]; - if (y != -1) - memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size); - y = width_interpolate[j + radius]; - if (y != -1) - memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size); - } - memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size); - } - } - - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = _space_weight.data(); - int* space_ofs = _space_ofs.data(); - int maxk = 0; - for (int i = -radius; i <= radius; i++) - { - for (int j = -radius; j <= radius; j++) - { - double r = std::sqrt((double)i*i + (double)j*j); - if (r <= radius && (depth == CV_8U || i != 0 
|| j != 0)) - { - space_weight[maxk] = static_cast(r*r*gauss_space_coeff); - space_ofs[maxk++] = (i * (temp_step / size) + j) * cn; - } - } - } - cv::cv_hal_rvv::exp32f(space_weight, space_weight, maxk); - - if (depth == CV_8U) - { - std::vector _color_weight(cn*256); - float* color_weight = _color_weight.data(); - for (int i = 0; i < 256*cn; i++) - color_weight[i] = static_cast(i*i*gauss_color_coeff); - cv::cv_hal_rvv::exp32f(color_weight, color_weight, 256*cn); - - switch (cn) - { - case 1: - return filter::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); - case 3: - return filter::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); - } - } - else - { - double minValSrc = -1, maxValSrc = 1; - cv::cv_hal_rvv::minmax::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - for (int i = 0; i < width; i++) - memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size); - return CV_HAL_ERROR_OK; - } - - const int kExpNumBinsPerChannel = 1 << 12; - const int kExpNumBins = kExpNumBinsPerChannel * cn; - const float scale_index = kExpNumBins / static_cast((maxValSrc - minValSrc) * cn); - std::vector _expLUT(kExpNumBins+2, 0); - float* expLUT = _expLUT.data(); - for (int i = 0; i < kExpNumBins+2; i++) - { - double val = i / scale_index; - expLUT[i] = static_cast(val * val * gauss_color_coeff); - } - cv::cv_hal_rvv::exp32f(expLUT, expLUT, kExpNumBins+2); - - switch (cn) - { - case 1: - return filter::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); - case 3: - return filter::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, 
space_weight, expLUT, scale_index); - } - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::bilateralFilter - -}} - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp b/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp deleted file mode 100644 index feab2047e5..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED -#define OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED - -#include -#include "hal_rvv_1p0/sincos.hpp" -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_polarToCart32f -#define cv_hal_polarToCart32f cv::cv_hal_rvv::polarToCart -#undef cv_hal_polarToCart64f -#define cv_hal_polarToCart64f cv::cv_hal_rvv::polarToCart - -template -inline int - polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) -{ - using T = RVV_F32M4; - const auto sincos_scale = angleInDegrees ? 
detail::sincos_deg_scale : detail::sincos_rad_scale; - - size_t vl; - auto cos_p2 = T::vmv(detail::sincos_cos_p2, T::setvlmax()); - auto cos_p0 = T::vmv(detail::sincos_cos_p0, T::setvlmax()); - for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) - { - vl = RVV_T::setvl(len); - auto vangle = T::cast(RVV_T::vload(angle, vl), vl); - T::VecType vsin, vcos; - detail::SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); - if (mag) - { - auto vmag = T::cast(RVV_T::vload(mag, vl), vl); - vsin = __riscv_vfmul(vsin, vmag, vl); - vcos = __riscv_vfmul(vcos, vmag, vl); - mag += vl; - } - RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); - RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif // OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp b/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp deleted file mode 100644 index b87998d637..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp +++ /dev/null @@ -1,131 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED -#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED - -#include -#include -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_sqrt32f -#undef cv_hal_sqrt64f -#undef cv_hal_invSqrt32f -#undef cv_hal_invSqrt64f - -#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt> -#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt> - -#ifdef __clang__ -// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. -// So a smaller LMUL is used here. 
-# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> -# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> -#else -# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> -# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> -#endif - -namespace detail { - -// Newton-Raphson method -// Use 4 LMUL registers -template -inline VEC_T sqrt(VEC_T x, size_t vl) -{ - auto x2 = __riscv_vfmul(x, 0.5, vl); - auto y = __riscv_vfrsqrt7(x, vl); -#ifdef __clang__ -#pragma unroll -#endif - for (size_t i = 0; i < iter_times; i++) - { - auto t = __riscv_vfmul(y, y, vl); - t = __riscv_vfmul(t, x2, vl); - t = __riscv_vfrsub(t, 1.5, vl); - y = __riscv_vfmul(t, y, vl); - } - // just to prevent the compiler from calculating mask before the iteration, which will run out - // of registers and cause memory access. - asm volatile("" ::: "memory"); - auto classified = __riscv_vfclass(x, vl); - // block -0, +0, positive subnormal number, +inf - auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); - return __riscv_vfmul_mu(mask, x, x, y, vl); -} - -// Newton-Raphson method -// Use 3 LMUL registers and 1 mask register -template -inline VEC_T invSqrt(VEC_T x, size_t vl) -{ - auto classified = __riscv_vfclass(x, vl); - // block -0, +0, positive subnormal number, +inf - auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); - auto x2 = __riscv_vfmul(x, 0.5, vl); - auto y = __riscv_vfrsqrt7(x, vl); -#ifdef __clang__ -#pragma unroll -#endif - for (size_t i = 0; i < iter_times; i++) - { - auto t = __riscv_vfmul(y, y, vl); - t = __riscv_vfmul(t, x2, vl); - t = __riscv_vfrsub(t, 1.5, vl); - y = __riscv_vfmul_mu(mask, y, t, y, vl); - } - return y; -} - -} // namespace detail - -template -struct Sqrt32f -{ - using T = RVV_T; - static constexpr size_t iter_times = 2; -}; - -template -struct Sqrt64f -{ - using T = RVV_T; - static constexpr size_t iter_times = 3; -}; - -template -inline int sqrt(const Elem* src, Elem* dst, int _len) -{ - size_t vl; - for (size_t len = 
_len; len > 0; len -= vl, src += vl, dst += vl) - { - vl = SQRT_T::T::setvl(len); - auto x = SQRT_T::T::vload(src, vl); - SQRT_T::T::vstore(dst, detail::sqrt(x, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -template -inline int invSqrt(const Elem* src, Elem* dst, int _len) -{ - size_t vl; - for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) - { - vl = SQRT_T::T::setvl(len); - auto x = SQRT_T::T::vload(src, vl); - SQRT_T::T::vstore(dst, detail::invSqrt(x, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED diff --git a/hal/riscv-rvv/include/core.hpp b/hal/riscv-rvv/include/core.hpp new file mode 100644 index 0000000000..b800420d42 --- /dev/null +++ b/hal/riscv-rvv/include/core.hpp @@ -0,0 +1,332 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_RVV_HAL_CORE_HPP +#define OPENCV_RVV_HAL_CORE_HPP + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +/* ############ merge ############ */ + +int merge8u(const uchar** src, uchar* dst, int len, int cn); +int merge16u(const ushort** src, ushort* dst, int len, int cn); +int merge32s(const int** src, int* dst, int len, int cn); +int merge64s(const int64** src, int64* dst, int len, int cn); + +#undef cv_hal_merge8u +#define cv_hal_merge8u cv::rvv_hal::core::merge8u +#undef cv_hal_merge16u +#define cv_hal_merge16u cv::rvv_hal::core::merge16u +#undef cv_hal_merge32s +#define cv_hal_merge32s cv::rvv_hal::core::merge32s +#undef cv_hal_merge64s +#define cv_hal_merge64s cv::rvv_hal::core::merge64s + +/* ############ meanStdDev ############ */ + +int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type, + double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); + +#undef cv_hal_meanStdDev +#define 
cv_hal_meanStdDev cv::rvv_hal::core::meanStdDev + +/* ############ dft ############ */ + +int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, + int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute); + +#undef cv_hal_dft +#define cv_hal_dft cv::rvv_hal::core::dft + +/* ############ norm ############ */ + +int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result); + +#undef cv_hal_norm +#define cv_hal_norm cv::rvv_hal::core::norm + +/* ############ normDiff ############ */ + +int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, + const uchar* mask, size_t mask_step, int width, int height, int type, + int norm_type, double* result); + +#undef cv_hal_normDiff +#define cv_hal_normDiff cv::rvv_hal::core::normDiff + +/* ############ normHamming ############ */ + +int normHamming8u(const uchar* a, int n, int cellSize, int* result); +int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result); + +#undef cv_hal_normHamming8u +#define cv_hal_normHamming8u cv::rvv_hal::core::normHamming8u +#undef cv_hal_normHammingDiff8u +#define cv_hal_normHammingDiff8u cv::rvv_hal::core::normHammingDiff8u + +/* ############ convertScale ############ */ + +int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, + int width, int height, int sdepth, int ddepth, double alpha, double beta); + +#undef cv_hal_convertScale +#define cv_hal_convertScale cv::rvv_hal::core::convertScale + +/* ############ minMaxIdx ############ */ + +int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, + double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0); + +#undef cv_hal_minMaxIdx +#define cv_hal_minMaxIdx cv::rvv_hal::core::minMaxIdx +#undef cv_hal_minMaxIdxMaskStep +#define cv_hal_minMaxIdxMaskStep 
cv::rvv_hal::core::minMaxIdx + +/* ############ fastAtan ############ */ + +int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg); +int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg); + +#undef cv_hal_fastAtan32f +#define cv_hal_fastAtan32f cv::rvv_hal::core::fast_atan_32 +#undef cv_hal_fastAtan64f +#define cv_hal_fastAtan64f cv::rvv_hal::core::fast_atan_64 + +/* ############ split ############ */ + +int split8u(const uchar* src, uchar** dst, int len, int cn); + +#undef cv_hal_split8u +#define cv_hal_split8u cv::rvv_hal::core::split8u + +/* ############ sqrt ############ */ + +int sqrt32f(const float* src, float* dst, int _len); +int sqrt64f(const double* src, double* dst, int _len); + +#undef cv_hal_sqrt32f +#define cv_hal_sqrt32f cv::rvv_hal::core::sqrt32f +#undef cv_hal_sqrt64f +#define cv_hal_sqrt64f cv::rvv_hal::core::sqrt64f + +int invSqrt32f(const float* src, float* dst, int _len); +int invSqrt64f(const double* src, double* dst, int _len); + +#undef cv_hal_invSqrt32f +#define cv_hal_invSqrt32f cv::rvv_hal::core::invSqrt32f +#undef cv_hal_invSqrt64f +#define cv_hal_invSqrt64f cv::rvv_hal::core::invSqrt64f + +/* ############ magnitude ############ */ + +int magnitude32f(const float *x, const float *y, float *dst, int len); +int magnitude64f(const double *x, const double *y, double *dst, int len); + +#undef cv_hal_magnitude32f +#define cv_hal_magnitude32f cv::rvv_hal::core::magnitude32f +#undef cv_hal_magnitude64f +#define cv_hal_magnitude64f cv::rvv_hal::core::magnitude64f + +/* ############ cartToPolar ############ */ + +int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees); +int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees); + +#undef cv_hal_cartToPolar32f +#define cv_hal_cartToPolar32f cv::rvv_hal::core::cartToPolar32f +#undef cv_hal_cartToPolar64f +#define 
cv_hal_cartToPolar64f cv::rvv_hal::core::cartToPolar64f + +/* ############ polarToCart ############ */ + +int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees); +int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees); + +#undef cv_hal_polarToCart32f +#define cv_hal_polarToCart32f cv::rvv_hal::core::polarToCart32f +#undef cv_hal_polarToCart64f +#define cv_hal_polarToCart64f cv::rvv_hal::core::polarToCart64f + +/* ############ polarToCart ############ */ + +int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, + uchar* dst_data, size_t dst_step, int flip_mode); + +#undef cv_hal_flip +#define cv_hal_flip cv::rvv_hal::core::flip + +/* ############ lut ############ */ + +int lut(const uchar* src_data, size_t src_step, size_t src_type, + const uchar* lut_data, size_t lut_channel_size, size_t lut_channels, + uchar* dst_data, size_t dst_step, int width, int height); + +#undef cv_hal_lut +#define cv_hal_lut cv::rvv_hal::core::lut + +/* ############ exp ############ */ + +int exp32f(const float* src, float* dst, int _len); +int exp64f(const double* src, double* dst, int _len); + +#undef cv_hal_exp32f +#define cv_hal_exp32f cv::rvv_hal::core::exp32f +#undef cv_hal_exp64f +#define cv_hal_exp64f cv::rvv_hal::core::exp64f + +/* ############ log ############ */ + +int log32f(const float* src, float* dst, int _len); +int log64f(const double* src, double* dst, int _len); + +#undef cv_hal_log32f +#define cv_hal_log32f cv::rvv_hal::core::log32f +#undef cv_hal_log64f +#define cv_hal_log64f cv::rvv_hal::core::log64f + +/* ############ lu ############ */ + +int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info); +int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info); + +#undef cv_hal_LU32f +#define cv_hal_LU32f cv::rvv_hal::core::LU32f +#undef 
cv_hal_LU64f +#define cv_hal_LU64f cv::rvv_hal::core::LU64f + +/* ############ cholesky ############ */ + +int Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info); +int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info); + +#undef cv_hal_Cholesky32f +#define cv_hal_Cholesky32f cv::rvv_hal::core::Cholesky32f +#undef cv_hal_Cholesky64f +#define cv_hal_Cholesky64f cv::rvv_hal::core::Cholesky64f + +/* ############ qr ############ */ + +int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info); +int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info); + +#undef cv_hal_QR32f +#define cv_hal_QR32f cv::rvv_hal::core::QR32f +#undef cv_hal_QR64f +#define cv_hal_QR64f cv::rvv_hal::core::QR64f + +/* ############ SVD ############ */ + +int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags); +int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags); + +#undef cv_hal_SVD32f +#define cv_hal_SVD32f cv::rvv_hal::core::SVD32f +#undef cv_hal_SVD64f +#define cv_hal_SVD64f cv::rvv_hal::core::SVD64f + +/* ############ copyToMasked ############ */ + +int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, + int type, const uchar *mask_data, size_t mask_step, int mask_type); + +#undef cv_hal_copyToMasked +#define cv_hal_copyToMasked cv::rvv_hal::core::copyToMasked + +/* ############ div, recip ############ */ + +int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale); +int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar 
*dst_data, size_t dst_step, int width, int height, double scale); +int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale); +int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale); +int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale); +int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale); +// int div64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale); + +#undef cv_hal_div8u +#define cv_hal_div8u cv::rvv_hal::core::div8u +#undef cv_hal_div8s +#define cv_hal_div8s cv::rvv_hal::core::div8s +#undef cv_hal_div16u +#define cv_hal_div16u cv::rvv_hal::core::div16u +#undef cv_hal_div16s +#define cv_hal_div16s cv::rvv_hal::core::div16s +#undef cv_hal_div32s +#define cv_hal_div32s cv::rvv_hal::core::div32s +#undef cv_hal_div32f +#define cv_hal_div32f cv::rvv_hal::core::div32f +// #undef cv_hal_div64f +// #define cv_hal_div64f cv::rvv_hal::core::div64f + +int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale); +int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale); +int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale); +int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale); +int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, 
int width, int height, double scale); +int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale); +// int recip64f(const double *src_data, size_t src_step, double *dst_data, size_t dst_step, int width, int height, double scale); + +#undef cv_hal_recip8u +#define cv_hal_recip8u cv::rvv_hal::core::recip8u +#undef cv_hal_recip8s +#define cv_hal_recip8s cv::rvv_hal::core::recip8s +#undef cv_hal_recip16u +#define cv_hal_recip16u cv::rvv_hal::core::recip16u +#undef cv_hal_recip16s +#define cv_hal_recip16s cv::rvv_hal::core::recip16s +#undef cv_hal_recip32s +#define cv_hal_recip32s cv::rvv_hal::core::recip32s +#undef cv_hal_recip32f +#define cv_hal_recip32f cv::rvv_hal::core::recip32f +// #undef cv_hal_recip64f +// #define cv_hal_recip64f cv::rvv_hal::core::recip64f + +/* ############ dotProduct ############ */ + +int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, + int width, int height, int type, double *dot_val); + +#undef cv_hal_dotProduct +#define cv_hal_dotProduct cv::rvv_hal::core::dotprod + +/* ############ compare ############ */ + +int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int 
cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +// int cmp64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u cv::rvv_hal::core::cmp8u +#undef cv_hal_cmp8s +#define cv_hal_cmp8s cv::rvv_hal::core::cmp8s +#undef cv_hal_cmp16u +#define cv_hal_cmp16u cv::rvv_hal::core::cmp16u +#undef cv_hal_cmp16s +#define cv_hal_cmp16s cv::rvv_hal::core::cmp16s +#undef cv_hal_cmp32s +#define cv_hal_cmp32s cv::rvv_hal::core::cmp32s +#undef cv_hal_cmp32f +#define cv_hal_cmp32f cv::rvv_hal::core::cmp32f +// #undef cv_hal_cmp64f +// #define cv_hal_cmp64f cv::rvv_hal::core::cmp64f + +/* ############ transpose2d ############ */ + +int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int src_width, int src_height, int element_size); + +#undef cv_hal_transpose2d +#define cv_hal_transpose2d cv::rvv_hal::core::transpose2d + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core + +#endif // OPENCV_RVV_HAL_CORE_HPP diff --git a/hal/riscv-rvv/include/imgproc.hpp b/hal/riscv-rvv/include/imgproc.hpp new file mode 100644 index 0000000000..66c75786a0 --- /dev/null +++ b/hal/riscv-rvv/include/imgproc.hpp @@ -0,0 +1,249 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_RVV_HAL_IMGPROC_HPP +#define OPENCV_RVV_HAL_IMGPROC_HPP + +struct cvhalFilter2D; + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +/* ############ imageMoments ############ */ + +int imageMoments(const uchar* src_data, size_t src_step, int src_type, + int width, int height, bool binary, double m[10]); + +#undef cv_hal_imageMoments +#define cv_hal_imageMoments cv::rvv_hal::imgproc::imageMoments + +/* ############ filter ############ */ + +int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/); +int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y); +int filterFree(cvhalFilter2D* context); + +#undef cv_hal_filterInit +#define cv_hal_filterInit cv::rvv_hal::imgproc::filterInit +#undef cv_hal_filter +#define cv_hal_filter cv::rvv_hal::imgproc::filter +#undef cv_hal_filterFree +#define cv_hal_filterFree cv::rvv_hal::imgproc::filterFree + +/* ############ sepFilter ############ */ + +int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType); +int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y); +int sepFilterFree(cvhalFilter2D* context); + +#undef cv_hal_sepFilterInit +#define cv_hal_sepFilterInit cv::rvv_hal::imgproc::sepFilterInit +#undef cv_hal_sepFilter +#define cv_hal_sepFilter cv::rvv_hal::imgproc::sepFilter +#undef 
cv_hal_sepFilterFree +#define cv_hal_sepFilterFree cv::rvv_hal::imgproc::sepFilterFree + +/* ############ morph ############ */ + +int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/); +int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/); +int morphFree(cvhalFilter2D* context); + +#undef cv_hal_morphInit +#undef cv_hal_morph +#undef cv_hal_morphFree +#define cv_hal_morphInit cv::rvv_hal::imgproc::morphInit +#define cv_hal_morph cv::rvv_hal::imgproc::morph +#define cv_hal_morphFree cv::rvv_hal::imgproc::morphFree + +/* ############ gaussianBlur ############ */ + +int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type); + +#undef cv_hal_gaussianBlurBinomial +#define cv_hal_gaussianBlurBinomial cv::rvv_hal::imgproc::gaussianBlurBinomial + +/* ############ medianBlur ############ */ + +int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize); + +#undef cv_hal_medianBlur +#define cv_hal_medianBlur cv::rvv_hal::imgproc::medianBlur + +/* ############ boxFilter ############ */ + +int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, 
int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type); + +#undef cv_hal_boxFilter +#define cv_hal_boxFilter cv::rvv_hal::imgproc::boxFilter + +/* ############ bilateralFilter ############ */ + +int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int depth, int cn, int d, double sigma_color, + double sigma_space, int border_type); + +#undef cv_hal_bilateralFilter +#define cv_hal_bilateralFilter cv::rvv_hal::imgproc::bilateralFilter + +/* ############ pyramid ############ */ + +int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type); +int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type); + +#undef cv_hal_pyrdown +#define cv_hal_pyrdown cv::rvv_hal::imgproc::pyrDown +#undef cv_hal_pyrup +#define cv_hal_pyrup cv::rvv_hal::imgproc::pyrUp + +/* ############ cvtColor ############ */ + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn); +int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue); +int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits); +int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits); +int cvtBGR5x5toGray(const uchar * src_data, 
size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits); +int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits); +int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr); +int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr); +int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx); +int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx); +int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx); +int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx); +int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV); +int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV); +int cvtXYZtoBGR(const uchar * src_data, size_t 
src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue); +int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue); +int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb); +int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb); + +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR +#undef cv_hal_cvtGraytoBGR +#define cv_hal_cvtGraytoBGR cv::rvv_hal::imgproc::cvtGraytoBGR +#undef cv_hal_cvtBGRtoGray +#define cv_hal_cvtBGRtoGray cv::rvv_hal::imgproc::cvtBGRtoGray +#undef cv_hal_cvtBGR5x5toBGR +#define cv_hal_cvtBGR5x5toBGR cv::rvv_hal::imgproc::cvtBGR5x5toBGR +#undef cv_hal_cvtBGRtoBGR5x5 +#define cv_hal_cvtBGRtoBGR5x5 cv::rvv_hal::imgproc::cvtBGRtoBGR5x5 +#undef cv_hal_cvtBGR5x5toGray +#define cv_hal_cvtBGR5x5toGray cv::rvv_hal::imgproc::cvtBGR5x5toGray +#undef cv_hal_cvtGraytoBGR5x5 +#define cv_hal_cvtGraytoBGR5x5 cv::rvv_hal::imgproc::cvtGraytoBGR5x5 +#undef cv_hal_cvtYUVtoBGR +#define cv_hal_cvtYUVtoBGR cv::rvv_hal::imgproc::cvtYUVtoBGR +#undef cv_hal_cvtBGRtoYUV +#define cv_hal_cvtBGRtoYUV cv::rvv_hal::imgproc::cvtBGRtoYUV +#undef cv_hal_cvtOnePlaneYUVtoBGR +#define cv_hal_cvtOnePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtOnePlaneYUVtoBGR +#undef cv_hal_cvtTwoPlaneYUVtoBGR +#define cv_hal_cvtTwoPlaneYUVtoBGR cv::rvv_hal::imgproc::cvtTwoPlaneYUVtoBGR +#undef cv_hal_cvtThreePlaneYUVtoBGR +#define cv_hal_cvtThreePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtThreePlaneYUVtoBGR +#undef cv_hal_cvtOnePlaneBGRtoYUV +#define cv_hal_cvtOnePlaneBGRtoYUV cv::rvv_hal::imgproc::cvtOnePlaneBGRtoYUV +#undef cv_hal_cvtBGRtoTwoPlaneYUV +#define cv_hal_cvtBGRtoTwoPlaneYUV 
cv::rvv_hal::imgproc::cvtBGRtoTwoPlaneYUV +#undef cv_hal_cvtBGRtoThreePlaneYUV +#define cv_hal_cvtBGRtoThreePlaneYUV cv::rvv_hal::imgproc::cvtBGRtoThreePlaneYUV +#undef cv_hal_cvtHSVtoBGR +#define cv_hal_cvtHSVtoBGR cv::rvv_hal::imgproc::cvtHSVtoBGR +#undef cv_hal_cvtBGRtoHSV +#define cv_hal_cvtBGRtoHSV cv::rvv_hal::imgproc::cvtBGRtoHSV +#undef cv_hal_cvtXYZtoBGR +#define cv_hal_cvtXYZtoBGR cv::rvv_hal::imgproc::cvtXYZtoBGR +#undef cv_hal_cvtBGRtoXYZ +#define cv_hal_cvtBGRtoXYZ cv::rvv_hal::imgproc::cvtBGRtoXYZ +#undef cv_hal_cvtLabtoBGR +#define cv_hal_cvtLabtoBGR cv::rvv_hal::imgproc::cvtLabtoBGR +#undef cv_hal_cvtBGRtoLab +#define cv_hal_cvtBGRtoLab cv::rvv_hal::imgproc::cvtBGRtoLab + +/* ############ warp ############ */ + +int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); +int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]); +int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); + +#undef cv_hal_remap32f +#define cv_hal_remap32f cv::rvv_hal::imgproc::remap32f +#undef cv_hal_remap32fc2 +#define cv_hal_remap32fc2 cv::rvv_hal::imgproc::remap32fc2 +#undef cv_hal_remap16s +#define cv_hal_remap16s cv::rvv_hal::imgproc::remap16s + +int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int 
dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]); +int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]); + +#undef cv_hal_warpAffine +#define cv_hal_warpAffine cv::rvv_hal::imgproc::warpAffine +#undef cv_hal_warpPerspective +#define cv_hal_warpPerspective cv::rvv_hal::imgproc::warpPerspective + +/* ############ threshold ############ */ + +int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType); +int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh); +int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C); + +// disabled since UI is fast enough, only called in threshold_otsu +// #undef cv_hal_threshold +// #define cv_hal_threshold cv::rvv_hal::imgproc::threshold +#undef cv_hal_threshold_otsu +#define cv_hal_threshold_otsu cv::rvv_hal::imgproc::threshold_otsu +#undef cv_hal_adaptiveThreshold +#define cv_hal_adaptiveThreshold cv::rvv_hal::imgproc::adaptiveThreshold + +/* ############ histogram ############ */ + +int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height); + +#undef cv_hal_equalize_hist +#define cv_hal_equalize_hist cv::rvv_hal::imgproc::equalize_hist + +/* ############ resize ############ */ + +int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, 
double inv_scale_x, double inv_scale_y, int interpolation); + +#undef cv_hal_resize +#define cv_hal_resize cv::rvv_hal::imgproc::resize + +/* ############ integral ############ */ + +int integral(int depth, int sdepth, int sqdepth, + const uchar* src_data, size_t src_step, + uchar* sum_data, size_t sum_step, + uchar* sqsum_data, size_t sqsum_step, + uchar* tilted_data, [[maybe_unused]] size_t tilted_step, + int width, int height, int cn); + +#undef cv_hal_integral +#define cv_hal_integral cv::rvv_hal::imgproc::integral + +#endif // CV_HAL_RVV_1P0_ENABLED + +#if CV_HAL_RVV_071_ENABLED + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR + +#endif // CV_HAL_RVV_071_ENABLED + +}}} // cv::rvv_hal::imgproc + +#endif // OPENCV_RVV_HAL_IMGPROC_HPP diff --git a/hal/riscv-rvv/hal_rvv_1p0/types.hpp b/hal/riscv-rvv/include/types.hpp similarity index 99% rename from hal/riscv-rvv/hal_rvv_1p0/types.hpp rename to hal/riscv-rvv/include/types.hpp index 6613a018fc..948bbfbd30 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/types.hpp +++ b/hal/riscv-rvv/include/types.hpp @@ -4,13 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_TYPES_HPP_INCLUDED -#define OPENCV_HAL_RVV_TYPES_HPP_INCLUDED +#ifndef OPENCV_RVV_HAL_TYPES_HPP +#define OPENCV_RVV_HAL_TYPES_HPP #include #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { + +#if CV_HAL_RVV_1P0_ENABLED enum RVV_LMUL { @@ -869,6 +871,8 @@ HAL_RVV_GROUP(RVV_F64M1, RVV_F64M8, f64, m1, m8) #undef HAL_RVV_GROUP -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_TYPES_HPP_INCLUDED +}} // namespace cv::rvv_hal + +#endif //OPENCV_RVV_HAL_TYPES_HPP diff --git a/hal/riscv-rvv/rvv_hal.hpp b/hal/riscv-rvv/rvv_hal.hpp new file mode 100644 index 0000000000..88989aaeb8 --- /dev/null +++ b/hal/riscv-rvv/rvv_hal.hpp @@ -0,0 +1,31 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_RVV_HPP_INCLUDED +#define OPENCV_HAL_RVV_HPP_INCLUDED + +#include "opencv2/core/base.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/core/hal/interface.h" + +#if defined(__riscv_v) && __riscv_v == 1000000 +#define CV_HAL_RVV_1P0_ENABLED 1 +#else +#define CV_HAL_RVV_1P0_ENABLED 0 +#endif + +#if defined(__riscv_v) && __riscv_v == 7000 && defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) +#define CV_HAL_RVV_071_ENABLED 1 +#else +#define CV_HAL_RVV_071_ENABLED 0 +#endif + +#if CV_HAL_RVV_1P0_ENABLED || CV_HAL_RVV_071_ENABLED +#include <riscv_vector.h> +#endif +#include "include/types.hpp" +#include "include/core.hpp" +#include "include/imgproc.hpp" + +#endif // OPENCV_HAL_RVV_HPP_INCLUDED diff --git a/hal/riscv-rvv/src/core/atan.cpp b/hal/riscv-rvv/src/core/atan.cpp new file mode 100644 index 0000000000..e2b0d5c314 --- /dev/null +++ b/hal/riscv-rvv/src/core/atan.cpp @@ -0,0 +1,64 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) +{ + auto atan_params = angle_in_deg ? common::atan_params_deg : common::atan_params_rad; + + for (size_t vl = 0; n > 0; n -= vl) + { + vl = __riscv_vsetvl_e32m4(n); + + auto vy = __riscv_vle32_v_f32m4(y, vl); + auto vx = __riscv_vle32_v_f32m4(x, vl); + + auto a = common::rvv_atan(vy, vx, vl, atan_params); + + __riscv_vse32(dst, a, vl); + + x += vl; + y += vl; + dst += vl; + } + + return CV_HAL_ERROR_OK; +} + +int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) +{ + // this also uses float32 version, ref: mathfuncs_core.simd.hpp + + auto atan_params = angle_in_deg ? 
common::atan_params_deg : common::atan_params_rad; + + for (size_t vl = 0; n > 0; n -= vl) + { + vl = __riscv_vsetvl_e64m8(n); + + auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); + auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); + + auto a = common::rvv_atan(vy, vx, vl, atan_params); + + __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); + + x += vl; + y += vl; + dst += vl; + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp b/hal/riscv-rvv/src/core/cart_to_polar.cpp similarity index 53% rename from hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp rename to hal/riscv-rvv/src/core/cart_to_polar.cpp index 676133b668..56ee0fcefc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp +++ b/hal/riscv-rvv/src/core/cart_to_polar.cpp @@ -4,27 +4,20 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED -#define OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/atan.hpp" -#include "hal_rvv_1p0/sqrt.hpp" -#include "hal_rvv_1p0/types.hpp" +#if CV_HAL_RVV_1P0_ENABLED -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_cartToPolar32f -#define cv_hal_cartToPolar32f cv::cv_hal_rvv::cartToPolar -#undef cv_hal_cartToPolar64f -#define cv_hal_cartToPolar64f cv::cv_hal_rvv::cartToPolar +namespace { template inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool angleInDegrees) { using CalType = RVV_SameLen; - auto atan_params = angleInDegrees ? detail::atan_params_deg : detail::atan_params_rad; + auto atan_params = angleInDegrees ? 
common::atan_params_deg : common::atan_params_rad; size_t vl; for (; len > 0; len -= (int)vl, x += vl, y += vl, mag += vl, angle += vl) { @@ -33,16 +26,25 @@ inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool a auto vx = CalType::cast(RVV_T::vload(x, vl), vl); auto vy = CalType::cast(RVV_T::vload(y, vl), vl); - auto vmag = detail::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + auto vmag = common::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); RVV_T::vstore(mag, RVV_T::cast(vmag, vl), vl); - auto vangle = detail::rvv_atan(vy, vx, vl, atan_params); + auto vangle = common::rvv_atan(vy, vx, vl, atan_params); RVV_T::vstore(angle, RVV_T::cast(vangle, vl), vl); } return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +} // anonymous -#endif // OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees) { + return cartToPolar(x, y, mag, angle, len, angleInDegrees); +} +int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees) { + return cartToPolar(x, y, mag, angle, len, angleInDegrees); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp b/hal/riscv-rvv/src/core/cholesky.cpp similarity index 88% rename from hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp rename to hal/riscv-rvv/src/core/cholesky.cpp index b5d9d3e891..995e7eb5be 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp +++ b/hal/riscv-rvv/src/core/cholesky.cpp @@ -4,20 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED -#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace cholesky { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_Cholesky32f -#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky -#undef cv_hal_Cholesky64f -#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::CholImpl @@ -119,6 +114,15 @@ inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info) { + return Cholesky(src1, src1_step, m, src2, src2_step, n, info); +} +int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info) { + return Cholesky(src1, src1_step, m, src2, src2_step, n, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/common.hpp b/hal/riscv-rvv/src/core/common.hpp new file mode 100644 index 0000000000..37ef0194d4 --- /dev/null +++ b/hal/riscv-rvv/src/core/common.hpp @@ -0,0 +1,183 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Third party copyrights are property of their respective owners. 
+ +#ifndef OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED +#define OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED + +#include +#include +#include + +namespace cv { namespace rvv_hal { namespace core { namespace common { + +#if CV_HAL_RVV_1P0_ENABLED + +#define CV_HAL_RVV_NOOP(a) (a) + +// ############ abs ############ + +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ + inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ + _Tpvs mask = __riscv_vsra(v, shift, vl); \ + _Tpvs v_xor = __riscv_vxor(v, mask, vl); \ + return __riscv_vreinterpret_##suffix( \ + __riscv_vsub(v_xor, mask, vl) \ + ); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) + +// ############ absdiff ############ + +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ + inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ + return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) 
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) + +// ############ atan ############ + +// ref: mathfuncs_core.simd.hpp +static constexpr float pi = CV_PI; + +struct AtanParams +{ + float p1, p3, p5, p7, angle_90; +}; + +static constexpr AtanParams atan_params_rad { + 0.9997878412794807F, + -0.3258083974640975F, + 0.1555786518463281F, + -0.04432655554792128F, + 90.F * (pi / 180.F)}; +static constexpr AtanParams atan_params_deg { + atan_params_rad.p1 * (180 / pi), + atan_params_rad.p3 * (180 / pi), + atan_params_rad.p5 * (180 / pi), + atan_params_rad.p7 * (180 / pi), + 90.F}; + +template +__attribute__((always_inline)) inline VEC_T + rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params) +{ + const auto ax = __riscv_vfabs(vx, vl); + const auto ay = __riscv_vfabs(vy, vl); + // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. + const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), + __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), + vl); + const auto c2 = __riscv_vfmul(c, c, vl); + + // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional + // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased + // from 5.952ms to 5.805ms on Muse Pi) + // Additionally, when registers are nearly fully utilized (though not yet exhausted), the + // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in + // cv::rvv_hal::fast_atan_64). + // Saving registers can also make this function more reusable in other contexts. 
+ // Therefore, vfmadd is not used here. + auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); + a = __riscv_vfmul(a, c, vl); + + a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); + + return a; +} + +// ############ sqrt ############ + +template +struct Sqrt32f +{ + using T = RVV_T; + static constexpr size_t iter_times = 2; +}; + +template +struct Sqrt64f +{ + using T = RVV_T; + static constexpr size_t iter_times = 3; +}; + +// Newton-Raphson method +// Use 4 LMUL registers +template +inline VEC_T sqrt(VEC_T x, size_t vl) +{ + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#ifdef __clang__ +#pragma unroll +#endif + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul(t, y, vl); + } + // just to prevent the compiler from calculating mask before the iteration, which will run out + // of registers and cause memory access. 
+ asm volatile("" ::: "memory"); + auto classified = __riscv_vfclass(x, vl); + // block -0, +0, positive subnormal number, +inf + auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); + return __riscv_vfmul_mu(mask, x, x, y, vl); +} + +// Newton-Raphson method +// Use 3 LMUL registers and 1 mask register +template +inline VEC_T invSqrt(VEC_T x, size_t vl) +{ + auto classified = __riscv_vfclass(x, vl); + // block -0, +0, positive subnormal number, +inf + auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#ifdef __clang__ +#pragma unroll +#endif + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul_mu(mask, y, t, y, vl); + } + return y; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}}} // cv::rvv_hal::core::common + +#endif // OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/compare.hpp b/hal/riscv-rvv/src/core/compare.cpp similarity index 76% rename from hal/riscv-rvv/hal_rvv_1p0/compare.hpp rename to hal/riscv-rvv/src/core/compare.cpp index 6efd92e18a..ccf0151afb 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/compare.hpp +++ b/hal/riscv-rvv/src/core/compare.cpp @@ -5,12 +5,11 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED -#define OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED +#include "rvv_hal.hpp" -#include "types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace compare { +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -90,23 +89,6 @@ int compare_impl(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, return CV_HAL_ERROR_OK; } -} // anonymous - -#undef cv_hal_cmp8u -#define cv_hal_cmp8u cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp8s -#define cv_hal_cmp8s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp16u -#define cv_hal_cmp16u cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp16s -#define cv_hal_cmp16s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp32s -#define cv_hal_cmp32s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp32f -#define cv_hal_cmp32f cv::cv_hal_rvv::compare::compare -// #undef cv_hal_cmp64f -// #define cv_hal_cmp64f cv::cv_hal_rvv::compare::compare - template inline int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { @@ -121,6 +103,27 @@ int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size } } -}}} // cv::cv_hal_rvv::compare +} // namespace anonymous -#endif // OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED +int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp16u(const ushort *src1_data, size_t src1_step, const 
ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp b/hal/riscv-rvv/src/core/convert_scale.cpp similarity index 89% rename from hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp rename to hal/riscv-rvv/src/core/convert_scale.cpp index 2f28f20bfd..8c5f83a677 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp +++ b/hal/riscv-rvv/src/core/convert_scale.cpp @@ -4,15 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED -#define OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_convertScale -#define cv_hal_convertScale cv::cv_hal_rvv::convertScale +#if CV_HAL_RVV_1P0_ENABLED inline int convertScale_8U8U(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta) { @@ -89,8 +85,8 @@ inline int convertScale_32F32F(const uchar* src, size_t src_step, uchar* dst, si return CV_HAL_ERROR_OK; } -inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, - int sdepth, int ddepth, double alpha, double beta) +int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, + int width, int height, int sdepth, int ddepth, double alpha, double beta) { if (!dst) return CV_HAL_ERROR_OK; @@ -118,6 +114,6 @@ inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t ds return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp b/hal/riscv-rvv/src/core/copy_mask.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp rename to hal/riscv-rvv/src/core/copy_mask.cpp index f13b8bc22e..dd49cfdeab 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp +++ b/hal/riscv-rvv/src/core/copy_mask.cpp @@ -5,21 +5,17 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED -#define OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_copyToMasked -#define cv_hal_copyToMasked cv::cv_hal_rvv::copyToMasked +#if CV_HAL_RVV_1P0_ENABLED namespace { #define CV_HAL_RVV_COPY_MASK_eXc1(X, mask_lmul) \ static int copyToMasked_e##X##c1(const uchar *src_data, size_t src_step, const uchar *mask_data, size_t mask_step, \ - uchar *dst_data, size_t dst_step, int width, int height) { \ + uchar *dst_data, size_t dst_step, int width, int height) { \ for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { \ const uint##X##_t *src = (const uint##X##_t*)src_data; \ uint##X##_t *dst = (uint##X##_t*)dst_data; \ @@ -41,7 +37,7 @@ CV_HAL_RVV_COPY_MASK_eXc1(64, 1) #define CV_HAL_RVV_COPY_MASK_eXc3(X, mask_lmul) \ static int copyToMasked_e##X##c3(const uchar *src_data, size_t src_step, const uchar *mask_data, size_t mask_step, \ - uchar *dst_data, size_t dst_step, int width, int height) { \ + uchar *dst_data, size_t dst_step, int width, int height) { \ for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { \ const uint##X##_t *src = (const uint##X##_t*)src_data; \ uint##X##_t *dst = (uint##X##_t*)dst_data; \ @@ -62,9 +58,9 @@ CV_HAL_RVV_COPY_MASK_eXc3(32, f2) CV_HAL_RVV_COPY_MASK_eXc3(64, f4) static int copyToMasked_e64c2(const uchar *src_data, size_t src_step, - const uchar *mask_data, size_t mask_step, - uchar *dst_data, size_t dst_step, int width, - int height) { + const uchar *mask_data, size_t mask_step, + uchar *dst_data, size_t dst_step, int width, + int height) { for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { const uint64_t *src = (const uint64_t *)src_data; uint64_t *dst = (uint64_t *)dst_data; @@ -80,9 +76,9 @@ static int copyToMasked_e64c2(const uchar *src_data, size_t src_step, 
} static int copyToMasked_e64c4(const uchar *src_data, size_t src_step, - const uchar *mask_data, size_t mask_step, - uchar *dst_data, size_t dst_step, int width, - int height) { + const uchar *mask_data, size_t mask_step, + uchar *dst_data, size_t dst_step, int width, + int height) { for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { const uint64_t *src = (const uint64_t *)src_data; uint64_t *dst = (uint64_t *)dst_data; @@ -100,8 +96,8 @@ static int copyToMasked_e64c4(const uchar *src_data, size_t src_step, } // anonymous using CopyToMaskedFunc = int (*)(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int); -inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, - int type, const uchar *mask_data, size_t mask_step, int mask_type) { +int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, + int type, const uchar *mask_data, size_t mask_step, int mask_type) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int mdepth = CV_MAT_DEPTH(mask_type), mcn = CV_MAT_CN(mask_type); @@ -189,6 +185,6 @@ inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_OK; } -}} // cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/div.cpp b/hal/riscv-rvv/src/core/div.cpp new file mode 100644 index 0000000000..e12e3775f5 --- /dev/null +++ b/hal/riscv-rvv/src/core/div.cpp @@ -0,0 +1,276 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+ +#include "rvv_hal.hpp" +#include + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); } + +inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); } +inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); } +inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); } +inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); } +inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); } +inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); } + +inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } +inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } +inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } +inline void vse(int16_t *p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } +inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } +inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } + +inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } +inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } +inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } +inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } + +inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } +inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } +inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } +inline 
vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } + +template inline +VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) { + return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl); +} +template <> inline +vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfcvt_f(v1, vl); + auto f2 = __riscv_vfcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); + return __riscv_vfcvt_x(res, vl); +} +template <> inline +vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfcvt_f(v1, vl); + auto f2 = __riscv_vfcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); + return __riscv_vfcvt_xu(res, vl); +} + +template inline +VT recip_sat(const VT &v, const float scale, const int vl) { + return nclip(recip_sat(ext(v, vl), scale, vl), vl); +} +template <> inline +vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) { + auto f = __riscv_vfcvt_f(v, vl); + auto res = __riscv_vfrdiv(f, scale, vl); + return __riscv_vfcvt_x(res, vl); +} +template <> inline +vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) { + auto f = __riscv_vfcvt_f(v, vl); + auto res = __riscv_vfrdiv(f, scale, vl); + return __riscv_vfcvt_xu(res, vl); +} + +// Implementation + +template inline +int div(const ST *src1, size_t step1, const ST *src2, size_t step2, + ST *dst, size_t step, int width, int height, float scale) { + float max_fval = static_cast(std::numeric_limits::max()); + if (scale == 0.f || ((scale * max_fval) < 1.f && (scale * max_fval) > -1.f)) { + for (int h = 0; h < height; h++) { + ST *dst_h = reinterpret_cast((uchar*)dst + h * step); + std::memset(dst_h, 0, sizeof(ST) * width); + } + return CV_HAL_ERROR_OK; + } + + for (int h = 0; h < height; h++) { + const ST *src1_h = 
reinterpret_cast((const uchar*)src1 + h * step1); + const ST *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + ST *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + auto mask = __riscv_vmseq(v2, 0, vl); + vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template <> +int div(const float *src1, size_t step1, const float *src2, size_t step2, + float *dst, size_t step, int width, int height, float scale) { + if (scale == 0.f) { + for (int h = 0; h < height; h++) { + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + std::memset(dst_h, 0, sizeof(float) * width); + } + return CV_HAL_ERROR_OK; + } + + if (std::fabs(scale - 1.f) < FLT_EPSILON) { + for (int h = 0; h < height; h++) { + const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl); + } + } + } else { + for (int h = 0; h < height; h++) { + const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template inline +int recip(const ST *src_data, size_t src_step, ST *dst_data, size_t dst_step, + int width, int 
height, float scale) { + if (scale == 0.f || (scale < 1.f && scale > -1.f)) { + for (int h = 0; h < height; h++) { + ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + std::memset(dst_h, 0, sizeof(ST) * width); + } + return CV_HAL_ERROR_OK; + } + + for (int h = 0; h < height; h++) { + const ST *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + auto mask = __riscv_vmseq(v, 0, vl); + vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template <> +int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, + int width, int height, float scale) { + if (scale == 0.f) { + for (int h = 0; h < height; h++) { + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + std::memset(dst_h, 0, sizeof(float) * width); + } + return CV_HAL_ERROR_OK; + } + + if (std::fabs(scale - 1.f) < FLT_EPSILON) { + for (int h = 0; h < height; h++) { + const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl); + } + } + } else { + for (int h = 0; h < height; h++) { + const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t 
dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} + +int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, 
size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp b/hal/riscv-rvv/src/core/dotprod.cpp similarity index 87% rename from hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp rename to hal/riscv-rvv/src/core/dotprod.cpp index e16a97cf6a..2630ca198d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp +++ b/hal/riscv-rvv/src/core/dotprod.cpp @@ -5,21 +5,16 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
- -#ifndef OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED -#define OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" #include -namespace cv { namespace cv_hal_rvv { namespace dotprod { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_dotProduct -#define cv_hal_dotProduct cv::cv_hal_rvv::dotprod::dotprod +#if CV_HAL_RVV_1P0_ENABLED namespace { -double dotProd_8u(const uchar *a, const uchar *b, int len) { +static inline double dotProd_8u(const uchar *a, const uchar *b, int len) { constexpr int block_size0 = (1 << 15); double r = 0; @@ -47,7 +42,7 @@ double dotProd_8u(const uchar *a, const uchar *b, int len) { return r; } -double dotProd_8s(const schar *a, const schar *b, int len) { +static inline double dotProd_8s(const schar *a, const schar *b, int len) { constexpr int block_size0 = (1 << 14); double r = 0; @@ -75,7 +70,7 @@ double dotProd_8s(const schar *a, const schar *b, int len) { return r; } -double dotProd_16u(const ushort *a, const ushort *b, int len) { +static inline double dotProd_16u(const ushort *a, const ushort *b, int len) { constexpr int block_size0 = (1 << 24); double r = 0; @@ -103,7 +98,7 @@ double dotProd_16u(const ushort *a, const ushort *b, int len) { return r; } -double dotProd_16s(const short *a, const short *b, int len) { +static inline double dotProd_16s(const short *a, const short *b, int len) { constexpr int block_size0 = (1 << 24); double r = 0; @@ -131,7 +126,7 @@ double dotProd_16s(const short *a, const short *b, int len) { return r; } -double dotProd_32s(const int *a, const int *b, int len) { +static inline double dotProd_32s(const int *a, const int *b, int len) { double r = 0; vfloat64m8_t s = __riscv_vfmv_v_f_f64m8(0.f, __riscv_vsetvlmax_e64m8()); @@ -149,7 +144,7 @@ double dotProd_32s(const int *a, const int *b, int len) { return r; } -double dotProd_32f(const float *a, const float *b, int len) { +static inline double dotProd_32f(const float *a, const float *b, int len) { constexpr int block_size0 = (1 
<< 11); double r = 0.f; @@ -180,8 +175,8 @@ double dotProd_32f(const float *a, const float *b, int len) { } // anonymous using DotProdFunc = double (*)(const uchar *a, const uchar *b, int len); -inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, - int width, int height, int type, double *dot_val) { +int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, + int width, int height, int type, double *dot_val) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); static DotProdFunc dotprod_tab[CV_DEPTH_MAX] = { @@ -228,6 +223,6 @@ inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::dotprod +#endif // CV_HAL_RVV_1P0_ENABLED -#endif // OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/dxt.hpp b/hal/riscv-rvv/src/core/dxt.cpp similarity index 97% rename from hal/riscv-rvv/hal_rvv_1p0/dxt.hpp rename to hal/riscv-rvv/src/core/dxt.cpp index 25f4879532..fa0c464e88 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/dxt.hpp +++ b/hal/riscv-rvv/src/core/dxt.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_DXT_HPP_INCLUDED -#define OPENCV_HAL_RVV_DXT_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" -#include "opencv2/core/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace dxt { - -#undef cv_hal_dft -#define cv_hal_dft cv::cv_hal_rvv::dxt::dft +#if CV_HAL_RVV_1P0_ENABLED template struct rvv; @@ -42,7 +36,7 @@ template<> struct rvv : RVV_F64M1 // in the function template static void cv::DFT and cv::DFT_R2, cv::DFT_R3, cv::DFT_R5 template inline int dft(const Complex* src, Complex* dst, int nf, int *factors, T scale, int* itab, - const Complex* wave, int tab_size, int len, bool isInverse, bool noPermute) + const Complex* wave, int tab_size, int len, bool isInverse, bool noPermute) { int n = len; int f_idx, nx; @@ -545,8 +539,8 @@ inline int dft(const Complex* src, Complex* dst, int nf, int *factors, T s return CV_HAL_ERROR_OK; } -inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, int* itab, void* wave, - int tab_size, int n, bool isInverse, bool noPermute) +int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, + int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute) { if( n == 0 ) return CV_HAL_ERROR_OK; @@ -563,6 +557,6 @@ inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, do return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/exp.hpp b/hal/riscv-rvv/src/core/exp.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/exp.hpp rename to hal/riscv-rvv/src/core/exp.cpp index 82690fb321..552fdc0e3f 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/exp.hpp +++ b/hal/riscv-rvv/src/core/exp.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_EXP_HPP_INCLUDED -#define OPENCV_HAL_RVV_EXP_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_exp32f -#define cv_hal_exp32f cv::cv_hal_rvv::exp32f -#undef cv_hal_exp64f -#define cv_hal_exp64f cv::cv_hal_rvv::exp64f +#if CV_HAL_RVV_1P0_ENABLED namespace detail { @@ -116,7 +110,7 @@ static constexpr double exp_tab_64f[exp_tab_size] = EXP_TAB_VALUE; } // namespace detail -inline int exp32f(const float* src, float* dst, int _len) +int exp32f(const float* src, float* dst, int _len) { size_t vl = __riscv_vsetvlmax_e32m4(); auto exp_a2 = __riscv_vfmv_v_f_f32m4(detail::exp32f_a2, vl); @@ -158,7 +152,7 @@ inline int exp32f(const float* src, float* dst, int _len) return CV_HAL_ERROR_OK; } -inline int exp64f(const double* src, double* dst, int _len) +int exp64f(const double* src, double* dst, int _len) { size_t vl = __riscv_vsetvlmax_e64m4(); // all vector registers are used up, so not load more constants @@ -203,6 +197,6 @@ inline int exp64f(const double* src, double* dst, int _len) return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_EXP_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/flip.hpp b/hal/riscv-rvv/src/core/flip.cpp similarity index 96% rename from hal/riscv-rvv/hal_rvv_1p0/flip.hpp rename to hal/riscv-rvv/src/core/flip.cpp index 02abeb6e93..6f4c577c25 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/flip.hpp +++ b/hal/riscv-rvv/src/core/flip.cpp @@ -5,13 +5,7 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. 
-#ifndef OPENCV_HAL_RVV_FLIP_HPP_INCLUDED -#define OPENCV_HAL_RVV_FLIP_HPP_INCLUDED - - -#include -#include -#include "hal_rvv_1p0/types.hpp" +#include "rvv_hal.hpp" #if defined (__clang__) && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \ @@ -24,10 +18,9 @@ #define __riscv_vcreate_v_u64m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u64, 2, v0, v1, v2) #endif -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_flip -#define cv_hal_flip cv::cv_hal_rvv::flip +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -73,6 +66,13 @@ CV_HAL_RVV_FLIP_INPLACE_C1(16UC1, ushort, RVV_U16M8) CV_HAL_RVV_FLIP_INPLACE_C1(32UC1, unsigned, RVV_U32M8) CV_HAL_RVV_FLIP_INPLACE_C1(64UC1, uint64_t, RVV_U64M8) +// Suppress warnings of "ignoring attributes applied to VecType after definition", +// VecType is vuint8m2x3_t, vuint16m2x3_t, vuint32m2x3_t or vuint64m2x3_t +#if defined (__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif + #define CV_HAL_RVV_FLIP_C3_TYPES(width) \ struct RVV_C3_U##width##M2 : RVV_U##width##M2 { \ static inline vuint##width##m2x3_t vload3(const uint##width##_t *base, size_t vl) { return __riscv_vlseg3e##width##_v_u##width##m2x3(base, vl); } \ @@ -90,6 +90,10 @@ CV_HAL_RVV_FLIP_C3_TYPES(16) CV_HAL_RVV_FLIP_C3_TYPES(32) CV_HAL_RVV_FLIP_C3_TYPES(64) +#if defined (__GNUC__) +#pragma GCC diagnostic pop +#endif + #define CV_HAL_RVV_FLIP_C3(name, _Tps, RVV) \ inline void flip_##name(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int src_width, int src_height, int flip_mode) { \ for (int h = 0; h < src_height; h++) { \ @@ -311,7 +315,7 @@ inline int flip_inplace(int esz, uchar* data, size_t step, int width, int height return CV_HAL_ERROR_OK; } -inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, +int flip(int src_type, const uchar* src_data, size_t src_step, int 
src_width, int src_height, uchar* dst_data, size_t dst_step, int flip_mode) { int esz = CV_ELEM_SIZE(src_type); @@ -368,6 +372,6 @@ inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_wi return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_FLIP_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/log.hpp b/hal/riscv-rvv/src/core/log.cpp similarity index 98% rename from hal/riscv-rvv/hal_rvv_1p0/log.hpp rename to hal/riscv-rvv/src/core/log.cpp index 8df0761861..0783e3be54 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/log.hpp +++ b/hal/riscv-rvv/src/core/log.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_LOG_HPP_INCLUDED -#define OPENCV_HAL_RVV_LOG_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_log32f -#define cv_hal_log32f cv::cv_hal_rvv::log32f -#undef cv_hal_log64f -#define cv_hal_log64f cv::cv_hal_rvv::log64f +#if CV_HAL_RVV_1P0_ENABLED namespace detail { @@ -306,7 +300,7 @@ static constexpr double log_tab_64f[log_tab_size] = LOG_TAB_VALUE; } // namespace detail -inline int log32f(const float* src, float* dst, int _len) +int log32f(const float* src, float* dst, int _len) { size_t vl = __riscv_vsetvlmax_e32m4(); auto log_a2 = __riscv_vfmv_v_f_f32m4(detail::log32f_a2, vl); @@ -340,7 +334,7 @@ inline int log32f(const float* src, float* dst, int _len) return CV_HAL_ERROR_OK; } -inline int log64f(const double* src, double* dst, int _len) +int log64f(const double* src, double* dst, int _len) { size_t vl = __riscv_vsetvlmax_e64m4(); // all vector registers are used up, so not load more constants @@ -382,6 +376,6 @@ inline int log64f(const double* src, double* dst, int _len) return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif 
//OPENCV_HAL_RVV_LOG_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/lu.hpp b/hal/riscv-rvv/src/core/lu.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/lu.hpp rename to hal/riscv-rvv/src/core/lu.cpp index 6de137fe82..d4579caa47 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/lu.hpp +++ b/hal/riscv-rvv/src/core/lu.cpp @@ -4,21 +4,16 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED -#define OPENCV_HAL_RVV_LU_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace lu { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_LU32f -#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU -#undef cv_hal_LU64f -#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::LUImpl @@ -167,6 +162,15 @@ inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info) { + return LU(src1, src1_step, m, src2, src2_step, n, info); +} +int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info) { + return LU(src1, src1_step, m, src2, src2_step, n, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/lut.hpp b/hal/riscv-rvv/src/core/lut.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/lut.hpp rename to hal/riscv-rvv/src/core/lut.cpp index c13a5b2f0a..a90afd2604 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/lut.hpp +++ b/hal/riscv-rvv/src/core/lut.cpp @@ -4,19 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_LUT_HPP_INCLUDED -#define OPENCV_HAL_RVV_LUT_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_lut -#define cv_hal_lut cv::cv_hal_rvv::lut +#if CV_HAL_RVV_1P0_ENABLED // need vlen >= 256 struct LUTCacheU8 : RVV_U8M8 @@ -135,7 +127,7 @@ private: LUTParallelBody& operator=(const LUTParallelBody&); }; -inline int lut(const uchar* src_data, +int lut(const uchar* src_data, size_t src_step, size_t src_type, const uchar* lut_data, @@ -191,6 +183,6 @@ inline int lut(const uchar* src_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_LUT_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp b/hal/riscv-rvv/src/core/magnitude.cpp similarity index 54% rename from hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp rename to hal/riscv-rvv/src/core/magnitude.cpp index eb814c1b77..8630b717da 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp +++ b/hal/riscv-rvv/src/core/magnitude.cpp @@ -4,20 +4,14 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED -#define OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/sqrt.hpp" -#include "hal_rvv_1p0/types.hpp" +#if CV_HAL_RVV_1P0_ENABLED -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_magnitude32f -#define cv_hal_magnitude32f cv::cv_hal_rvv::magnitude> -#undef cv_hal_magnitude64f -#define cv_hal_magnitude64f cv::cv_hal_rvv::magnitude> +namespace { template inline int magnitude(const T* x, const T* y, T* dst, int len) @@ -30,13 +24,22 @@ inline int magnitude(const T* x, const T* y, T* dst, int len) auto vx = SQRT_T::T::vload(x, vl); auto vy = SQRT_T::T::vload(y, vl); - auto vmag = detail::sqrt(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + auto vmag = common::sqrt(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); SQRT_T::T::vstore(dst, vmag, vl); } return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +} // anonymous -#endif // OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED +int magnitude32f(const float *x, const float *y, float *dst, int len) { + return magnitude>(x, y, dst, len); +} +int magnitude64f(const double *x, const double *y, double *dst, int len) { + return magnitude>(x, y, dst, len); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/mean.hpp b/hal/riscv-rvv/src/core/mean.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/mean.hpp rename to hal/riscv-rvv/src/core/mean.cpp index e8156371b3..2fc2f98f65 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/mean.hpp +++ b/hal/riscv-rvv/src/core/mean.cpp @@ -4,15 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED -#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_meanStdDev -#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev +#if CV_HAL_RVV_1P0_ENABLED inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); @@ -21,8 +17,8 @@ inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, in inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); -inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, - int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) { +int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type, + double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) { switch (src_type) { case CV_8UC1: @@ -226,6 +222,6 @@ inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, i return CV_HAL_ERROR_OK; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/merge.hpp b/hal/riscv-rvv/src/core/merge.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/merge.hpp rename to hal/riscv-rvv/src/core/merge.cpp index b1da204b39..9dcc6b67e2 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/merge.hpp +++ b/hal/riscv-rvv/src/core/merge.cpp @@ -4,21 +4,7 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED -#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_merge8u -#define cv_hal_merge8u cv::cv_hal_rvv::merge8u -#undef cv_hal_merge16u -#define cv_hal_merge16u cv::cv_hal_rvv::merge16u -#undef cv_hal_merge32s -#define cv_hal_merge32s cv::cv_hal_rvv::merge32s -#undef cv_hal_merge64s -#define cv_hal_merge64s cv::cv_hal_rvv::merge64s +#include "rvv_hal.hpp" #if defined __clang__ && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \ @@ -44,7 +30,11 @@ namespace cv { namespace cv_hal_rvv { #define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3) #endif // clang < 18 -inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) { +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +int merge8u(const uchar** src, uchar* dst, int len, int cn ) { int vl = 0; if (cn == 1) { @@ -129,7 +119,7 @@ inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) { return CV_HAL_ERROR_OK; } -inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) { +int merge16u(const ushort** src, ushort* dst, int len, int cn ) { int vl = 0; if (cn == 1) { @@ -217,7 +207,7 @@ inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) { #if defined __GNUC__ && !defined(__clang__) __attribute__((optimize("no-tree-vectorize"))) #endif -inline int merge32s(const int** src, int* dst, int len, int cn ) { +int merge32s(const int** src, int* dst, int len, int cn ) { int k = cn % 4 ? cn % 4 : 4; int i, j; if( k == 1 ) @@ -287,7 +277,7 @@ inline int merge32s(const int** src, int* dst, int len, int cn ) { #if defined __GNUC__ && !defined(__clang__) __attribute__((optimize("no-tree-vectorize"))) #endif -inline int merge64s(const int64** src, int64* dst, int len, int cn ) { +int merge64s(const int64** src, int64* dst, int len, int cn ) { int k = cn % 4 ? 
cn % 4 : 4; int i, j; if( k == 1 ) @@ -354,6 +344,6 @@ inline int merge64s(const int64** src, int64* dst, int len, int cn ) { return CV_HAL_ERROR_OK; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/minmax.hpp b/hal/riscv-rvv/src/core/minmax.cpp similarity index 94% rename from hal/riscv-rvv/hal_rvv_1p0/minmax.hpp rename to hal/riscv-rvv/src/core/minmax.cpp index c07a1ff6f7..5fbc3a0f50 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/minmax.hpp +++ b/hal/riscv-rvv/src/core/minmax.cpp @@ -4,19 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED -#define OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace minmax { - -#undef cv_hal_minMaxIdx -#define cv_hal_minMaxIdx cv::cv_hal_rvv::minmax::minMaxIdx -#undef cv_hal_minMaxIdxMaskStep -#define cv_hal_minMaxIdxMaskStep cv::cv_hal_rvv::minmax::minMaxIdx +#if CV_HAL_RVV_1P0_ENABLED template inline int minMaxIdxReadTwice(const uchar* src_data, size_t src_step, int width, int height, double* minVal, double* maxVal, @@ -257,8 +249,8 @@ inline int minMaxIdxReadOnce(const uchar* src_data, size_t src_step, int width, return CV_HAL_ERROR_OK; } -inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal, - int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0) +int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, + double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step) { if (!mask_step) mask_step = src_step; @@ -284,6 +276,6 @@ inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int heig return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // 
cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm.hpp b/hal/riscv-rvv/src/core/norm.cpp similarity index 96% rename from hal/riscv-rvv/hal_rvv_1p0/norm.hpp rename to hal/riscv-rvv/src/core/norm.cpp index c35c0a3bd5..b2deb3f4fc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm.hpp +++ b/hal/riscv-rvv/src/core/norm.cpp @@ -6,15 +6,12 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_NORM_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_HPP_INCLUDED - +#include "rvv_hal.hpp" #include "common.hpp" -namespace cv { namespace cv_hal_rvv { namespace norm { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_norm -#define cv_hal_norm cv::cv_hal_rvv::norm::norm +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -76,7 +73,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); auto v = __riscv_vle8_v_i8m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); } @@ -106,7 +103,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e16m8(n - i); auto v = __riscv_vle16_v_i16m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); } @@ -121,7 +118,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e32m8(n - i); auto v = __riscv_vle32_v_i32m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); } @@ 
-180,7 +177,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -208,7 +205,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e16m8(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -223,7 +220,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e32m4(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -544,7 +541,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -560,7 +557,7 @@ struct MaskedNormL1_RVV { int vl; for (int i = 0; i < len; i += vl) { vl = __riscv_vsetvl_e8m8(len - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); + auto v = common::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = 
__riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -657,7 +654,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -672,7 +669,7 @@ struct MaskedNormL1_RVV { int vl; for (int i = 0; i < len; i += vl) { vl = __riscv_vsetvl_e8m4(len - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); + auto v = common::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -714,7 +711,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse32_v_i32m8(src + cn * i + cn_index, sizeof(int) * cn, vl); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); @@ -733,7 +730,7 @@ struct MaskedNormL1_RVV { auto v = __riscv_vlse32_v_i32m4(src + cn * i + cn_index, sizeof(int) * cn, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, custom_intrin::__riscv_vabs(v, vl), vl), vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, common::__riscv_vabs(v, vl), vl), vl); } } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, 
__riscv_vsetvlmax_e64m1()), vlmax)); @@ -972,8 +969,8 @@ CV_HAL_RVV_DEF_NORM_ALL(64f, double, double, double, double) } using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int); -inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, - int height, int type, int norm_type, double* result) { +int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); if (result == nullptr || depth == CV_16F || norm_type > NORM_L2SQR) { @@ -1090,6 +1087,6 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::norm +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp b/hal/riscv-rvv/src/core/norm_diff.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp rename to hal/riscv-rvv/src/core/norm_diff.cpp index 1ffa42f15d..f136be108a 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp +++ b/hal/riscv-rvv/src/core/norm_diff.cpp @@ -6,15 +6,12 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED - +#include "rvv_hal.hpp" #include "common.hpp" -namespace cv { namespace cv_hal_rvv { namespace norm_diff { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_normDiff -#define cv_hal_normDiff cv::cv_hal_rvv::norm_diff::normDiff +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -62,7 +59,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -79,7 +76,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -96,7 +93,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -113,7 +110,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), 
vlmax)); @@ -130,8 +127,8 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e32m8(n - i); auto v1 = __riscv_vle32_v_i32m8(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m8(src2 + i, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); @@ -182,7 +179,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -199,7 +196,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -215,7 +212,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -231,7 +228,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(v, s, vl); } return 
__riscv_vmv_x(s); @@ -248,8 +245,8 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e32m4(n - i); auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -299,7 +296,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(n - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); } return __riscv_vmv_x(s); @@ -315,7 +312,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(n - i); auto v1 = __riscv_vle8_v_i8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); } return __riscv_vmv_x(s); @@ -332,7 +329,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(n - i); auto v1 = __riscv_vle16_v_u16m2(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m2(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); } @@ -350,7 +347,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(n - i); auto v1 = __riscv_vle16_v_i16m2(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m2(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = 
__riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); } @@ -368,7 +365,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e32m4(n - i); auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfcvt_f(v_mul, vl), vl); } @@ -471,7 +468,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -482,7 +479,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m2(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vmaxu_tumu(b, s, s, v, vl * 4); @@ -494,7 +491,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -516,7 +513,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = 
__riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -537,7 +534,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(len - i); auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -558,7 +555,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(len - i); auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -579,8 +576,8 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e32m8(len - i); auto v1 = __riscv_vlse32_v_i32m8(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m8(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -656,7 +653,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 
= __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -667,7 +664,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m2(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl * 4), s, __riscv_vsetvlmax_e16m1()); @@ -679,7 +676,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -701,7 +698,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -721,7 +718,7 @@ struct MaskedNormDiffL1_RVV { vl = 
__riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -741,7 +738,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -762,8 +759,8 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e32m4(len - i); auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl); @@ -838,7 +835,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -849,7 +846,7 @@ struct 
MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m1(len - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m4(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m4(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl * 4), s, vl * 4); @@ -861,7 +858,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse8_v_u8m4(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m4(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -882,7 +879,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse8_v_i8m4(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m4(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -903,7 +900,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse16_v_u16m2(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m2(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -925,7 +922,7 @@ struct 
MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse16_v_i16m2(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m2(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -947,7 +944,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -1081,9 +1078,8 @@ CV_HAL_RVV_DEF_NORM_DIFF_ALL(64f, double, double, double, double) } using NormDiffFunc = int (*)(const uchar*, const uchar*, const uchar*, uchar*, int, int); -inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, - size_t mask_step, int width, int height, int type, int norm_type, double* result) -{ +int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); bool relative = norm_type & NORM_RELATIVE; @@ -1207,7 +1203,7 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size if(relative) { double result_; - int ret = cv::cv_hal_rvv::norm::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_); + int ret = cv::rvv_hal::core::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_); if(ret == CV_HAL_ERROR_OK) { *result /= result_ + 
DBL_EPSILON; @@ -1217,6 +1213,6 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size return CV_HAL_ERROR_OK; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp b/hal/riscv-rvv/src/core/norm_hamming.cpp similarity index 89% rename from hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp rename to hal/riscv-rvv/src/core/norm_hamming.cpp index 9c19f62b7e..7a0951f3bc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp +++ b/hal/riscv-rvv/src/core/norm_hamming.cpp @@ -4,18 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_normHamming8u -#define cv_hal_normHamming8u cv::cv_hal_rvv::normHamming8u -#undef cv_hal_normHammingDiff8u -#define cv_hal_normHammingDiff8u cv::cv_hal_rvv::normHammingDiff8u +#if CV_HAL_RVV_1P0_ENABLED template inline void normHammingCnt_m8(vuint8m8_t v, vbool1_t mask, size_t len_bool, size_t& result) @@ -153,7 +146,7 @@ inline void normHammingDiff8uLoop(const uchar* a, const uchar* b, size_t n, size } } -inline int normHamming8u(const uchar* a, int n, int cellSize, int* result) +int normHamming8u(const uchar* a, int n, int cellSize, int* result) { size_t _result = 0; @@ -168,7 +161,7 @@ inline int normHamming8u(const uchar* a, int n, int cellSize, int* result) return CV_HAL_ERROR_OK; } -inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) +int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) { size_t _result = 0; @@ -183,6 +176,6 @@ inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // 
CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/sincos.hpp b/hal/riscv-rvv/src/core/polar_to_cart.cpp similarity index 61% rename from hal/riscv-rvv/hal_rvv_1p0/sincos.hpp rename to hal/riscv-rvv/src/core/polar_to_cart.cpp index 776d58f42c..bb5824ca49 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/sincos.hpp +++ b/hal/riscv-rvv/src/core/polar_to_cart.cpp @@ -1,16 +1,16 @@ // This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED -#define OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace detail { +#if CV_HAL_RVV_1P0_ENABLED + +namespace { static constexpr size_t sincos_mask = 0x3; @@ -67,6 +67,44 @@ static inline void cosval = __riscv_vfneg_mu(__riscv_vmor(idx1, idx2, vl), cosval, cosval, vl); } -}}} // namespace cv::cv_hal_rvv::detail +template +inline int polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) +{ + using T = RVV_F32M4; + const auto sincos_scale = angleInDegrees ? 
sincos_deg_scale : sincos_rad_scale; -#endif // OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED + size_t vl; + auto cos_p2 = T::vmv(sincos_cos_p2, T::setvlmax()); + auto cos_p0 = T::vmv(sincos_cos_p0, T::setvlmax()); + for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) + { + vl = RVV_T::setvl(len); + auto vangle = T::cast(RVV_T::vload(angle, vl), vl); + T::VecType vsin, vcos; + SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); + if (mag) + { + auto vmag = T::cast(RVV_T::vload(mag, vl), vl); + vsin = __riscv_vfmul(vsin, vmag, vl); + vcos = __riscv_vfmul(vcos, vmag, vl); + mag += vl; + } + RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); + RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees) { + return polarToCart(mag, angle, x, y, len, angleInDegrees); +} +int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees) { + return polarToCart(mag, angle, x, y, len, angleInDegrees); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/qr.hpp b/hal/riscv-rvv/src/core/qr.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/qr.hpp rename to hal/riscv-rvv/src/core/qr.cpp index a7085e062b..1bb471a5aa 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/qr.hpp +++ b/hal/riscv-rvv/src/core/qr.cpp @@ -4,22 +4,17 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED -#define OPENCV_HAL_RVV_QR_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace qr { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_QR32f -#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR -#undef cv_hal_QR64f -#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::QRImpl @@ -171,6 +166,15 @@ inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t sr return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info) { + return QR(src1, src1_step, m, n, k, src2, src2_step, dst, info); +} +int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info) { + return QR(src1, src1_step, m, n, k, src2, src2_step, dst, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/split.hpp b/hal/riscv-rvv/src/core/split.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/split.hpp rename to hal/riscv-rvv/src/core/split.cpp index 9646fd9f67..1a843c939e 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/split.hpp +++ b/hal/riscv-rvv/src/core/split.cpp @@ -1,17 +1,14 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
-#ifndef OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED -#define OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED -#include +#include "rvv_hal.hpp" -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_split8u -#define cv_hal_split8u cv::cv_hal_rvv::split8u +#if CV_HAL_RVV_1P0_ENABLED -inline int split8u(const uchar* src, uchar** dst, int len, int cn) +int split8u(const uchar* src, uchar** dst, int len, int cn) { int vl = 0; if (cn == 1) @@ -89,5 +86,6 @@ inline int split8u(const uchar* src, uchar** dst, int len, int cn) return CV_HAL_ERROR_OK; } -}} -#endif +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/sqrt.cpp b/hal/riscv-rvv/src/core/sqrt.cpp new file mode 100644 index 0000000000..7186f1bcca --- /dev/null +++ b/hal/riscv-rvv/src/core/sqrt.cpp @@ -0,0 +1,74 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +template +inline int sqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, common::sqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +template +inline int invSqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, common::invSqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int sqrt32f(const float* src, float* dst, int len) { + return sqrt>(src, dst, len); +} +int sqrt64f(const double* src, double* dst, int len) { + return sqrt>(src, dst, len); +} + +int invSqrt32f(const float* src, float* dst, int len) { +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. + return invSqrt>(src, dst, len); +#else + return invSqrt>(src, dst, len); +#endif +} +int invSqrt64f(const double* src, double* dst, int len) { +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. 
+ return invSqrt>(src, dst, len); +#else + return invSqrt>(src, dst, len); +#endif +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/svd.hpp b/hal/riscv-rvv/src/core/svd.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/svd.hpp rename to hal/riscv-rvv/src/core/svd.cpp index 2ecad0671e..8454b60a85 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/svd.hpp +++ b/hal/riscv-rvv/src/core/svd.cpp @@ -4,22 +4,17 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED -#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace svd { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_SVD32f -#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD -#undef cv_hal_SVD64f -#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/lapack.cpp, // in the function template static void cv::JacobiSVDImpl_ @@ -268,6 +263,15 @@ inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step, return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags) { + return SVD(src, src_step, w, u, u_step, vt, vt_step, m, n, flags); +} +int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { + return SVD(src, src_step, w, u, u_step, vt, vt_step, m, n, flags); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/transpose.hpp b/hal/riscv-rvv/src/core/transpose.cpp similarity index 71% rename from hal/riscv-rvv/hal_rvv_1p0/transpose.hpp rename to hal/riscv-rvv/src/core/transpose.cpp index 
10bf9b4d3e..9881c3db90 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/transpose.hpp +++ b/hal/riscv-rvv/src/core/transpose.cpp @@ -5,12 +5,7 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED -#define OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { namespace transpose { +#include "rvv_hal.hpp" #if defined (__clang__) && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \ @@ -35,18 +30,22 @@ namespace cv { namespace cv_hal_rvv { namespace transpose { #define __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(i64, 1, v0, v1, v2, v3, v4, v5, v6, v7) #endif +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_8u_8xVl = [](const uchar *src, size_t src_step, uchar *dst, size_t dst_step, const int vl) { + auto transpose_8u_8xVl = [](const uchar *src, size_t sstep, uchar *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle8_v_u8m1(src, vl); - auto v1 = __riscv_vle8_v_u8m1(src + src_step, vl); - auto v2 = __riscv_vle8_v_u8m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle8_v_u8m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle8_v_u8m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle8_v_u8m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle8_v_u8m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle8_v_u8m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle8_v_u8m1(src + sstep, vl); + auto v2 = __riscv_vle8_v_u8m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle8_v_u8m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle8_v_u8m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle8_v_u8m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle8_v_u8m1(src + 6 * sstep, vl); + auto v7 = 
__riscv_vle8_v_u8m1(src + 7 * sstep, vl); vuint8m1x8_t v = __riscv_vcreate_v_u8m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e8(dst, dst_step, v, vl); + __riscv_vssseg8e8(dst, dstep, v, vl); }; int h = 0, w = 0; @@ -72,17 +71,17 @@ static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_da } static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_16u_8xVl = [](const ushort *src, size_t src_step, ushort *dst, size_t dst_step, const int vl) { + auto transpose_16u_8xVl = [](const ushort *src, size_t sstep, ushort *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle16_v_u16m1(src, vl); - auto v1 = __riscv_vle16_v_u16m1(src + src_step, vl); - auto v2 = __riscv_vle16_v_u16m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle16_v_u16m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle16_v_u16m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle16_v_u16m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle16_v_u16m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle16_v_u16m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle16_v_u16m1(src + sstep, vl); + auto v2 = __riscv_vle16_v_u16m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle16_v_u16m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle16_v_u16m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle16_v_u16m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle16_v_u16m1(src + 6 * sstep, vl); + auto v7 = __riscv_vle16_v_u16m1(src + 7 * sstep, vl); vuint16m1x8_t v = __riscv_vcreate_v_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e16(dst, dst_step, v, vl); + __riscv_vssseg8e16(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(ushort); @@ -111,13 +110,13 @@ static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_d } static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_32s_4xVl 
= [](const int *src, size_t src_step, int *dst, size_t dst_step, const int vl) { + auto transpose_32s_4xVl = [](const int *src, size_t sstep, int *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle32_v_i32m1(src, vl); - auto v1 = __riscv_vle32_v_i32m1(src + src_step, vl); - auto v2 = __riscv_vle32_v_i32m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle32_v_i32m1(src + 3 * src_step, vl); + auto v1 = __riscv_vle32_v_i32m1(src + sstep, vl); + auto v2 = __riscv_vle32_v_i32m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle32_v_i32m1(src + 3 * sstep, vl); vint32m1x4_t v = __riscv_vcreate_v_i32m1x4(v0, v1, v2, v3); - __riscv_vssseg4e32(dst, dst_step, v, vl); + __riscv_vssseg4e32(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(int); @@ -146,17 +145,17 @@ static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_d } static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_64s_8xVl = [](const int64_t *src, size_t src_step, int64_t *dst, size_t dst_step, const int vl) { + auto transpose_64s_8xVl = [](const int64_t *src, size_t sstep, int64_t *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle64_v_i64m1(src, vl); - auto v1 = __riscv_vle64_v_i64m1(src + src_step, vl); - auto v2 = __riscv_vle64_v_i64m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle64_v_i64m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle64_v_i64m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle64_v_i64m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle64_v_i64m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle64_v_i64m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle64_v_i64m1(src + sstep, vl); + auto v2 = __riscv_vle64_v_i64m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle64_v_i64m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle64_v_i64m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle64_v_i64m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle64_v_i64m1(src + 6 * sstep, 
vl); + auto v7 = __riscv_vle64_v_i64m1(src + 7 * sstep, vl); vint64m1x8_t v = __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e64(dst, dst_step, v, vl); + __riscv_vssseg8e64(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(int64_t); @@ -184,12 +183,9 @@ static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst } } -#undef cv_hal_transpose2d -#define cv_hal_transpose2d cv::cv_hal_rvv::transpose::transpose2d - using Transpose2dFunc = void (*)(const uchar*, size_t, uchar*, size_t, int, int); -inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int src_width, int src_height, int element_size) { +int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int src_width, int src_height, int element_size) { if (src_data == dst_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -215,6 +211,6 @@ inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::transpose +#endif // CV_HAL_RVV_1P0_ENABLED -#endif // OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp b/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp new file mode 100644 index 0000000000..0756f2e6c0 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp @@ -0,0 +1,361 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp +// in the functor BilateralFilter_8u_Invoker +static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) +{ + constexpr int align = 31; + std::vector _sum(width + align), _wsum(width + align); + float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const uchar* sptr = src_data + (i+radius) * src_step + radius; + memset(sum, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const uchar* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto src = __riscv_vle8_v_u8m2(sptr + j, vl); + auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl); + auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl); + auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto dst = __riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl); + 
} + } + + return CV_HAL_ERROR_OK; +} + +static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) +{ + constexpr int align = 31; + std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); + float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); + float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); + float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const uchar* sptr = src_data + (i+radius) * src_step + radius*3; + memset(sum_b, 0, sizeof(float) * width); + memset(sum_g, 0, sizeof(float) * width); + memset(sum_r, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const uchar* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); + auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); + auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); + src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl); + auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); + auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); + auto ksrc2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); + + auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl); + auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl); + auto diff2 = __riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl); + auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, 
vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl); + __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl); + __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl); + vuint8m2x3_t dst{}; + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl)); + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl)); + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp +// in the functor BilateralFilter_32f_Invoker +static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) +{ + constexpr int align = 31; + std::vector _sum(width + align), _wsum(width + align); + float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); + float* wsum = 
reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius; + memset(sum, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto src = __riscv_vle32_v_f32m4(sptr + j, vl); + auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl); + auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl); + auto idx = __riscv_vfcvt_rtz_x(diff, vl); + auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); + + auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl); + auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m4(wsum + j, vl), vl), vl); + __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto src = __riscv_vle32_v_f32m4(sptr + j, vl); + auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl); + __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) +{ + constexpr int align = 31; + std::vector 
_sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); + float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); + float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); + float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius*3; + memset(sum_b, 0, sizeof(float) * width); + memset(sum_g, 0, sizeof(float) * width); + memset(sum_r, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl); + auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl); + auto idx = __riscv_vfcvt_rtz_x(diff, vl); + auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); + + auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl); + auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + 
__riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl); + __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl); + __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + j, vl), vl), vl); + __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl); + auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl)); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl)); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl)); + __riscv_vsseg3e32(reinterpret_cast(dst_data + i * dst_step) + j * 3, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp +// in the function static void bilateralFilter_8u and bilateralFilter_32f +int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type) +{ + const int type = CV_MAKETYPE(depth, cn); + if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && type != CV_32FC3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (type == CV_32FC1 && width * height > 1 << 20) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_data == dst_data || 
border_type & BORDER_ISOLATED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + sigma_color = sigma_color <= 0 ? 1 : sigma_color; + sigma_space = sigma_space <= 0 ? 1 : sigma_space; + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2; + radius = std::max(radius, 1); + d = radius*2 + 1; + + const int size = depth == CV_32F ? cn * sizeof(float) : cn; + const int temp_step = (width + radius * 2) * size; + std::vector _temp((width + radius * 2) * (height + radius * 2) * size, 0); + uchar* temp = _temp.data(); + std::vector width_interpolate(radius * 2); + for (int j = 0; j < radius; j++) + { + width_interpolate[j] = common::borderInterpolate(j - radius, width, border_type); + width_interpolate[j + radius] = common::borderInterpolate(width + j, width, border_type); + } + for (int i = 0; i < height + radius * 2; i++) + { + int x = common::borderInterpolate(i - radius, height, border_type); + if (x != -1) + { + for (int j = 0; j < radius; j++) + { + int y = width_interpolate[j]; + if (y != -1) + memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size); + y = width_interpolate[j + radius]; + if (y != -1) + memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size); + } + memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size); + } + } + + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* space_weight = _space_weight.data(); + int* space_ofs = _space_ofs.data(); + int maxk = 0; + for (int i = -radius; i <= radius; i++) + { + for (int j = -radius; j <= radius; j++) + { + double r = std::sqrt((double)i*i + (double)j*j); + if (r <= radius && (depth == CV_8U || i != 0 || j != 0)) + { + space_weight[maxk] = static_cast(r*r*gauss_space_coeff); + space_ofs[maxk++] = (i * (temp_step / size) + j) * cn; + } + } + } + 
cv::rvv_hal::core::exp32f(space_weight, space_weight, maxk); + + if (depth == CV_8U) + { + std::vector _color_weight(cn*256); + float* color_weight = _color_weight.data(); + for (int i = 0; i < 256*cn; i++) + color_weight[i] = static_cast(i*i*gauss_color_coeff); + cv::rvv_hal::core::exp32f(color_weight, color_weight, 256*cn); + + switch (cn) + { + case 1: + return common::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); + case 3: + return common::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); + } + } + else + { + double minValSrc = -1, maxValSrc = 1; + cv::rvv_hal::core::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr); + if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) + { + for (int i = 0; i < width; i++) + memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size); + return CV_HAL_ERROR_OK; + } + + const int kExpNumBinsPerChannel = 1 << 12; + const int kExpNumBins = kExpNumBinsPerChannel * cn; + const float scale_index = kExpNumBins / static_cast((maxValSrc - minValSrc) * cn); + std::vector _expLUT(kExpNumBins+2, 0); + float* expLUT = _expLUT.data(); + for (int i = 0; i < kExpNumBins+2; i++) + { + double val = i / scale_index; + expLUT[i] = static_cast(val * val * gauss_color_coeff); + } + cv::rvv_hal::core::exp32f(expLUT, expLUT, kExpNumBins+2); + + switch (cn) + { + case 1: + return common::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); + case 3: + return common::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); + } + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc 
diff --git a/hal/riscv-rvv/src/imgproc/box_filter.cpp b/hal/riscv-rvv/src/imgproc/box_filter.cpp new file mode 100644 index 0000000000..8a91ef57bb --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/box_filter.cpp @@ -0,0 +1,392 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +template struct rvv; +template<> struct rvv +{ + static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); } + static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); } + static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); } + static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); } + static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; } + static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { return a; } + static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); } +}; + +// the algorithm is same as 
cv_hal_sepFilter +template +static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + using T = typename helperT::ElemType; + using WT = typename helperWT::ElemType; + + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + std::vector res(width * ksize); + auto process = [&](int x, int y) { + WT sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += reinterpret_cast(src_data + x * src_step)[p]; + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = anchor_x, right = width - (ksize - 1 - anchor_x); + for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = helperT::setvl(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - anchor_x; + auto src = rvv::vcvt0(helperT::vload(extra, vl), vl); + + extra += vl; + auto sum = src; + src = helperWT::vslide1down(src, extra[0], vl); + sum = helperWT::vadd(sum, src, vl); + src = helperWT::vslide1down(src, extra[1], vl); + sum = helperWT::vadd(sum, src, vl); + if (ksize == 5) + { + src = helperWT::vslide1down(src, 
extra[2], vl); + sum = helperWT::vadd(sum, src, vl); + src = helperWT::vslide1down(src, extra[3], vl); + sum = helperWT::vadd(sum, src, vl); + } + helperWT::vstore(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - (ksize - 1 - anchor_y); + if (cur >= start) + { + const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const WT* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = helperWT::setvl(width - j); + auto sum = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); + if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl); + if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl); + if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl); + if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl); + if (normalize) sum = rvv::vdiv(sum, ksize * ksize, vl); + + if (cast) + { + helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, rvv::vcvt1(sum, vl), vl); + } + else + { + helperWT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - anchor_y, 
full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; }; + + std::vector res(width * ksize * 3); + auto process = [&](int x, int y) { + float sum0, sum1, sum2; + sum0 = sum1 = sum2 = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum0 += reinterpret_cast(src_data + x * src_step)[p * 3 ]; + sum1 += reinterpret_cast(src_data + x * src_step)[p * 3 + 1]; + sum2 += reinterpret_cast(src_data + x * src_step)[p * 3 + 2]; + } + } + res[p2idx(x, y) ] = sum0; + res[p2idx(x, y) + 1] = sum1; + res[p2idx(x, y) + 2] = sum2; + }; + + const int left = anchor_x, right = width - (ksize - 1 - anchor_x); + for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e32m2(right - j); + const float* extra = reinterpret_cast(src_data + i * src_step) + (j - anchor_x) * 3; + auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + extra += vl * 3; + auto sum0 = src0, sum1 = src1, sum2 = src2; + src0 = __riscv_vfslide1down(src0, extra[0], vl); + src1 = __riscv_vfslide1down(src1, extra[1], vl); + src2 = __riscv_vfslide1down(src2, extra[2], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + src0 = __riscv_vfslide1down(src0, 
extra[3], vl); + src1 = __riscv_vfslide1down(src1, extra[4], vl); + src2 = __riscv_vfslide1down(src2, extra[5], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + if (ksize == 5) + { + src0 = __riscv_vfslide1down(src0, extra[6], vl); + src1 = __riscv_vfslide1down(src1, extra[7], vl); + src2 = __riscv_vfslide1down(src2, extra[8], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + src0 = __riscv_vfslide1down(src0, extra[ 9], vl); + src1 = __riscv_vfslide1down(src1, extra[10], vl); + src2 = __riscv_vfslide1down(src2, extra[11], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + } + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); + __riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl); + } + } + } + + int cur = i - (ksize - 1 - anchor_y); + if (cur >= start) + { + const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const float* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + vfloat32m2_t sum0, sum1, sum2; + sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl); + auto loadres = [&](const float* row) { + if (!row) return; + auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl); + sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl); + sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl); + sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl); + }; + loadres(row0); + loadres(row1); + loadres(row2); + loadres(row3); + loadres(row4); + if (normalize) + { + sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl); + sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl); + sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl); + } + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); + __riscv_vsseg3e32(reinterpret_cast(dst_data + cur * dst_step) + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn); + if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + uchar* _dst_data = dst_data; + size_t _dst_step = dst_step; + const size_t size = CV_ELEM_SIZE(dst_type); + std::vector dst; + if (src_data == _dst_data) + { + dst = std::vector(width * height * 
size); + dst_data = dst.data(); + dst_step = width * size; + } + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? ksize_height / 2 : anchor_y; + if (src_type != dst_type) + { + if (src_type == CV_8UC1 && dst_type == CV_16UC1) + { + if (ksize_width == 3) + { + res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + } + if (ksize_width == 5) + { + res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + } + } + } + else + { + switch (ksize_width*100 + src_type) + { + case 300 + CV_8UC1: + res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_8UC1: + res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_16SC1: + res = common::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_16SC1: + res = common::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, 
width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32SC1: + res = common::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32SC1: + res = common::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32FC1: + res = common::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32FC1: + res = common::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32FC3: + res = common::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32FC3: + res = common::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + } + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if 
(src_data == _dst_data) + { + for (int i = 0; i < height; i++) + memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); + } + + return res; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/color.hpp b/hal/riscv-rvv/src/imgproc/color.cpp similarity index 90% rename from hal/riscv-rvv/hal_rvv_1p0/color.hpp rename to hal/riscv-rvv/src/imgproc/color.cpp index c715c6ad38..1b7ee0a4d3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/color.hpp +++ b/hal/riscv-rvv/src/imgproc/color.cpp @@ -4,12 +4,12 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_COLOR_HPP_INCLUDED -#define OPENCV_HAL_RVV_COLOR_HPP_INCLUDED +#include "rvv_hal.hpp" +#include -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED namespace color { class ColorInvoker : public ParallelLoopBody @@ -41,11 +41,9 @@ namespace color { { return val - std::remainder(val, 1.0); } -} // cv::cv_hal_rvv::color +} // cv::rvv_hal::color namespace BGRtoBGR { -#undef cv_hal_cvtBGRtoBGR -#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::BGRtoBGR::cvtBGRtoBGR template struct rvv; template<> struct rvv @@ -206,27 +204,26 @@ static inline int cvtBGRtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) +} // BGRtoBGR + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) { if ((scn != 3 && scn != 4) || (dcn != 3 && dcn != 4)) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return 
BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); case CV_16U: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); case CV_32F: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoBGR namespace GraytoBGR { -#undef cv_hal_cvtGraytoBGR -#define cv_hal_cvtGraytoBGR cv::cv_hal_rvv::GraytoBGR::cvtGraytoBGR template struct rvv; template<> struct rvv @@ -337,27 +334,26 @@ static inline int cvtGraytoBGR(int start, int end, const T * src, size_t src_ste return CV_HAL_ERROR_OK; } -inline int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn) +} // GraytoBGR + +int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); case CV_16U: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, 
dcn); case CV_32F: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::GraytoBGR namespace BGRtoGray { -#undef cv_hal_cvtBGRtoGray -#define cv_hal_cvtBGRtoGray cv::cv_hal_rvv::BGRtoGray::cvtBGRtoGray template struct rvv; template<> struct rvv @@ -462,27 +458,26 @@ static inline int cvtBGRtoGray(int start, int end, const T * src, size_t src_ste return CV_HAL_ERROR_OK; } -inline int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) +} // BGRtoGray + +int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, 
reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoGray namespace BGR5x5toBGR { -#undef cv_hal_cvtBGR5x5toBGR -#define cv_hal_cvtBGR5x5toBGR cv::cv_hal_rvv::BGR5x5toBGR::cvtBGR5x5toBGR // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB5x52RGB @@ -540,18 +535,17 @@ static inline int cvtBGR5x5toBGR_u(int start, int end, const ushort * src, size_ return CV_HAL_ERROR_OK; } -inline int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits) +} // BGR5x5toBGR + +int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits) { if ((dcn != 3 && dcn != 4) || (greenBits != 5 && greenBits != 6)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGR5x5toBGR_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, greenBits); + return color::invoke(width, height, {BGR5x5toBGR::cvtBGR5x5toBGR_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, greenBits); } -} // cv::cv_hal_rvv::BGR5x5toBGR namespace BGRtoBGR5x5 { -#undef cv_hal_cvtBGRtoBGR5x5 -#define cv_hal_cvtBGRtoBGR5x5 cv::cv_hal_rvv::BGRtoBGR5x5::cvtBGRtoBGR5x5 // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB2RGB5x5 @@ -604,18 +598,17 @@ static inline int cvtBGRtoBGR5x5_u(int start, int end, const uchar * src, size_t return CV_HAL_ERROR_OK; } -inline int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits) +} // BGRtoBGR5x5 + +int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, 
bool swapBlue, int greenBits) { if ((scn != 3 && scn != 4) || (greenBits != 5 && greenBits != 6)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGRtoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, greenBits); + return color::invoke(width, height, {BGRtoBGR5x5::cvtBGRtoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, greenBits); } -} // cv::cv_hal_rvv::BGRtoBGR5x5 namespace BGR5x5toGray { -#undef cv_hal_cvtBGR5x5toGray -#define cv_hal_cvtBGR5x5toGray cv::cv_hal_rvv::BGR5x5toGray::cvtBGR5x5toGray // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB5x52Gray @@ -654,18 +647,17 @@ static inline int cvtBGR5x5toGray_u(int start, int end, const ushort * src, size return CV_HAL_ERROR_OK; } -inline int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) +} // BGR5x5toGray + +int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) { if (greenBits != 5 && greenBits != 6) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGR5x5toGray_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); + return color::invoke(width, height, {BGR5x5toGray::cvtBGR5x5toGray_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); } -} // cv::cv_hal_rvv::BGR5x5toGray namespace GraytoBGR5x5 { -#undef cv_hal_cvtGraytoBGR5x5 -#define cv_hal_cvtGraytoBGR5x5 cv::cv_hal_rvv::GraytoBGR5x5::cvtGraytoBGR5x5 // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct Gray2RGB5x5 @@ -697,18 +689,17 @@ static inline int cvtGraytoBGR5x5_u(int start, int end, const uchar * src, size_ return CV_HAL_ERROR_OK; } -inline int 
cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) +} // GraytoBGR5x5 + +int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) { if (greenBits != 5 && greenBits != 6) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtGraytoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); + return color::invoke(width, height, {GraytoBGR5x5::cvtGraytoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); } -} // cv::cv_hal_rvv::GraytoBGR5x5 namespace YUVtoBGR { -#undef cv_hal_cvtYUVtoBGR -#define cv_hal_cvtYUVtoBGR cv::cv_hal_rvv::YUVtoBGR::cvtYUVtoBGR template struct rvv; template<> struct rvv @@ -857,27 +848,26 @@ static inline int cvtYUVtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) +} // YUVtoBGR + +int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); case CV_16U: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, 
{YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); case CV_32F: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::YUVtoBGR namespace BGRtoYUV { -#undef cv_hal_cvtBGRtoYUV -#define cv_hal_cvtBGRtoYUV cv::cv_hal_rvv::BGRtoYUV::cvtBGRtoYUV template struct rvv; template<> struct rvv @@ -1027,31 +1017,26 @@ static inline int cvtBGRtoYUV(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) +} // BGRtoYUV + +int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); case CV_16U: - return color::invoke(width, height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); case CV_32F: - return color::invoke(width, 
height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoYUV namespace PlaneYUVtoBGR { -#undef cv_hal_cvtOnePlaneYUVtoBGR -#define cv_hal_cvtOnePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtOnePlaneYUVtoBGR -#undef cv_hal_cvtTwoPlaneYUVtoBGR -#define cv_hal_cvtTwoPlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtTwoPlaneYUVtoBGR -#undef cv_hal_cvtThreePlaneYUVtoBGR -#define cv_hal_cvtThreePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtThreePlaneYUVtoBGR static const int ITUR_BT_601_SHIFT = 20; static const int ITUR_BT_601_CY = 1220542; @@ -1241,22 +1226,24 @@ static inline int cvtMultiPlaneYUVtoBGR(int start, int end, uchar * dst_data, si return CV_HAL_ERROR_OK; } -inline int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx) +} // PlaneYUVtoBGR + +int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(dst_width, dst_height, {cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx); + return color::invoke(dst_width, dst_height, {PlaneYUVtoBGR::cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx); } -inline int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) +int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * 
dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; const uchar* uv = src_data + src_step * static_cast(dst_height); - return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx); + return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx); } -inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) +int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1267,17 +1254,10 @@ inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar int vstepIdx = dst_height % 4 == 2 ? 
1 : 0; if (uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } - return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1); + return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1); } -} // cv::cv_hal_rvv::PlaneYUVtoBGR namespace PlaneBGRtoYUV { -#undef cv_hal_cvtOnePlaneBGRtoYUV -#define cv_hal_cvtOnePlaneBGRtoYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtOnePlaneBGRtoYUV -#undef cv_hal_cvtBGRtoTwoPlaneYUV -#define cv_hal_cvtBGRtoTwoPlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoTwoPlaneYUV -#undef cv_hal_cvtBGRtoThreePlaneYUV -#define cv_hal_cvtBGRtoThreePlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoThreePlaneYUV static const int ITUR_BT_601_SHIFT = 20; static const int ITUR_BT_601_CBY = 102760; // 0.114035 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT) @@ -1512,35 +1492,34 @@ static inline int cvtBGRtoMultiPlaneYUV(int start, int end, uchar * yData, uchar return CV_HAL_ERROR_OK; } -inline int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx) +} // PlaneBGRtoYUV + +int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx); + return color::invoke(width, height, {PlaneBGRtoYUV::cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx); } -inline int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, +int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t 
src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx) { if (y_step != uv_step || (scn != 3 && scn != 4)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2); + return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2); } -inline int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) +int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; uchar* uv_data = dst_data + dst_step * static_cast(height); - return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 3 : 2); + return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 
3 : 2); } -} // cv::cv_hal_rvv::PlaneBGRtoYUV namespace HSVtoBGR { -#undef cv_hal_cvtHSVtoBGR -#define cv_hal_cvtHSVtoBGR cv::cv_hal_rvv::HSVtoBGR::cvtHSVtoBGR template static inline int cvtHSVtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isFullRange, bool isHSV); @@ -1710,25 +1689,24 @@ inline int cvtHSVtoBGR(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) +} // HSVtoBGR + +int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); case CV_32F: - return color::invoke(width, height, {cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::HSVtoBGR namespace BGRtoHSV { -#undef cv_hal_cvtBGRtoHSV -#define cv_hal_cvtBGRtoHSV cv::cv_hal_rvv::BGRtoHSV::cvtBGRtoHSV template static inline int cvtBGRtoHSV(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int scn, bool 
swapBlue, bool isFullRange, bool isHSV); @@ -1870,25 +1848,24 @@ inline int cvtBGRtoHSV(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) +} // BGRtoHSV + +int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); case CV_32F: - return color::invoke(width, height, {cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoHSV namespace XYZtoBGR { -#undef cv_hal_cvtXYZtoBGR -#define cv_hal_cvtXYZtoBGR cv::cv_hal_rvv::XYZtoBGR::cvtXYZtoBGR template struct rvv; template<> struct rvv @@ -2042,27 +2019,26 @@ static inline int cvtXYZtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue) +} // XYZtoBGR + +int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int 
width, int height, int depth, int dcn, bool swapBlue) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::XYZtoBGR namespace BGRtoXYZ { -#undef cv_hal_cvtBGRtoXYZ -#define cv_hal_cvtBGRtoXYZ cv::cv_hal_rvv::BGRtoXYZ::cvtBGRtoXYZ template struct rvv; template<> struct rvv @@ -2209,23 +2185,24 @@ static inline int cvtBGRtoXYZ(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) +} // BGRtoXYZ + +int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, 
reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoXYZ namespace LabTable { @@ -2495,11 +2472,9 @@ namespace LabTable return __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(__riscv_vget_v_f32m2x4_f32m2(val, 3), x, __riscv_vget_v_f32m2x4_f32m2(val, 2), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 1), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 0), vl); } }; -} // cv::cv_hal_rvv::LabTable +} // cv::rvv_hal::imgproc::LabTable namespace LabtoBGR { -#undef cv_hal_cvtLabtoBGR -#define cv_hal_cvtLabtoBGR cv::cv_hal_rvv::LabtoBGR::cvtLabtoBGR template static inline int cvtLabtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isLab, bool srgb); @@ -2713,25 +2688,24 @@ inline int cvtLabtoBGR(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb) +} // LabtoBGR + +int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * 
dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); + return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); case CV_32F: - return color::invoke(width, height, {cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); + return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::LabtoBGR namespace BGRtoLab { -#undef cv_hal_cvtBGRtoLab -#define cv_hal_cvtBGRtoLab cv::cv_hal_rvv::BGRtoLab::cvtBGRtoLab struct rvv_base { @@ -3060,31 +3034,126 @@ static inline int cvtBGRtoLab_f(int start, int end, const float * src, size_t sr return CV_HAL_ERROR_OK; } -inline int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb) +} // BGRtoLab + +int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - auto cvtBGRtoLab_b = cvtBGRtoLab_u; + auto cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; if (!isLab && !srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; else if (!isLab && srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; else if (isLab && !srgb) - cvtBGRtoLab_b = 
cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; switch (depth) { case CV_8U: return color::invoke(width, height, {cvtBGRtoLab_b}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoLab_f}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isLab, srgb); + return color::invoke(width, height, {BGRtoLab::cvtBGRtoLab_f}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isLab, srgb); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoLab -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +#if CV_HAL_RVV_071_ENABLED + +static const unsigned char index_array_32 [32] + { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 }; + +static const unsigned char index_array_24 [24] + { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 }; + +static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize) +{ + vuint8m2_t vec_index = vle8_v_u8m2(index, vsize); + + int i = 0; + + for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize) + { + vuint8m2_t vec_src = vle8_v_u8m2(src, vsize); + vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize); + vse8_v_u8m2(dst, vec_dst, vsize); + } + + for ( ; i < n; i++, src += scn, dst += dcn ) + { + unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; + dst[2] = t0; + dst[1] = t1; + dst[0] = t2; + if(dcn == 4) + { + unsigned char d = src[3]; + dst[3] = d; + } + } +} + +static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi) +{ + for (int i = 0; i < n; i++, src += scn, dst += dcn) + { + unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; + dst[bi ] = t0; + dst[1] = t1; 
+ dst[bi^2] = t2; + if(dcn == 4) + { + unsigned char d = scn == 4 ? src[3] : std::numeric_limits::max(); + dst[3] = d; + } + } +} + +int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) +{ + if (depth != CV_8U) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int blueIdx = swapBlue ? 2 : 0; + if (scn == dcn) + { + if (!swapBlue) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int vsize_pixels = 8; + + if (scn == 4) + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + { + vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32); + } + } + else + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + { + vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24); + } + } + } + else + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx); + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_071_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/common.hpp b/hal/riscv-rvv/src/imgproc/common.hpp new file mode 100644 index 0000000000..819b43421c --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/common.hpp @@ -0,0 +1,76 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Third party copyrights are property of their respective owners. 
+ +#ifndef OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED +#define OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED + +#include "opencv2/core/hal/interface.h" +#include "opencv2/imgproc/hal/interface.h" + +namespace cv { namespace rvv_hal { namespace imgproc { namespace common { + +inline int borderInterpolate( int p, int len, int borderType ) +{ + if ((unsigned)p < (unsigned)len) + ; + else if (borderType == CV_HAL_BORDER_REPLICATE) + p = p < 0 ? 0 : len - 1; + else if (borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101) + { + int delta = borderType == CV_HAL_BORDER_REFLECT_101; + if (len == 1) + return 0; + do + { + if (p < 0) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } + while( (unsigned)p >= (unsigned)len ); + } + else if (borderType == CV_HAL_BORDER_WRAP) + { + if (p < 0) + p -= ((p-len+1)/len)*len; + if (p >= len) + p %= len; + } + else if (borderType == CV_HAL_BORDER_CONSTANT) + p = -1; + return p; +} + +class FilterInvoker : public ParallelLoopBody +{ +public: + template + FilterInvoker(std::function _func, Args&&... args) + { + func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); + } + + virtual void operator()(const Range& range) const override + { + func(range.start, range.end); + } + +private: + std::function func; +}; + +template +inline int invoke(int height, std::function func, Args&&... args) +{ + cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); + return func(0, 1, std::forward(args)...); +} + +}}}} // cv::rvv_hal::imgproc::common + +#endif // OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED diff --git a/hal/riscv-rvv/src/imgproc/filter.cpp b/hal/riscv-rvv/src/imgproc/filter.cpp new file mode 100644 index 0000000000..f23b56e01d --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/filter.cpp @@ -0,0 +1,264 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct Filter2D +{ + const uchar* kernel_data; + size_t kernel_step; + int kernel_type; + int kernel_width; + int kernel_height; + int src_type; + int dst_type; + int borderType; + double delta; + int anchor_x; + int anchor_y; +}; + +static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst) +{ + int vl; + for (int i = left; i < right; i += vl) + { + vl = __riscv_vsetvl_e8m1(right - i); + auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); + + auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { + a = __riscv_vfmacc(a, k0, b, vl); + b = __riscv_vfslide1down(b, r1, vl); + a = __riscv_vfmacc(a, k1, b, vl); + b = __riscv_vfslide1down(b, r2, vl); + return __riscv_vfmacc(a, k2, b, vl); + }; + auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { + if (!row) return; + + const uchar* extra = row + (i - anchor) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); + auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); + auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); + auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); + + extra += vl * 4; + s0 = addshift(s0, v0, k0, k1, k2, extra[0], 
extra[4]); + s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]); + s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]); + s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + vuint8m1x4_t val{}; + val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst + i * 4, val, vl); + } +} + +static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst) +{ + int vl; + for (int i = left; i < right; i += vl) + { + vl = __riscv_vsetvl_e8m1(right - i); + auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); + + auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { + a = __riscv_vfmacc(a, k0, b, vl); + b = __riscv_vfslide1down(b, r1, vl); + a = __riscv_vfmacc(a, k1, b, vl); + b = __riscv_vfslide1down(b, r2, vl); + a = __riscv_vfmacc(a, k2, b, vl); + b = __riscv_vfslide1down(b, r3, vl); + a = __riscv_vfmacc(a, k3, b, vl); + b = __riscv_vfslide1down(b, r4, vl); + return __riscv_vfmacc(a, k4, b, vl); + }; + auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { + if (!row) return; + + const uchar* extra = row + (i - anchor) * 4; + 
auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); + auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); + auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); + auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); + + extra += vl * 4; + s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]); + s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]); + s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]); + s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]); + }; + + loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]); + loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]); + loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]); + loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]); + loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]); + vuint8m1x4_t val{}; + val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst + i * 4, val, vl); + } +} + +// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp, +// in the function void CAROTENE_NS::convolution +template +static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int 
full_height, int offset_x, int offset_y) +{ + float kernel[ksize * ksize]; + for (int i = 0; i < ksize * ksize; i++) + { + kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; + } + + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x, int y) { + int pi, pj; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + pj = pj < 0 ? noval : pj; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? noval : pj - offset_x; + } + return std::make_pair(pi, pj); + }; + + auto process = [&](int x, int y) { + float sum0, sum1, sum2, sum3; + sum0 = sum1 = sum2 = sum3 = data->delta; + for (int i = 0; i < ksize * ksize; i++) + { + auto p = access(x + i / ksize, y + i % ksize); + if (p.first != noval && p.second != noval) + { + sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; + sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; + sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2]; + sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3]; + } + } + dst_data[(x * width + y) * 4 ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits::max())); + }; + + const int left = 
data->anchor_x, right = width - (ksize - 1 - data->anchor_x); + for (int i = start; i < end; i++) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; + if (ksize == 3) + { + process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); + } + else + { + const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; + const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step; + process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/) +{ + if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != 3 && kernel_width != 5) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; + *context = reinterpret_cast(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); + return CV_HAL_ERROR_OK; +} + +int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + Filter2D* data = reinterpret_cast(context); + std::vector dst(width * height * 4); + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernel_width) + { + case 3: + res = common::invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + break; + case 5: + res = common::invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + break; + } + + for (int i = 0; i < height; i++) + memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4); + return res; +} + +int filterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp b/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp new file mode 100644 index 0000000000..495efa4ee7 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp @@ -0,0 +1,389 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is same as cv_hal_sepFilter +template +static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) +{ + using T = typename helperT::ElemType; + using WT = typename helperWT::ElemType; + + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); // [TODO] fix dependencies + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; + std::vector res(width * ksize); + auto process = [&](int x, int y) { + WT sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += kernel[ksize == 5][i] * static_cast(reinterpret_cast(src_data + x * src_step)[p]); + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = ksize / 2, right = width - ksize / 2; + for (int i = start - ksize / 2; i < end + ksize / 2; i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = helperT::setvl(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - ksize / 2; + auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl); + + extra += vl; + 
auto sum = src; + if (ksize == 3) + { + src = __riscv_vslide1down(src, extra[0], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl); + src = __riscv_vslide1down(src, extra[1], vl); + sum = __riscv_vadd(sum, src, vl); + } + else + { + src = __riscv_vslide1down(src, extra[0], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); + src = __riscv_vslide1down(src, extra[1], vl); + sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl); + src = __riscv_vslide1down(src, extra[2], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); + src = __riscv_vslide1down(src, extra[3], vl); + sum = __riscv_vadd(sum, src, vl); + } + helperWT::vstore(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - ksize / 2; + if (cur >= start) + { + const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const WT* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = helperWT::setvl(width - j); + auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); + auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl); + auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl); + typename helperWT::VecType sum; + if (ksize == 3) + { + sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl); + } + else + { + sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl); + auto v3 = row3 ? 
helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl); + sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl); + auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl); + sum = __riscv_vadd(sum, v4, vl); + } + helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); + return pj < 0 ? 
noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; }; + + constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; + std::vector res(width * ksize * 4); + auto process = [&](int x, int y) { + ushort sum0, sum1, sum2, sum3; + sum0 = sum1 = sum2 = sum3 = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum0 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 ]); + sum1 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 1]); + sum2 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 2]); + sum3 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 3]); + } + } + res[p2idx(x, y) ] = sum0; + res[p2idx(x, y) + 1] = sum1; + res[p2idx(x, y) + 2] = sum2; + res[p2idx(x, y) + 3] = sum3; + }; + + const int left = ksize / 2, right = width - ksize / 2; + for (int i = start - ksize / 2; i < end + ksize / 2; i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m1(right - j); + const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl); + auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl); + auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl); + auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl); + + extra += vl * 4; + auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3; + if (ksize == 3) + { + src0 = __riscv_vslide1down(src0, extra[0], vl); + src1 = __riscv_vslide1down(src1, extra[1], vl); + src2 = __riscv_vslide1down(src2, 
extra[2], vl); + src3 = __riscv_vslide1down(src3, extra[3], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl); + src0 = __riscv_vslide1down(src0, extra[4], vl); + src1 = __riscv_vslide1down(src1, extra[5], vl); + src2 = __riscv_vslide1down(src2, extra[6], vl); + src3 = __riscv_vslide1down(src3, extra[7], vl); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + else + { + src0 = __riscv_vslide1down(src0, extra[0], vl); + src1 = __riscv_vslide1down(src1, extra[1], vl); + src2 = __riscv_vslide1down(src2, extra[2], vl); + src3 = __riscv_vslide1down(src3, extra[3], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + src0 = __riscv_vslide1down(src0, extra[4], vl); + src1 = __riscv_vslide1down(src1, extra[5], vl); + src2 = __riscv_vslide1down(src2, extra[6], vl); + src3 = __riscv_vslide1down(src3, extra[7], vl); + sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl); + src0 = __riscv_vslide1down(src0, extra[ 8], vl); + src1 = __riscv_vslide1down(src1, extra[ 9], vl); + src2 = __riscv_vslide1down(src2, extra[10], vl); + src3 = __riscv_vslide1down(src3, extra[11], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); 
+ sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + src0 = __riscv_vslide1down(src0, extra[12], vl); + src1 = __riscv_vslide1down(src1, extra[13], vl); + src2 = __riscv_vslide1down(src2, extra[14], vl); + src3 = __riscv_vslide1down(src3, extra[15], vl); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + + vuint16m2x4_t dst{}; + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3); + __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl); + } + } + } + + int cur = i - ksize / 2; + if (cur >= start) + { + const ushort* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const ushort* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e16m2(width - j); + vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{}; + sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl); + + auto loadres = [&](const ushort* row) { + auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl); + src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0); + src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1); + src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2); + src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3); + }; + if (row0) + { + loadres(row0); + sum0 = src0; + sum1 = src1; + sum2 = src2; + sum3 = src3; + } + if (row1) + { + loadres(row1); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 2 : 1, vl), vl); + } + if (row2) + { + loadres(row2); + if (ksize == 5) + { + src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl); + src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl); + src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl); + src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl); + } + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + if (row3) + { + loadres(row3); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + } + if (row4) + { + loadres(row4); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = 
__riscv_vadd(sum3, src3, vl); + } + + vuint8m1x4_t dst{}; + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) +{ + const int type = CV_MAKETYPE(depth, cn); + if ((type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1) || src_data == dst_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((ksize != 3 && ksize != 5) || border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (ksize*100 + type) + { + case 300 + CV_8UC1: + return common::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_8UC1: + return common::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 300 + CV_16UC1: + return common::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + 
CV_16UC1: + return common::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 300 + CV_8UC4: + return common::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_8UC4: + return common::invoke(height, {gaussianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp b/hal/riscv-rvv/src/imgproc/histogram.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/histogram.hpp rename to hal/riscv-rvv/src/imgproc/histogram.cpp index 48f6123b0d..fd6adc3be3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp +++ b/hal/riscv-rvv/src/imgproc/histogram.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED -#define OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace equalize_hist { -#undef cv_hal_equalize_hist -#define cv_hal_equalize_hist cv::cv_hal_rvv::equalize_hist::equalize_hist +namespace { class HistogramInvoker : public ParallelLoopBody { @@ -77,9 +74,11 @@ static inline void lut_invoke(int start, int end, const uchar* src_data, size_t } } +} // anonymous + // the algorithm is copied from imgproc/src/histogram.cpp, // in the function void cv::equalizeHist -inline int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) { int hist[HIST_SZ] = {0}; uchar lut[HIST_SZ]; @@ -101,8 +100,7 @@ inline int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data return CV_HAL_ERROR_OK; } -} // cv::cv_hal_rvv::equalize_hist -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/integral.hpp b/hal/riscv-rvv/src/imgproc/integral.cpp similarity index 92% rename from hal/riscv-rvv/hal_rvv_1p0/integral.hpp rename to hal/riscv-rvv/src/imgproc/integral.cpp index a3ea0b5557..e0c7f44995 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/integral.hpp +++ b/hal/riscv-rvv/src/imgproc/integral.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED -#define OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "types.hpp" +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -#undef cv_hal_integral -#define cv_hal_integral cv::cv_hal_rvv::integral +namespace { template inline typename vec_t::VecType repeat_last_n(typename vec_t::VecType vs, int n, size_t vl) { @@ -87,6 +84,8 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz return result; } +} // anonymous + /** @brief Calculate integral image @param depth Depth of source image @@ -119,12 +118,12 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz CV_32F | CV_64F | CV_64F CV_64F | CV_64F | CV_64F */ -inline int integral(int depth, int sdepth, int sqdepth, - const uchar* src_data, size_t src_step, - uchar* sum_data, size_t sum_step, - uchar* sqsum_data, size_t sqsum_step, - uchar* tilted_data, [[maybe_unused]] size_t tilted_step, - int width, int height, int cn) { +int integral(int depth, int sdepth, int sqdepth, + const uchar* src_data, size_t src_step, + uchar* sum_data, size_t sum_step, + uchar* sqsum_data, size_t sqsum_step, + uchar* tilted_data, [[maybe_unused]] size_t tilted_step, + int width, int height, int cn) { // tilted sum and cn == 3 cases are not supported if (tilted_data || cn == 3) { return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -168,6 +167,6 @@ inline int integral(int depth, int sdepth, int sqdepth, return result; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/median_blur.cpp b/hal/riscv-rvv/src/imgproc/median_blur.cpp new file mode 100644 index 0000000000..d86b2d92e3 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/median_blur.cpp @@ -0,0 +1,575 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is copied from imgproc/src/median_blur.simd.cpp +// in the function template static void medianBlur_SortNet +template +static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + using T = typename helper::ElemType; + using VT = typename helper::VecType; + + for (int i = start; i < end; i++) + { + const T* row0 = reinterpret_cast(src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step); + const T* row1 = reinterpret_cast(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step); + const T* row2 = reinterpret_cast(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step); + const T* row3 = reinterpret_cast(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step); + const T* row4 = reinterpret_cast(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step); + int vl; + auto vop = [&vl](VT& a, VT& b) { + auto t = a; + a = helper::vmin(a, b, vl); + b = helper::vmax(t, b, vl); + }; + + for (int j = 0; j < width; j += vl) + { + vl = helper::setvl(width - j); + if (ksize == 3) + { + VT p0, p1, p2; + VT p3, p4, p5; + VT p6, p7, p8; + if (j != 0) + { + p0 = helper::vload(row0 + j - 1, vl); + p3 = helper::vload(row1 + j - 1, vl); + p6 = helper::vload(row2 + j - 1, vl); + } + else + { + p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); + p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); + p6 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); + } + p1 = 
helper::vslide1down(p0, row0[j + vl - 1], vl); + p4 = helper::vslide1down(p3, row1[j + vl - 1], vl); + p7 = helper::vslide1down(p6, row2[j + vl - 1], vl); + p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl); + p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl); + p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl); + + vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); + vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); + vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); + vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); + vop(p4, p2); vop(p6, p4); vop(p4, p2); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p4, vl); + } + else + { + VT p0, p1, p2, p3, p4; + VT p5, p6, p7, p8, p9; + VT p10, p11, p12, p13, p14; + VT p15, p16, p17, p18, p19; + VT p20, p21, p22, p23, p24; + if (j >= 2) + { + p0 = helper::vload(row0 + j - 2, vl); + p5 = helper::vload(row1 + j - 2, vl); + p10 = helper::vload(row2 + j - 2, vl); + p15 = helper::vload(row3 + j - 2, vl); + p20 = helper::vload(row4 + j - 2, vl); + } + else + { + p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); + p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); + p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); + p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl); + p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl); + if (j == 0) + { + p0 = helper::vslide1up(p0, row0[0], vl); + p5 = helper::vslide1up(p5, row1[0], vl); + p10 = helper::vslide1up(p10, row2[0], vl); + p15 = helper::vslide1up(p15, row3[0], vl); + p20 = helper::vslide1up(p20, row4[0], vl); + } + } + p1 = helper::vslide1down(p0, row0[j + vl - 2], vl); + p6 = helper::vslide1down(p5, row1[j + vl - 2], vl); + p11 = helper::vslide1down(p10, row2[j + vl - 2], vl); + p16 = helper::vslide1down(p15, row3[j + vl - 2], vl); + p21 = helper::vslide1down(p20, row4[j + vl - 2], vl); + p2 = helper::vslide1down(p1, row0[j + vl - 1], vl); + p7 = 
helper::vslide1down(p6, row1[j + vl - 1], vl); + p12 = helper::vslide1down(p11, row2[j + vl - 1], vl); + p17 = helper::vslide1down(p16, row3[j + vl - 1], vl); + p22 = helper::vslide1down(p21, row4[j + vl - 1], vl); + p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl); + p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl); + p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl); + p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl); + p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl); + p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl); + p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl); + p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl); + p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl); + p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl); + + vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); + vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); + vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); + vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); + vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); + vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); + vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); + vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); + vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); + vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); + vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); + vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); + vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); + vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18); + vop(p16, p22); vop(p16, p18); 
vop(p14, p20); vop(p20, p24); vop(p14, p16); + vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); + vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); + vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); + vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); + vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); + vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); + vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); + vop(p7, p11); vop(p11, p13); vop(p11, p12); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p12, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + for (int i = start; i < end; i++) + { + const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step; + const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step; + const uchar* row2 = src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step; + const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step; + const uchar* row4 = src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step; + int vl; + for (int j = 0; j < width; j += vl) + { + if (ksize == 3) + { + vl = __riscv_vsetvl_e8m1(width - j); + vuint8m1_t p00, p01, p02; + vuint8m1_t p03, p04, p05; + vuint8m1_t p06, p07, p08; + vuint8m1_t p10, p11, p12; + vuint8m1_t p13, p14, p15; + vuint8m1_t p16, p17, p18; + vuint8m1_t p20, p21, p22; + vuint8m1_t p23, p24, p25; + vuint8m1_t p26, p27, p28; + vuint8m1_t p30, p31, p32; + vuint8m1_t p33, p34, p35; + vuint8m1_t p36, p37, p38; + auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) { + auto src = 
__riscv_vlseg4e8_v_u8m1x4(row, vl); + p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0); + p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1); + p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2); + p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3); + }; + if (j != 0) + { + loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30); + loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33); + loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36); + } + else + { + loadsrc(row0, p00, p10, p20, p30); + loadsrc(row1, p03, p13, p23, p33); + loadsrc(row2, p06, p16, p26, p36); + p00 = __riscv_vslide1up(p00, row0[0], vl); + p10 = __riscv_vslide1up(p10, row0[1], vl); + p20 = __riscv_vslide1up(p20, row0[2], vl); + p30 = __riscv_vslide1up(p30, row0[3], vl); + p03 = __riscv_vslide1up(p03, row1[0], vl); + p13 = __riscv_vslide1up(p13, row1[1], vl); + p23 = __riscv_vslide1up(p23, row1[2], vl); + p33 = __riscv_vslide1up(p33, row1[3], vl); + p06 = __riscv_vslide1up(p06, row2[0], vl); + p16 = __riscv_vslide1up(p16, row2[1], vl); + p26 = __riscv_vslide1up(p26, row2[2], vl); + p36 = __riscv_vslide1up(p36, row2[3], vl); + } + p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl); + p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl); + p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl); + p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl); + p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl); + p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl); + p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl); + p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl); + p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl); + p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl); + p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl); + p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl); + p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl); + p12 = __riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl); + 
p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl); + p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl); + p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl); + p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl); + p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl); + p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl); + p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl); + p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl); + p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], vl); + p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl); + + auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) { + auto t = a; + a = __riscv_vminu(a, b, vl); + b = __riscv_vmaxu(t, b, vl); + }; + vuint8m1x4_t dst{}; + vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01); + vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05); + vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07); + vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07); + vop(p04, p02); vop(p06, p04); vop(p04, p02); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04); + vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11); + vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15); + vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17); + vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17); + vop(p14, p12); vop(p16, p14); vop(p14, p12); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14); + vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21); + vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25); + vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27); + vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27); + vop(p24, p22); vop(p26, p24); vop(p24, p22); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24); + 
vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31); + vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35); + vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37); + vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37); + vop(p34, p32); vop(p36, p34); vop(p34, p32); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + vl = __riscv_vsetvl_e8m2(width - j); + vuint8m2_t p00, p01, p02, p03, p04; + vuint8m2_t p05, p06, p07, p08, p09; + vuint8m2_t p010, p011, p012, p013, p014; + vuint8m2_t p015, p016, p017, p018, p019; + vuint8m2_t p020, p021, p022, p023, p024; + vuint8m2_t p10, p11, p12, p13, p14; + vuint8m2_t p15, p16, p17, p18, p19; + vuint8m2_t p110, p111, p112, p113, p114; + vuint8m2_t p115, p116, p117, p118, p119; + vuint8m2_t p120, p121, p122, p123, p124; + vuint8m2_t p20, p21, p22, p23, p24; + vuint8m2_t p25, p26, p27, p28, p29; + vuint8m2_t p210, p211, p212, p213, p214; + vuint8m2_t p215, p216, p217, p218, p219; + vuint8m2_t p220, p221, p222, p223, p224; + vuint8m2_t p30, p31, p32, p33, p34; + vuint8m2_t p35, p36, p37, p38, p39; + vuint8m2_t p310, p311, p312, p313, p314; + vuint8m2_t p315, p316, p317, p318, p319; + vuint8m2_t p320, p321, p322, p323, p324; + auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) { + auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl); + p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0); + p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1); + p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2); + p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3); + }; + if (j >= 2) + { + loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30); + loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35); + loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310); + loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315); + loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320); + } + else + { + loadsrc(row0, p00, p10, p20, p30); + loadsrc(row1, p05, p15, p25, p35); + 
loadsrc(row2, p010, p110, p210, p310); + loadsrc(row3, p015, p115, p215, p315); + loadsrc(row4, p020, p120, p220, p320); + auto slideup = [&] { + p00 = __riscv_vslide1up(p00, row0[0], vl); + p10 = __riscv_vslide1up(p10, row0[1], vl); + p20 = __riscv_vslide1up(p20, row0[2], vl); + p30 = __riscv_vslide1up(p30, row0[3], vl); + p05 = __riscv_vslide1up(p05, row1[0], vl); + p15 = __riscv_vslide1up(p15, row1[1], vl); + p25 = __riscv_vslide1up(p25, row1[2], vl); + p35 = __riscv_vslide1up(p35, row1[3], vl); + p010 = __riscv_vslide1up(p010, row2[0], vl); + p110 = __riscv_vslide1up(p110, row2[1], vl); + p210 = __riscv_vslide1up(p210, row2[2], vl); + p310 = __riscv_vslide1up(p310, row2[3], vl); + p015 = __riscv_vslide1up(p015, row3[0], vl); + p115 = __riscv_vslide1up(p115, row3[1], vl); + p215 = __riscv_vslide1up(p215, row3[2], vl); + p315 = __riscv_vslide1up(p315, row3[3], vl); + p020 = __riscv_vslide1up(p020, row4[0], vl); + p120 = __riscv_vslide1up(p120, row4[1], vl); + p220 = __riscv_vslide1up(p220, row4[2], vl); + p320 = __riscv_vslide1up(p320, row4[3], vl); + }; + slideup(); + if (j == 0) + { + slideup(); + } + } + p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl); + p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl); + p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl); + p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl); + p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl); + p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl); + p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl); + p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl); + p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl); + p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl); + p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl); + p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl); + p016 = __riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl); + p116 = 
__riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl); + p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl); + p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl); + p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl); + p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl); + p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl); + p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl); + p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl); + p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl); + p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl); + p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl); + p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl); + p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl); + p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl); + p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl); + p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl); + p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl); + p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl); + p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl); + p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl); + p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl); + p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl); + p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl); + p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl); + p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl); + p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl); + p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl); + p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, j + vl) * 4 ], vl); + p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + vl) * 4 + 1], vl); + p23 = __riscv_vslide1down(p22, 
row0[std::min(width - 1, j + vl) * 4 + 2], vl); + p33 = __riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl); + p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl); + p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl); + p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl); + p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl); + p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl); + p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl); + p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl); + p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl); + p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl); + p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl); + p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl); + p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl); + p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl); + p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl); + p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl); + p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl); + p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl); + p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl); + p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p29 = __riscv_vslide1down(p28, row1[std::min(width - 1, 
j + vl + 1) * 4 + 2], vl); + p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl); + p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl); + p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl); + p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + + auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) { + auto t = a; + a = __riscv_vminu(a, b, vl); + b = __riscv_vmaxu(t, b, vl); + }; + vuint8m2x4_t dst{}; + vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04); + vop(p04, p05); vop(p00, p03); vop(p02, p05); vop(p02, p03); vop(p01, p04); + vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08); + vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, p011); + vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06); + vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08); + vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05); + vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08); + vop(p09, p010); vop(p013, p014); vop(p012, p013); vop(p013, p014); 
vop(p016, p017); + vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015); + vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019); + vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024); + vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022); + vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018); + vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016); + vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019); + vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016); + vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012); + vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016); + vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010); + vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017); + vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019); + vop(p07, p011); vop(p011, p013); vop(p011, p012); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012); + vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14); + vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14); + vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18); + vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111); + vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16); + vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18); + vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15); + vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18); + vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117); + vop(p115, p116); vop(p116, p117); vop(p112, 
p115); vop(p114, p117); vop(p114, p115); + vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119); + vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124); + vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122); + vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118); + vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116); + vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119); + vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116); + vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112); + vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116); + vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110); + vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, p117); + vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119); + vop(p17, p111); vop(p111, p113); vop(p111, p112); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, p112); + vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24); + vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24); + vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28); + vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211); + vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26); + vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28); + vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25); + vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28); + vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217); + vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, p217); vop(p214, p215); + vop(p213, p216); 
vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219); + vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224); + vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222); + vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218); + vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216); + vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219); + vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216); + vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212); + vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216); + vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210); + vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217); + vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219); + vop(p27, p211); vop(p211, p213); vop(p211, p212); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212); + vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34); + vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34); + vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38); + vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311); + vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36); + vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38); + vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35); + vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38); + vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317); + vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315); + vop(p313, p316); vop(p313, p314); vop(p315, p316); vop(p319, p320); vop(p318, p319); 
+ vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324); + vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322); + vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318); + vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316); + vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319); + vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316); + vop(p317, p318); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312); + vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316); + vop(p32, p314); vop(p310, p322); vop(p310, p314); vop(p36, p318); vop(p36, p310); + vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317); + vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319); + vop(p37, p311); vop(p311, p313); vop(p311, p312); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize) +{ + const int type = CV_MAKETYPE(depth, cn); + if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((ksize != 3 && ksize != 5) || src_data == dst_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (ksize*100 + type) + { + case 300 + CV_8UC1: + return common::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_16UC1: + return common::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_16SC1: + return common::invoke(height, {medianBlurC1<3, RVV_I16M4>}, 
src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_32FC1: + return common::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_8UC1: + return common::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_16UC1: + return common::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_16SC1: + return common::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_32FC1: + return common::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height); + + case 300 + CV_8UC4: + return common::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_8UC4: + return common::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/moments.hpp b/hal/riscv-rvv/src/imgproc/moments.cpp similarity index 94% rename from hal/riscv-rvv/hal_rvv_1p0/moments.hpp rename to hal/riscv-rvv/src/imgproc/moments.cpp index f0db8b3a17..c29f1edfd0 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/moments.hpp +++ b/hal/riscv-rvv/src/imgproc/moments.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED -#define OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace imageMoments { -#undef cv_hal_imageMoments -#define cv_hal_imageMoments cv::cv_hal_rvv::imageMoments::imageMoments +namespace { class MomentsInvoker : public ParallelLoopBody { @@ -152,9 +149,11 @@ static inline int imageMoments(int start, int end, const uchar* src_data, size_t return CV_HAL_ERROR_OK; } +} // anonymous + // the algorithm is copied from imgproc/src/moments.cpp, // in the function cv::Moments cv::moments -inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10]) +int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10]) { if (src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1 && src_type != CV_64FC1) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -184,8 +183,7 @@ inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, in return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::imageMoments -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/morph.cpp b/hal/riscv-rvv/src/imgproc/morph.cpp new file mode 100644 index 0000000000..e5d79b598b --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/morph.cpp @@ -0,0 +1,331 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct Morph2D +{ + int operation; + int src_type; + int dst_type; + int kernel_type; + uchar* kernel_data; + size_t kernel_step; + int kernel_width; + int kernel_height; + int anchor_x; + int anchor_y; + int borderType; + const uchar* borderValue; +}; + +template struct rvv; +template<> struct rvv +{ + static inline uchar init() { return std::numeric_limits::max(); } + static inline uchar mop(uchar a, uchar b) { return a < b ? a : b; } + static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vminu(a, b, c); } + static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vminu(a, b, c); } +}; +template<> struct rvv +{ + static inline uchar init() { return std::numeric_limits::min(); } + static inline uchar mop(uchar a, uchar b) { return a > b ? a : b; } + static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vmaxu(a, b, c); } + static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vmaxu(a, b, c); } +}; + +// the algorithm is copied from 3rdparty/carotene/src/morph.cpp, +// in the function template void morph3x3 +template +static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + bool kernel[9]; + for (int i = 0; i < 9; i++) + { + kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0; + } + + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x, int y) { + int pi, pj; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? 
noval : pi; + pj = pj < 0 ? noval : pj; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? noval : pj - offset_x; + } + return std::make_pair(pi, pj); + }; + + auto process = [&](int x, int y) { + if (data->src_type == CV_8UC1) + { + uchar val = rvv::init(); + for (int i = 0; i < 9; i++) + { + if (kernel[i]) + { + auto p = access(x + i / 3, y + i % 3); + if (p.first != noval && p.second != noval) + { + val = rvv::mop(val, src_data[p.first * src_step + p.second]); + } + else + { + val = rvv::mop(val, data->borderValue[0]); + } + } + } + dst_data[x * width + y] = val; + } + else + { + uchar val0, val1, val2, val3; + val0 = val1 = val2 = val3 = rvv::init(); + for (int i = 0; i < 9; i++) + { + if (kernel[i]) + { + auto p = access(x + i / 3, y + i % 3); + if (p.first != noval && p.second != noval) + { + val0 = rvv::mop(val0, src_data[p.first * src_step + p.second * 4 ]); + val1 = rvv::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]); + val2 = rvv::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]); + val3 = rvv::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]); + } + else + { + val0 = rvv::mop(val0, data->borderValue[0]); + val1 = rvv::mop(val1, data->borderValue[1]); + val2 = rvv::mop(val2, data->borderValue[2]); + val3 = rvv::mop(val3, data->borderValue[3]); + } + } + } + dst_data[(x * width + y) * 4 ] = val0; + dst_data[(x * width + y) * 4 + 1] = val1; + dst_data[(x * width + y) * 4 + 2] = val2; + dst_data[(x * width + y) * 4 + 3] = val3; + } + }; + + const int left = data->anchor_x, right = width - (2 - data->anchor_x); + for (int i = start; i < end; i++) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; 
j++) + process(i, j); + + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; + if (data->src_type == CV_8UC1) + { + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m4(right - j); + auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { + if (!row) + { + m0 = rvv::vop(m0, data->borderValue[0], vl); + return; + } + + const uchar* extra = row + j - data->anchor_x; + auto v0 = __riscv_vle8_v_u8m4(extra, vl); + + if (k0) m0 = rvv::vop(m0, v0, vl); + v0 = __riscv_vslide1down(v0, extra[vl], vl); + if (k1) m0 = rvv::vop(m0, v0, vl); + if (!k2) return; + v0 = __riscv_vslide1down(v0, extra[vl + 1], vl); + m0 = rvv::vop(m0, v0, vl); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + __riscv_vse8(dst_data + i * width + j, m0, vl); + } + } + else + { + int vl, vl0, vl1; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m4(right - j); + vl0 = std::min(vl, (int)__riscv_vlenb() * 2); + vl1 = vl - vl0; + auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m1 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m2 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m3 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + + auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { + if (k0) a = rvv::vop(a, b, vl); + b = __riscv_vslide1down(b, r1, vl); + if (k1) a = rvv::vop(a, b, vl); + if (!k2) return a; + b = __riscv_vslide1down(b, r2, vl); + return rvv::vop(a, b, vl); + }; + auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { + if (!row) + { + m0 = 
rvv::vop(m0, data->borderValue[0], vl); + m1 = rvv::vop(m1, data->borderValue[1], vl); + m2 = rvv::vop(m2, data->borderValue[2], vl); + m3 = rvv::vop(m3, data->borderValue[3], vl); + return; + } + + vuint8m4_t v0{}, v1{}, v2{}, v3{}; + const uchar* extra = row + (j - data->anchor_x) * 4; + auto src = __riscv_vlseg4e8_v_u8m2x4(extra, vl0); + v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0)); + v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1)); + v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2)); + v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3)); + src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); + v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0)); + v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1)); + v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2)); + v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3)); + + extra += vl * 4; + m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]); + m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]); + m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]); + m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + vuint8m2x4_t val{}; + val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0)); + __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0); + val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 2, 
__riscv_vget_v_u8m4_u8m2(m2, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1)); + __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/) +{ + if (kernel_type != CV_8UC1 || src_type != dst_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_type != CV_8UC1 && src_type != CV_8UC4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != kernel_height || kernel_width != 3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (iterations != 1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + uchar* borderV; + if (src_type == CV_8UC1) + { + borderV = new uchar{static_cast(borderValue[0])}; + if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) + borderV[0] = 0; + } + else + { + borderV = new uchar[4]{static_cast(borderValue[0]), static_cast(borderValue[1]), static_cast(borderValue[2]), static_cast(borderValue[3])}; + if (operation == CV_HAL_MORPH_DILATE) + { + if (borderValue[0] == DBL_MAX) + borderV[0] = 0; + if (borderValue[1] == DBL_MAX) + borderV[1] = 0; + if (borderValue[2] == DBL_MAX) + borderV[2] = 0; + if (borderValue[3] == DBL_MAX) + borderV[3] = 0; + } + } + + anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; + *context = reinterpret_cast(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV}); + return CV_HAL_ERROR_OK; +} + +int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/) +{ + Morph2D* data = reinterpret_cast(context); + int cn = data->src_type == CV_8UC1 ? 1 : 4; + std::vector dst(width * height * cn); + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->operation) + { + case CV_HAL_MORPH_ERODE: + res = common::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); + break; + case CV_HAL_MORPH_DILATE: + res = common::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); + break; + } + + for (int i = 0; i < height; i++) + memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn); + return res; +} + +int morphFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context)->borderValue; + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp b/hal/riscv-rvv/src/imgproc/pyramids.cpp similarity index 97% rename from hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp rename to hal/riscv-rvv/src/imgproc/pyramids.cpp index a349d341c5..66bf4c1b4d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp +++ b/hal/riscv-rvv/src/imgproc/pyramids.cpp @@ -4,18 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED -#define OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { namespace pyramids { +#if CV_HAL_RVV_1P0_ENABLED -#undef cv_hal_pyrdown -#define cv_hal_pyrdown cv::cv_hal_rvv::pyramids::pyrDown -#undef cv_hal_pyrup -#define cv_hal_pyrup cv::cv_hal_rvv::pyramids::pyrUp +namespace { template struct rvv; @@ -562,7 +557,9 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_ return CV_HAL_ERROR_OK; } -inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) +} // anonymous + +int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { if (border_type == BORDER_CONSTANT || (depth == CV_32F && cn == 1)) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -580,7 +577,7 @@ inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int sr return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) +int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { if (border_type != BORDER_DEFAULT) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -598,6 +595,6 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_ return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/resize.hpp 
b/hal/riscv-rvv/src/imgproc/resize.cpp similarity index 99% rename from hal/riscv-rvv/hal_rvv_1p0/resize.hpp rename to hal/riscv-rvv/src/imgproc/resize.cpp index d18db5f058..1ce5e16bb3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/resize.hpp +++ b/hal/riscv-rvv/src/imgproc/resize.cpp @@ -4,17 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED -#define OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" +#include "common.hpp" #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace imgproc { -namespace resize { -#undef cv_hal_resize -#define cv_hal_resize cv::cv_hal_rvv::resize::resize +#if CV_HAL_RVV_1P0_ENABLED + +namespace { class ResizeInvoker : public ParallelLoopBody { @@ -986,7 +984,9 @@ static inline int resizeArea(int src_type, const uchar *src_data, size_t src_ste return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) +} // anonymous + +int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) { inv_scale_x = 1 / inv_scale_x; inv_scale_y = 1 / inv_scale_y; @@ -999,8 +999,7 @@ inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_ return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::resize -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/sep_filter.cpp b/hal/riscv-rvv/src/imgproc/sep_filter.cpp new file mode 100644 index 0000000000..54267683e5 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/sep_filter.cpp @@ -0,0 +1,259 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct sepFilter2D +{ + int src_type; + int dst_type; + int kernel_type; + const uchar* kernelx_data; + int kernelx_length; + const uchar* kernely_data; + int kernely_length; + int anchor_x; + int anchor_y; + double delta; + int borderType; +}; + +// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp, +// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic +template +static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + } + return pi; + }; + auto accessY = [&](int y) { + int pj; + if (data->borderType & BORDER_ISOLATED) + { + pj = common::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); + pj = pj < 0 ? noval : pj; + } + else + { + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); + pj = pj < 0 ? 
noval : pj - offset_x; + } + return pj; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + const float* kx = reinterpret_cast(data->kernelx_data); + const float* ky = reinterpret_cast(data->kernely_data); + std::vector res(width * ksize); + auto process = [&](int x, int y) { + float sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += kx[i] * reinterpret_cast(src_data + x * src_step)[p]; + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); + for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m2(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - data->anchor_x; + vfloat32m8_t src; + if (std::is_same::value) + { + src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast(extra), vl), vl), vl); + } + else if (std::is_same::value) + { + src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast(extra), vl), vl); + } + else + { + src = __riscv_vle32_v_f32m8(reinterpret_cast(extra), vl); + } + + extra += vl; + auto sum = __riscv_vfmul(src, kx[0], vl); + src = __riscv_vfslide1down(src, extra[0], vl); + sum = __riscv_vfmacc(sum, kx[1], src, vl); + src = __riscv_vfslide1down(src, extra[1], vl); + sum = __riscv_vfmacc(sum, kx[2], src, vl); + if (ksize == 5) + { + src = __riscv_vfslide1down(src, extra[2], vl); + sum = __riscv_vfmacc(sum, kx[3], src, vl); + src = __riscv_vfslide1down(src, extra[3], vl); + sum = __riscv_vfmacc(sum, kx[4], src, vl); + } + __riscv_vse32(res.data() + p2idx(i, j), sum, vl); + } + } + 
} + + int cur = i - (ksize - 1 - data->anchor_y); + if (cur >= start) + { + const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const float* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); + + if (ksize == 5) + { + auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v4 = row4 ? 
__riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); + } + + if (data->dst_type == CV_16SC1) + { + __riscv_vse16(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); + } + else + { + __riscv_vse32(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) +{ + if (kernel_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_type != CV_8UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (dst_type != CV_16SC1 && dst_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((kernelx_length != 3 && kernelx_length != 5) || kernelx_length != kernely_length) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernely_length / 2 : anchor_y; + *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); + return CV_HAL_ERROR_OK; +} + +int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + sepFilter2D* data = reinterpret_cast(context); + + uchar* _dst_data = dst_data; + size_t _dst_step = dst_step; + const size_t size = CV_ELEM_SIZE(data->dst_type); + std::vector dst; + if (src_data == _dst_data) + { + dst = std::vector(width * height * size); + dst_data = dst.data(); + dst_step = width * size; + } + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernelx_length*100 + data->src_type) + { + case 300 + CV_8UC1: + res = common::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_8UC1: + res = common::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 300 + CV_16SC1: + res = common::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_16SC1: + res = common::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 300 + CV_32FC1: + res = common::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_32FC1: + res = common::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, 
offset_x, offset_y); + break; + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (src_data == _dst_data) + { + for (int i = 0; i < height; i++) + memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); + } + + return res; +} + +int sepFilterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/thresh.hpp b/hal/riscv-rvv/src/imgproc/threshold.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/thresh.hpp rename to hal/riscv-rvv/src/imgproc/threshold.cpp index 738e3d5012..8d76b5626d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/thresh.hpp +++ b/hal/riscv-rvv/src/imgproc/threshold.cpp @@ -4,18 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_THRESH_HPP_INCLUDED -#define OPENCV_HAL_RVV_THRESH_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" +#include "common.hpp" #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace imgproc { -namespace threshold { -// disabled since UI is fast enough, only called in threshold_otsu -// #undef cv_hal_threshold -// #define cv_hal_threshold cv::cv_hal_rvv::threshold::threshold +#if CV_HAL_RVV_1P0_ENABLED + +namespace { class ThresholdInvoker : public ParallelLoopBody { @@ -182,16 +179,6 @@ static inline int threshold_range(int start, int end, const uchar* src_data, siz return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) -{ - return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType); -} -} // cv::cv_hal_rvv::threshold - -namespace threshold_otsu { -#undef cv_hal_threshold_otsu -#define 
cv_hal_threshold_otsu cv::cv_hal_rvv::threshold_otsu::threshold_otsu - static inline int otsu(int start, int end, const uchar* src_data, size_t src_step, int width, std::atomic* cnt, int N, int* h) { const int c = cnt->fetch_add(1) % cv::getNumThreads(); @@ -205,69 +192,6 @@ static inline int otsu(int start, int end, const uchar* src_data, size_t src_ste return CV_HAL_ERROR_OK; } -// the algorithm is copied from imgproc/src/thresh.cpp, -// in the function template static double getThreshVal_Otsu -inline int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) -{ - if (depth != CV_8UC1 || width * height < (1 << 15)) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - const int N = std::numeric_limits::max() + 1; - const int nums = cv::getNumThreads(); - std::vector _h(N * nums, 0); - int* h = _h.data(); - - std::atomic cnt(0); - cv::parallel_for_(Range(0, height), threshold::ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums); - for (int i = N; i < nums * N; i++) - { - h[i % N] += h[i]; - } - - double mu = 0, scale = 1. / (width*height); - for (int i = 0; i < N; i++) - { - mu += i*(double)h[i]; - } - - mu *= scale; - double mu1 = 0, q1 = 0; - double max_sigma = 0, max_val = 0; - - for (int i = 0; i < N; i++) - { - double p_i, q2, mu2, sigma; - - p_i = h[i]*scale; - mu1 *= q1; - q1 += p_i; - q2 = 1. - q1; - - if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. 
- FLT_EPSILON) - continue; - - mu1 = (mu1 + i*p_i)/q1; - mu2 = (mu - q1*mu1)/q2; - sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2); - if (sigma > max_sigma) - { - max_sigma = sigma; - max_val = i; - } - } - - *thresh = max_val; - if (dst_data == nullptr) - return CV_HAL_ERROR_OK; - - return threshold::invoke(width, height, {threshold::threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType); -} -} // cv::cv_hal_rvv::threshold_otsu - -namespace adaptiveThreshold { -#undef cv_hal_adaptiveThreshold -#define cv_hal_adaptiveThreshold cv::cv_hal_rvv::adaptiveThreshold::adaptiveThreshold - // the algorithm is copied from imgproc/src/thresh.cpp, // in the function void cv::adaptiveThreshold template @@ -444,7 +368,72 @@ static inline int adaptiveThreshold(int start, int end, const uchar* src_data, s return CV_HAL_ERROR_OK; } -inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C) +} // anonymous + +int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) +{ + return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType); +} + +// the algorithm is copied from imgproc/src/thresh.cpp, +// in the function template static double getThreshVal_Otsu +int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) +{ + if (depth != CV_8UC1 || width * height < (1 << 15)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + const int N = std::numeric_limits::max() + 1; + const int nums = cv::getNumThreads(); + std::vector _h(N * nums, 0); + int* h = _h.data(); + + std::atomic cnt(0); + 
cv::parallel_for_(Range(0, height), ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums); + for (int i = N; i < nums * N; i++) + { + h[i % N] += h[i]; + } + + double mu = 0, scale = 1. / (width*height); + for (int i = 0; i < N; i++) + { + mu += i*(double)h[i]; + } + + mu *= scale; + double mu1 = 0, q1 = 0; + double max_sigma = 0, max_val = 0; + + for (int i = 0; i < N; i++) + { + double p_i, q2, mu2, sigma; + + p_i = h[i]*scale; + mu1 *= q1; + q1 += p_i; + q2 = 1. - q1; + + if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON) + continue; + + mu1 = (mu1 + i*p_i)/q1; + mu2 = (mu - q1*mu1)/q2; + sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2); + if (sigma > max_sigma) + { + max_sigma = sigma; + max_val = i; + } + } + + *thresh = max_val; + if (dst_data == nullptr) + return CV_HAL_ERROR_OK; + + return invoke(width, height, {threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType); +} + +int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C) { if (thresholdType != CV_HAL_THRESH_BINARY && thresholdType != CV_HAL_THRESH_BINARY_INV) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -456,27 +445,26 @@ inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_ switch (blockSize*100 + adaptiveMethod*10 + thresholdType) { case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return 
threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, 
{adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::adaptiveThreshold -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} /// cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/warp.hpp b/hal/riscv-rvv/src/imgproc/warp.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/warp.hpp rename to hal/riscv-rvv/src/imgproc/warp.cpp index f207c7cb95..745f27c9ca 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/warp.hpp +++ b/hal/riscv-rvv/src/imgproc/warp.cpp @@ -4,20 +4,14 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED -#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace remap { -#undef cv_hal_remap32f -#define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f -#undef cv_hal_remap32fc2 -#define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2 -#undef cv_hal_remap16s -#define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s +namespace { class RemapInvoker : public ParallelLoopBody { @@ -862,30 +856,6 @@ inline int remap32f(int src_type, const uchar *src_data, size_t src_step, int sr return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, - uchar *dst_data, size_t dst_step, int dst_width, int dst_height, - float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]) -{ - return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value); -} - -inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, - uchar *dst_data, size_t dst_step, int dst_width, int dst_height, - short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, - int interpolation, int border_type, const double border_value[4]) -{ - if (CV_MAKETYPE(src_type, 1) != src_type) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, interpolation, border_type, border_value); -} -} // cv::cv_hal_rvv::remap - -namespace warp { -#undef cv_hal_warpAffine -#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine -#undef cv_hal_warpPerspective -#define 
cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective - template static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) { @@ -1154,9 +1124,36 @@ static inline int warpC4(int start, int end, const uchar *src_data, size_t src_s return CV_HAL_ERROR_OK; } +} // anonymous + +int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value); +} + +int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]) +{ + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value); +} + +int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + if (CV_MAKETYPE(src_type, 1) != src_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, 
interpolation, border_type, border_value); +} + // the algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp, // in the function void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear -inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) +int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1168,11 +1165,11 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int switch (src_type) { case CV_8UC1: - return remap::invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC3: - return remap::invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC4: - return remap::invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, 
borderType, borderValue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1180,7 +1177,7 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int // the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp, // in the function void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear -inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) +int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1192,17 +1189,16 @@ inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, switch (src_type) { case CV_8UC1: - return remap::invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC3: - return remap::invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC4: - return remap::invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, 
borderValue); + return invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::warp -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/version/hal_rvv_071.hpp b/hal/riscv-rvv/version/hal_rvv_071.hpp deleted file mode 100644 index db235d6139..0000000000 --- a/hal/riscv-rvv/version/hal_rvv_071.hpp +++ /dev/null @@ -1,109 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#ifndef OPENCV_HAL_RVV_071_HPP_INCLUDED -#define OPENCV_HAL_RVV_071_HPP_INCLUDED - -#include - -#include - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_cvtBGRtoBGR -#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::cvtBGRtoBGR - -static const unsigned char index_array_32 [32] - { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 }; - -static const unsigned char index_array_24 [24] - { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 }; - -static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize) -{ - vuint8m2_t vec_index = vle8_v_u8m2(index, vsize); - - int i = 0; - - for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize) - { - vuint8m2_t vec_src = vle8_v_u8m2(src, vsize); - vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize); - vse8_v_u8m2(dst, vec_dst, vsize); - } - - for ( ; i < n; i++, src += scn, dst += dcn ) - { - unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; - dst[2] = t0; - dst[1] = t1; - dst[0] = t2; - if(dcn == 4) - { - unsigned char d = src[3]; - dst[3] = d; - } - } -} - -static void 
sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi) -{ - for (int i = 0; i < n; i++, src += scn, dst += dcn) - { - unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; - dst[bi ] = t0; - dst[1] = t1; - dst[bi^2] = t2; - if(dcn == 4) - { - unsigned char d = scn == 4 ? src[3] : std::numeric_limits::max(); - dst[3] = d; - } - } -} - -static int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) -{ - if (depth != CV_8U) - { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } - - const int blueIdx = swapBlue ? 2 : 0; - if (scn == dcn) - { - if (!swapBlue) - { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } - - const int vsize_pixels = 8; - - if (scn == 4) - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - { - vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32); - } - } - else - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - { - vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24); - } - } - } - else - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx); - } - - return CV_HAL_ERROR_OK; -} - -}} - -#endif