diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a7ad2799..150a018f8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -996,9 +996,9 @@ if(WITH_NDSRVP) endif() if(WITH_HAL_RVV) - ocv_debug_message(STATUS "Enable HAL RVV acceleration") - if(NOT ";${OpenCV_HAL};" MATCHES ";halrvv;") - set(OpenCV_HAL "halrvv;${OpenCV_HAL}") + ocv_debug_message(STATUS "Enable RVV HAL acceleration") + if(NOT ";${OpenCV_HAL};" MATCHES ";rvvhal;") + set(OpenCV_HAL "rvvhal;${OpenCV_HAL}") endif() endif() @@ -1031,13 +1031,13 @@ foreach(hal ${OpenCV_HAL}) else() message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...") endif() - elseif(hal STREQUAL "halrvv") + elseif(hal STREQUAL "rvvhal") if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;") add_subdirectory(hal/riscv-rvv) ocv_hal_register(RVV_HAL_LIBRARIES RVV_HAL_HEADERS RVV_HAL_INCLUDE_DIRS) - list(APPEND OpenCV_USED_HAL "HAL RVV (ver ${RVV_HAL_VERSION})") + list(APPEND OpenCV_USED_HAL "RVV HAL (ver ${RVV_HAL_VERSION})") else() - message(STATUS "HAL RVV: RVV is not available, disabling halrvv...") + message(STATUS "RVV HAL: RVV is not available, disabling RVV HAL...") endif() elseif(hal STREQUAL "ipp") add_subdirectory(hal/ipp) diff --git a/hal/riscv-rvv/CMakeLists.txt b/hal/riscv-rvv/CMakeLists.txt index 8c19800053..a0c9e628b3 100644 --- a/hal/riscv-rvv/CMakeLists.txt +++ b/hal/riscv-rvv/CMakeLists.txt @@ -1,9 +1,26 @@ cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR) -set(HAL_LIB_NAME "") +set(RVV_HAL_INCLUDE_DIR include) +set(RVV_HAL_SOURCE_DIR src) + +file(GLOB rvv_hal_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_INCLUDE_DIR}/*.hpp") +file(GLOB rvv_hal_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_SOURCE_DIR}/**/*.cpp") + +set(HAL_LIB_NAME "rvv_hal") +add_library(${HAL_LIB_NAME} STATIC) +target_sources(${HAL_LIB_NAME} PRIVATE ${rvv_hal_headers} ${rvv_hal_sources}) + +set_target_properties(${HAL_LIB_NAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY 
${3P_LIBRARY_OUTPUT_PATH}) +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(${HAL_LIB_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() +target_include_directories(${HAL_LIB_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/modules/core/include + ${CMAKE_SOURCE_DIR}/modules/imgproc/include) # ${CMAKE_SOURCE_DIR}/modules/features2d/include set(RVV_HAL_FOUND TRUE CACHE INTERNAL "") set(RVV_HAL_VERSION "0.0.1" CACHE INTERNAL "") set(RVV_HAL_LIBRARIES ${HAL_LIB_NAME} CACHE INTERNAL "") -set(RVV_HAL_HEADERS "hal_rvv.hpp" CACHE INTERNAL "") -set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_SOURCE_DIR}/modules/imgproc/include" CACHE INTERNAL "") +set(RVV_HAL_HEADERS "rvv_hal.hpp" CACHE INTERNAL "") +set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "") diff --git a/hal/riscv-rvv/hal_rvv.hpp b/hal/riscv-rvv/hal_rvv.hpp deleted file mode 100644 index 8fe78bd8b9..0000000000 --- a/hal/riscv-rvv/hal_rvv.hpp +++ /dev/null @@ -1,65 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. 
- -#ifndef OPENCV_HAL_RVV_HPP_INCLUDED -#define OPENCV_HAL_RVV_HPP_INCLUDED - -#include "opencv2/core/base.hpp" -#include "opencv2/core/hal/interface.h" -#include "opencv2/imgproc/hal/interface.h" - -#ifndef CV_HAL_RVV_071_ENABLED -# if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) && defined(__riscv_v) && __riscv_v == 7000 -# define CV_HAL_RVV_071_ENABLED 1 -# else -# define CV_HAL_RVV_071_ENABLED 0 -# endif -#endif - -#if CV_HAL_RVV_071_ENABLED -#include "version/hal_rvv_071.hpp" -#endif - -#if defined(__riscv_v) && __riscv_v == 1000000 -#include "hal_rvv_1p0/types.hpp" -#include "hal_rvv_1p0/merge.hpp" // core -#include "hal_rvv_1p0/mean.hpp" // core -#include "hal_rvv_1p0/dxt.hpp" // core -#include "hal_rvv_1p0/norm.hpp" // core -#include "hal_rvv_1p0/norm_diff.hpp" // core -#include "hal_rvv_1p0/norm_hamming.hpp" // core -#include "hal_rvv_1p0/convert_scale.hpp" // core -#include "hal_rvv_1p0/minmax.hpp" // core -#include "hal_rvv_1p0/atan.hpp" // core -#include "hal_rvv_1p0/split.hpp" // core -#include "hal_rvv_1p0/magnitude.hpp" // core -#include "hal_rvv_1p0/cart_to_polar.hpp" // core -#include "hal_rvv_1p0/polar_to_cart.hpp" // core -#include "hal_rvv_1p0/flip.hpp" // core -#include "hal_rvv_1p0/lut.hpp" // core -#include "hal_rvv_1p0/exp.hpp" // core -#include "hal_rvv_1p0/log.hpp" // core -#include "hal_rvv_1p0/lu.hpp" // core -#include "hal_rvv_1p0/cholesky.hpp" // core -#include "hal_rvv_1p0/qr.hpp" // core -#include "hal_rvv_1p0/svd.hpp" // core -#include "hal_rvv_1p0/sqrt.hpp" // core -#include "hal_rvv_1p0/copy_mask.hpp" // core -#include "hal_rvv_1p0/div.hpp" // core -#include "hal_rvv_1p0/dotprod.hpp" // core -#include "hal_rvv_1p0/compare.hpp" // core -#include "hal_rvv_1p0/transpose.hpp" // core - -#include "hal_rvv_1p0/moments.hpp" // imgproc -#include "hal_rvv_1p0/filter.hpp" // imgproc -#include "hal_rvv_1p0/pyramids.hpp" // imgproc -#include "hal_rvv_1p0/color.hpp" // imgproc -#include 
"hal_rvv_1p0/warp.hpp" // imgproc -#include "hal_rvv_1p0/thresh.hpp" // imgproc -#include "hal_rvv_1p0/histogram.hpp" // imgproc -#include "hal_rvv_1p0/resize.hpp" // imgproc -#include "hal_rvv_1p0/integral.hpp" // imgproc -#endif - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/atan.hpp b/hal/riscv-rvv/hal_rvv_1p0/atan.hpp deleted file mode 100644 index b864fea2c1..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/atan.hpp +++ /dev/null @@ -1,128 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_ATAN_HPP_INCLUDED -#define OPENCV_HAL_RVV_ATAN_HPP_INCLUDED - -#undef cv_hal_fastAtan32f -#define cv_hal_fastAtan32f cv::cv_hal_rvv::fast_atan_32 - -#undef cv_hal_fastAtan64f -#define cv_hal_fastAtan64f cv::cv_hal_rvv::fast_atan_64 - -#include - -#include - -namespace cv { namespace cv_hal_rvv { - -namespace detail { -// ref: mathfuncs_core.simd.hpp -static constexpr float pi = CV_PI; - -struct AtanParams -{ - float p1, p3, p5, p7, angle_90; -}; - -static constexpr AtanParams atan_params_rad { - 0.9997878412794807F, - -0.3258083974640975F, - 0.1555786518463281F, - -0.04432655554792128F, - 90.F * (pi / 180.F)}; -static constexpr AtanParams atan_params_deg { - atan_params_rad.p1 * (180 / pi), - atan_params_rad.p3 * (180 / pi), - atan_params_rad.p5 * (180 / pi), - atan_params_rad.p7 * (180 / pi), - 90.F}; - -template -__attribute__((always_inline)) inline VEC_T - rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params) -{ - const auto ax = __riscv_vfabs(vx, vl); - const auto ay = __riscv_vfabs(vy, vl); - // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. 
- const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), - __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), - vl); - const auto c2 = __riscv_vfmul(c, c, vl); - - // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional - // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased - // from 5.952ms to 5.805ms on Muse Pi) - // Additionally, when registers are nearly fully utilized (though not yet exhausted), the - // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in - // cv::cv_hal_rvv::fast_atan_64). - // Saving registers can also make this function more reusable in other contexts. - // Therefore, vfmadd is not used here. - auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); - a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); - a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); - a = __riscv_vfmul(a, c, vl); - - a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); - - return a; -} - -} // namespace detail - -inline int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) -{ - auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad; - - for (size_t vl = 0; n > 0; n -= vl) - { - vl = __riscv_vsetvl_e32m4(n); - - auto vy = __riscv_vle32_v_f32m4(y, vl); - auto vx = __riscv_vle32_v_f32m4(x, vl); - - auto a = detail::rvv_atan(vy, vx, vl, atan_params); - - __riscv_vse32(dst, a, vl); - - x += vl; - y += vl; - dst += vl; - } - - return CV_HAL_ERROR_OK; -} - -inline int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) -{ - // this also uses float32 version, ref: mathfuncs_core.simd.hpp - - auto atan_params = angle_in_deg ? 
detail::atan_params_deg : detail::atan_params_rad; - - for (size_t vl = 0; n > 0; n -= vl) - { - vl = __riscv_vsetvl_e64m8(n); - - auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); - auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); - - auto a = detail::rvv_atan(vy, vx, vl, atan_params); - - __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); - - x += vl; - y += vl; - dst += vl; - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif //OPENCV_HAL_RVV_ATAN_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/common.hpp b/hal/riscv-rvv/hal_rvv_1p0/common.hpp deleted file mode 100644 index 9fc01d2897..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/common.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -// Third party copyrights are property of their respective owners. 
- -#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED -#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { namespace custom_intrin { - -#define CV_HAL_RVV_NOOP(a) (a) - -#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ - inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ - _Tpvs mask = __riscv_vsra(v, shift, vl); \ - _Tpvs v_xor = __riscv_vxor(v, mask, vl); \ - return __riscv_vreinterpret_##suffix( \ - __riscv_vsub(v_xor, mask, vl) \ - ); \ - } - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) - -#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ - inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ - return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ - } - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) 
-CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) - -}}} // cv::cv_hal_rvv::custom_intrin - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/div.hpp b/hal/riscv-rvv/hal_rvv_1p0/div.hpp deleted file mode 100644 index ccbeb6403d..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/div.hpp +++ /dev/null @@ -1,268 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -// Third party copyrights are property of their respective owners. - -#ifndef OPENCV_HAL_RVV_DIV_HPP_INCLUDED -#define OPENCV_HAL_RVV_DIV_HPP_INCLUDED - -#include -#include - -namespace cv { namespace cv_hal_rvv { namespace div { - -namespace { - - inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); } - - inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); } - inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); } - inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); } - inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); } - inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); } - inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); } - - inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } - inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } - inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } - inline void vse(int16_t 
*p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } - inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } - inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } - - inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } - inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } - inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } - inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } - - inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } - inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } - inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } - inline vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } - - template inline - VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) { - return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl); - } - template <> inline - vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) { - auto f1 = __riscv_vfcvt_f(v1, vl); - auto f2 = __riscv_vfcvt_f(v2, vl); - auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); - return __riscv_vfcvt_x(res, vl); - } - template <> inline - vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) { - auto f1 = __riscv_vfcvt_f(v1, vl); - auto f2 = __riscv_vfcvt_f(v2, vl); - auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); - return __riscv_vfcvt_xu(res, vl); - } - - template inline - VT recip_sat(const VT &v, const float scale, const int vl) { - return nclip(recip_sat(ext(v, vl), scale, vl), 
vl); - } - template <> inline - vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) { - auto f = __riscv_vfcvt_f(v, vl); - auto res = __riscv_vfrdiv(f, scale, vl); - return __riscv_vfcvt_x(res, vl); - } - template <> inline - vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) { - auto f = __riscv_vfcvt_f(v, vl); - auto res = __riscv_vfrdiv(f, scale, vl); - return __riscv_vfcvt_xu(res, vl); - } - -} // anonymous - -#undef cv_hal_div8u -#define cv_hal_div8u cv::cv_hal_rvv::div::div -#undef cv_hal_div8s -#define cv_hal_div8s cv::cv_hal_rvv::div::div -#undef cv_hal_div16u -#define cv_hal_div16u cv::cv_hal_rvv::div::div -#undef cv_hal_div16s -#define cv_hal_div16s cv::cv_hal_rvv::div::div -#undef cv_hal_div32s -#define cv_hal_div32s cv::cv_hal_rvv::div::div -#undef cv_hal_div32f -#define cv_hal_div32f cv::cv_hal_rvv::div::div -// #undef cv_hal_div64f -// #define cv_hal_div64f cv::cv_hal_rvv::div::div - -template inline -int div(const ST *src1, size_t step1, const ST *src2, size_t step2, - ST *dst, size_t step, int width, int height, float scale) { - if (scale == 0.f || - (scale * static_cast(std::numeric_limits::max())) < 1.f && - (scale * static_cast(std::numeric_limits::max())) > -1.f) { - for (int h = 0; h < height; h++) { - ST *dst_h = reinterpret_cast((uchar*)dst + h * step); - std::memset(dst_h, 0, sizeof(ST) * width); - } - return CV_HAL_ERROR_OK; - } - - for (int h = 0; h < height; h++) { - const ST *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const ST *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - ST *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - auto mask = __riscv_vmseq(v2, 0, vl); - vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -template <> inline -int 
div(const float *src1, size_t step1, const float *src2, size_t step2, - float *dst, size_t step, int width, int height, float scale) { - if (scale == 0.f) { - for (int h = 0; h < height; h++) { - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - std::memset(dst_h, 0, sizeof(float) * width); - } - return CV_HAL_ERROR_OK; - } - - if (std::fabs(scale - 1.f) < FLT_EPSILON) { - for (int h = 0; h < height; h++) { - const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl); - } - } - } else { - for (int h = 0; h < height; h++) { - const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -#undef cv_hal_recip8u -#define cv_hal_recip8u cv::cv_hal_rvv::div::recip -#undef cv_hal_recip8s -#define cv_hal_recip8s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip16u -#define cv_hal_recip16u cv::cv_hal_rvv::div::recip -#undef cv_hal_recip16s -#define cv_hal_recip16s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip32s -#define cv_hal_recip32s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip32f -#define cv_hal_recip32f cv::cv_hal_rvv::div::recip -// #undef cv_hal_recip64f -// #define cv_hal_recip64f cv::cv_hal_rvv::div::recip - -template inline -int recip(const ST *src_data, size_t src_step, ST *dst_data, 
size_t dst_step, - int width, int height, float scale) { - if (scale == 0.f || scale < 1.f && scale > -1.f) { - for (int h = 0; h < height; h++) { - ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - std::memset(dst_h, 0, sizeof(ST) * width); - } - return CV_HAL_ERROR_OK; - } - - for (int h = 0; h < height; h++) { - const ST *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - auto mask = __riscv_vmseq(v, 0, vl); - vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -template <> inline -int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, - int width, int height, float scale) { - if (scale == 0.f) { - for (int h = 0; h < height; h++) { - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - std::memset(dst_h, 0, sizeof(float) * width); - } - return CV_HAL_ERROR_OK; - } - - if (std::fabs(scale - 1.f) < FLT_EPSILON) { - for (int h = 0; h < height; h++) { - const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl); - } - } - } else { - for (int h = 0; h < height; h++) { - const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -}}} // cv::cv_hal_rvv::div - -#endif // OPENCV_HAL_RVV_DIV_HPP_INCLUDED diff --git 
a/hal/riscv-rvv/hal_rvv_1p0/filter.hpp b/hal/riscv-rvv/hal_rvv_1p0/filter.hpp deleted file mode 100644 index 85949137e3..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/filter.hpp +++ /dev/null @@ -1,2553 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_FILTER_HPP_INCLUDED -#define OPENCV_HAL_RVV_FILTER_HPP_INCLUDED - -#include - -struct cvhalFilter2D; - -namespace cv { namespace cv_hal_rvv { - -namespace filter { -#undef cv_hal_filterInit -#undef cv_hal_filter -#undef cv_hal_filterFree -#define cv_hal_filterInit cv::cv_hal_rvv::filter::filterInit -#define cv_hal_filter cv::cv_hal_rvv::filter::filter -#define cv_hal_filterFree cv::cv_hal_rvv::filter::filterFree - -class FilterInvoker : public ParallelLoopBody -{ -public: - template - FilterInvoker(std::function _func, Args&&... args) - { - func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); - } - - virtual void operator()(const Range& range) const override - { - func(range.start, range.end); - } - -private: - std::function func; -}; - -template -static inline int invoke(int height, std::function func, Args&&... args) -{ - cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); - return func(0, 1, std::forward(args)...); -} - -static inline int borderInterpolate( int p, int len, int borderType ) -{ - if ((unsigned)p < (unsigned)len) - ; - else if (borderType == BORDER_REPLICATE) - p = p < 0 ? 
0 : len - 1; - else if (borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101) - { - int delta = borderType == BORDER_REFLECT_101; - if (len == 1) - return 0; - do - { - if (p < 0) - p = -p - 1 + delta; - else - p = len - 1 - (p - len) - delta; - } - while( (unsigned)p >= (unsigned)len ); - } - else if (borderType == BORDER_WRAP) - { - if (p < 0) - p -= ((p-len+1)/len)*len; - if (p >= len) - p %= len; - } - else if (borderType == BORDER_CONSTANT) - p = -1; - return p; -} - -struct Filter2D -{ - const uchar* kernel_data; - size_t kernel_step; - int kernel_type; - int kernel_width; - int kernel_height; - int src_type; - int dst_type; - int borderType; - double delta; - int anchor_x; - int anchor_y; -}; - -inline int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/) -{ - if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != kernel_height) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != 3 && kernel_width != 5) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; - *context = reinterpret_cast(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); - return CV_HAL_ERROR_OK; -} - -static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst) -{ - int vl; - for (int i = left; i < right; i += vl) - { - vl = __riscv_vsetvl_e8m1(right - i); - auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); - - auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { - a = __riscv_vfmacc(a, k0, b, vl); - b = __riscv_vfslide1down(b, r1, vl); - a = __riscv_vfmacc(a, k1, b, vl); - b = __riscv_vfslide1down(b, r2, vl); - return __riscv_vfmacc(a, k2, b, vl); - }; - auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { - if (!row) return; - - const uchar* extra = row + (i - anchor) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); - auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); - auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); - auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); - - extra += vl * 4; - s0 = addshift(s0, v0, k0, k1, k2, extra[0], extra[4]); - s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]); - s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]); - s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - vuint8m1x4_t val{}; - val = 
__riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst + i * 4, val, vl); - } -} - -static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst) -{ - int vl; - for (int i = left; i < right; i += vl) - { - vl = __riscv_vsetvl_e8m1(right - i); - auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); - - auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { - a = __riscv_vfmacc(a, k0, b, vl); - b = __riscv_vfslide1down(b, r1, vl); - a = __riscv_vfmacc(a, k1, b, vl); - b = __riscv_vfslide1down(b, r2, vl); - a = __riscv_vfmacc(a, k2, b, vl); - b = __riscv_vfslide1down(b, r3, vl); - a = __riscv_vfmacc(a, k3, b, vl); - b = __riscv_vfslide1down(b, r4, vl); - return __riscv_vfmacc(a, k4, b, vl); - }; - auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { - if (!row) return; - - const uchar* extra = row + (i - anchor) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); - auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); - auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); - auto v3 = 
__riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); - - extra += vl * 4; - s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]); - s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]); - s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]); - s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]); - }; - - loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]); - loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]); - loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]); - loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]); - loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]); - vuint8m1x4_t val{}; - val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst + i * 4, val, vl); - } -} - -// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp, -// in the function void CAROTENE_NS::convolution -template -static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - float kernel[ksize * ksize]; - for (int i = 0; i < ksize * ksize; i++) - { - kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; - } - - constexpr int noval = std::numeric_limits::max(); - auto access = [&](int x, int y) { - int pi, pj; - if 
(data->borderType & BORDER_ISOLATED) - { - pi = borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pj = borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - pj = pj < 0 ? noval : pj; - } - else - { - pi = borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pj = borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - pj = pj < 0 ? noval : pj - offset_x; - } - return std::make_pair(pi, pj); - }; - - auto process = [&](int x, int y) { - float sum0, sum1, sum2, sum3; - sum0 = sum1 = sum2 = sum3 = data->delta; - for (int i = 0; i < ksize * ksize; i++) - { - auto p = access(x + i / ksize, y + i % ksize); - if (p.first != noval && p.second != noval) - { - sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; - sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; - sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2]; - sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3]; - } - } - dst_data[(x * width + y) * 4 ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits::max())); - }; - - const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); - for (int i = start; i < end; i++) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - const uchar* row0 = access(i , 0).first == noval ? 
nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; - if (ksize == 3) - { - process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); - } - else - { - const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; - const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step; - process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - Filter2D* data = reinterpret_cast(context); - std::vector dst(width * height * 4); - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->kernel_width) - { - case 3: - res = invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); - break; - case 5: - res = invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); - break; - } - - for (int i = 0; i < height; i++) - memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4); - return res; -} - -inline int filterFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::filter - -namespace sepFilter { -#undef cv_hal_sepFilterInit -#undef cv_hal_sepFilter -#undef cv_hal_sepFilterFree -#define cv_hal_sepFilterInit cv::cv_hal_rvv::sepFilter::sepFilterInit -#define cv_hal_sepFilter cv::cv_hal_rvv::sepFilter::sepFilter 
-#define cv_hal_sepFilterFree cv::cv_hal_rvv::sepFilter::sepFilterFree - -struct sepFilter2D -{ - int src_type; - int dst_type; - int kernel_type; - const uchar* kernelx_data; - int kernelx_length; - const uchar* kernely_data; - int kernely_length; - int anchor_x; - int anchor_y; - double delta; - int borderType; -}; - -inline int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) -{ - if (kernel_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_type != CV_8UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (dst_type != CV_16SC1 && dst_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((kernelx_length != 3 && kernelx_length != 5) || kernelx_length != kernely_length) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; - anchor_y = anchor_y < 0 ? 
kernely_length / 2 : anchor_y; - *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp, -// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic -template -static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi; - if (data->borderType & BORDER_ISOLATED) - { - pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - } - else - { - pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - } - return pi; - }; - auto accessY = [&](int y) { - int pj; - if (data->borderType & BORDER_ISOLATED) - { - pj = filter::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); - pj = pj < 0 ? noval : pj; - } - else - { - pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); - pj = pj < 0 ? 
noval : pj - offset_x; - } - return pj; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - const float* kx = reinterpret_cast(data->kernelx_data); - const float* ky = reinterpret_cast(data->kernely_data); - std::vector res(width * ksize); - auto process = [&](int x, int y) { - float sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += kx[i] * reinterpret_cast(src_data + x * src_step)[p]; - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); - for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m2(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - data->anchor_x; - vfloat32m8_t src; - if (std::is_same::value) - { - src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast(extra), vl), vl), vl); - } - else if (std::is_same::value) - { - src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast(extra), vl), vl); - } - else - { - src = __riscv_vle32_v_f32m8(reinterpret_cast(extra), vl); - } - - extra += vl; - auto sum = __riscv_vfmul(src, kx[0], vl); - src = __riscv_vfslide1down(src, extra[0], vl); - sum = __riscv_vfmacc(sum, kx[1], src, vl); - src = __riscv_vfslide1down(src, extra[1], vl); - sum = __riscv_vfmacc(sum, kx[2], src, vl); - if (ksize == 5) - { - src = __riscv_vfslide1down(src, extra[2], vl); - sum = __riscv_vfmacc(sum, kx[3], src, vl); - src = __riscv_vfslide1down(src, extra[3], vl); - sum = __riscv_vfmacc(sum, kx[4], src, vl); - } - __riscv_vse32(res.data() + p2idx(i, j), sum, vl); - } - } - 
} - - int cur = i - (ksize - 1 - data->anchor_y); - if (cur >= start) - { - const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const float* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); - - if (ksize == 5) - { - auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v4 = row4 ? 
__riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); - } - - if (data->dst_type == CV_16SC1) - { - __riscv_vse16(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); - } - else - { - __riscv_vse32(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - sepFilter2D* data = reinterpret_cast(context); - - uchar* _dst_data = dst_data; - size_t _dst_step = dst_step; - const size_t size = CV_ELEM_SIZE(data->dst_type); - std::vector dst; - if (src_data == _dst_data) - { - dst = std::vector(width * height * size); - dst_data = dst.data(); - dst_step = width * size; - } - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->kernelx_length*100 + data->src_type) - { - case 300 + CV_8UC1: - res = filter::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_8UC1: - res = filter::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 300 + CV_16SC1: - res = filter::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_16SC1: - res = filter::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 300 + CV_32FC1: - res = filter::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - 
break; - case 500 + CV_32FC1: - res = filter::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - } - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - if (src_data == _dst_data) - { - for (int i = 0; i < height; i++) - memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); - } - - return res; -} - -inline int sepFilterFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::sepFilter - -namespace morph { -#undef cv_hal_morphInit -#undef cv_hal_morph -#undef cv_hal_morphFree -#define cv_hal_morphInit cv::cv_hal_rvv::morph::morphInit -#define cv_hal_morph cv::cv_hal_rvv::morph::morph -#define cv_hal_morphFree cv::cv_hal_rvv::morph::morphFree - -struct Morph2D -{ - int operation; - int src_type; - int dst_type; - int kernel_type; - uchar* kernel_data; - size_t kernel_step; - int kernel_width; - int kernel_height; - int anchor_x; - int anchor_y; - int borderType; - const uchar* borderValue; -}; - -inline int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/) -{ - if (kernel_type != CV_8UC1 || src_type != dst_type) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_type != CV_8UC1 && src_type != CV_8UC4) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != kernel_height || kernel_width != 3) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (iterations != 1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - 
return CV_HAL_ERROR_NOT_IMPLEMENTED; - - uchar* borderV; - if (src_type == CV_8UC1) - { - borderV = new uchar{static_cast(borderValue[0])}; - if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) - borderV[0] = 0; - } - else - { - borderV = new uchar[4]{static_cast(borderValue[0]), static_cast(borderValue[1]), static_cast(borderValue[2]), static_cast(borderValue[3])}; - if (operation == CV_HAL_MORPH_DILATE) - { - if (borderValue[0] == DBL_MAX) - borderV[0] = 0; - if (borderValue[1] == DBL_MAX) - borderV[1] = 0; - if (borderValue[2] == DBL_MAX) - borderV[2] = 0; - if (borderValue[3] == DBL_MAX) - borderV[3] = 0; - } - } - - anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? kernel_height / 2 : anchor_y; - *context = reinterpret_cast(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV}); - return CV_HAL_ERROR_OK; -} - -template struct rvv; -template<> struct rvv -{ - static inline uchar init() { return std::numeric_limits::max(); } - static inline uchar mop(uchar a, uchar b) { return a < b ? a : b; } - static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vminu(a, b, c); } - static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vminu(a, b, c); } -}; -template<> struct rvv -{ - static inline uchar init() { return std::numeric_limits::min(); } - static inline uchar mop(uchar a, uchar b) { return a > b ? 
a : b; } - static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vmaxu(a, b, c); } - static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vmaxu(a, b, c); } -}; - -// the algorithm is copied from 3rdparty/carotene/src/morph.cpp, -// in the function template void morph3x3 -template -static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - bool kernel[9]; - for (int i = 0; i < 9; i++) - { - kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0; - } - - constexpr int noval = std::numeric_limits::max(); - auto access = [&](int x, int y) { - int pi, pj; - if (data->borderType & BORDER_ISOLATED) - { - pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pj = filter::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - pj = pj < 0 ? noval : pj; - } - else - { - pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - pj = pj < 0 ? 
noval : pj - offset_x; - } - return std::make_pair(pi, pj); - }; - - auto process = [&](int x, int y) { - if (data->src_type == CV_8UC1) - { - uchar val = rvv::init(); - for (int i = 0; i < 9; i++) - { - if (kernel[i]) - { - auto p = access(x + i / 3, y + i % 3); - if (p.first != noval && p.second != noval) - { - val = rvv::mop(val, src_data[p.first * src_step + p.second]); - } - else - { - val = rvv::mop(val, data->borderValue[0]); - } - } - } - dst_data[x * width + y] = val; - } - else - { - uchar val0, val1, val2, val3; - val0 = val1 = val2 = val3 = rvv::init(); - for (int i = 0; i < 9; i++) - { - if (kernel[i]) - { - auto p = access(x + i / 3, y + i % 3); - if (p.first != noval && p.second != noval) - { - val0 = rvv::mop(val0, src_data[p.first * src_step + p.second * 4 ]); - val1 = rvv::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]); - val2 = rvv::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]); - val3 = rvv::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]); - } - else - { - val0 = rvv::mop(val0, data->borderValue[0]); - val1 = rvv::mop(val1, data->borderValue[1]); - val2 = rvv::mop(val2, data->borderValue[2]); - val3 = rvv::mop(val3, data->borderValue[3]); - } - } - } - dst_data[(x * width + y) * 4 ] = val0; - dst_data[(x * width + y) * 4 + 1] = val1; - dst_data[(x * width + y) * 4 + 2] = val2; - dst_data[(x * width + y) * 4 + 3] = val3; - } - }; - - const int left = data->anchor_x, right = width - (2 - data->anchor_x); - for (int i = start; i < end; i++) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first == noval ? 
nullptr : src_data + access(i + 2, 0).first * src_step; - if (data->src_type == CV_8UC1) - { - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m4(right - j); - auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { - if (!row) - { - m0 = rvv::vop(m0, data->borderValue[0], vl); - return; - } - - const uchar* extra = row + j - data->anchor_x; - auto v0 = __riscv_vle8_v_u8m4(extra, vl); - - if (k0) m0 = rvv::vop(m0, v0, vl); - v0 = __riscv_vslide1down(v0, extra[vl], vl); - if (k1) m0 = rvv::vop(m0, v0, vl); - if (!k2) return; - v0 = __riscv_vslide1down(v0, extra[vl + 1], vl); - m0 = rvv::vop(m0, v0, vl); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - __riscv_vse8(dst_data + i * width + j, m0, vl); - } - } - else - { - int vl, vl0, vl1; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m4(right - j); - vl0 = std::min(vl, (int)__riscv_vlenb() * 2); - vl1 = vl - vl0; - auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m1 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m2 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m3 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - - auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { - if (k0) a = rvv::vop(a, b, vl); - b = __riscv_vslide1down(b, r1, vl); - if (k1) a = rvv::vop(a, b, vl); - if (!k2) return a; - b = __riscv_vslide1down(b, r2, vl); - return rvv::vop(a, b, vl); - }; - auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { - if (!row) - { - m0 = rvv::vop(m0, data->borderValue[0], vl); - m1 = rvv::vop(m1, data->borderValue[1], vl); - m2 = rvv::vop(m2, data->borderValue[2], vl); - m3 = rvv::vop(m3, data->borderValue[3], vl); - return; - } - - vuint8m4_t v0{}, v1{}, v2{}, v3{}; - const uchar* extra = row + (j - data->anchor_x) * 4; - auto src = 
__riscv_vlseg4e8_v_u8m2x4(extra, vl0); - v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0)); - v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1)); - v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2)); - v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3)); - src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); - v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0)); - v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1)); - v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2)); - v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3)); - - extra += vl * 4; - m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]); - m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]); - m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]); - m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - vuint8m2x4_t val{}; - val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0)); - __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0); - val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 1)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 1)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1)); - __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, 
uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/) -{ - Morph2D* data = reinterpret_cast(context); - int cn = data->src_type == CV_8UC1 ? 1 : 4; - std::vector dst(width * height * cn); - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->operation) - { - case CV_HAL_MORPH_ERODE: - res = filter::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); - break; - case CV_HAL_MORPH_DILATE: - res = filter::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); - break; - } - - for (int i = 0; i < height; i++) - memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn); - return res; -} - -inline int morphFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context)->borderValue; - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::morph - -namespace gaussianBlurBinomial { -#undef cv_hal_gaussianBlurBinomial -#define cv_hal_gaussianBlurBinomial cv::cv_hal_rvv::gaussianBlurBinomial::gaussianBlurBinomial - -// the algorithm is same as cv_hal_sepFilter -template -static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) -{ - using T = typename helperT::ElemType; - using WT = typename helperWT::ElemType; - - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); - return pi < 0 ? 
noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; - std::vector res(width * ksize); - auto process = [&](int x, int y) { - WT sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += kernel[ksize == 5][i] * static_cast(reinterpret_cast(src_data + x * src_step)[p]); - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = ksize / 2, right = width - ksize / 2; - for (int i = start - ksize / 2; i < end + ksize / 2; i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = helperT::setvl(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - ksize / 2; - auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl); - - extra += vl; - auto sum = src; - if (ksize == 3) - { - src = __riscv_vslide1down(src, extra[0], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl); - src = __riscv_vslide1down(src, extra[1], vl); - sum = __riscv_vadd(sum, src, vl); - } - else - { - src = __riscv_vslide1down(src, extra[0], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); - src = __riscv_vslide1down(src, extra[1], vl); - sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl); - src = __riscv_vslide1down(src, extra[2], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); - src = __riscv_vslide1down(src, extra[3], vl); - sum = __riscv_vadd(sum, src, vl); - } - helperWT::vstore(res.data() + p2idx(i, j), 
sum, vl); - } - } - } - - int cur = i - ksize / 2; - if (cur >= start) - { - const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const WT* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = helperWT::setvl(width - j); - auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); - auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl); - auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl); - typename helperWT::VecType sum; - if (ksize == 3) - { - sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl); - } - else - { - sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl); - auto v3 = row3 ? helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl); - sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl); - auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl); - sum = __riscv_vadd(sum, v4, vl); - } - helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 
8 : 4, __RISCV_VXRM_RNU, vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; }; - - constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; - std::vector res(width * ksize * 4); - auto process = [&](int x, int y) { - ushort sum0, sum1, sum2, sum3; - sum0 = sum1 = sum2 = sum3 = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum0 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 ]); - sum1 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 1]); - sum2 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 2]); - sum3 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 3]); - } - } - res[p2idx(x, y) ] = sum0; - res[p2idx(x, y) + 1] = sum1; - res[p2idx(x, y) + 2] = sum2; - res[p2idx(x, y) + 3] = sum3; - }; - - const int left = ksize / 2, right = width - ksize / 2; - for (int i = start - ksize / 2; i < end + ksize / 2; i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = 
__riscv_vsetvl_e8m1(right - j); - const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl); - auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl); - auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl); - auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl); - - extra += vl * 4; - auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3; - if (ksize == 3) - { - src0 = __riscv_vslide1down(src0, extra[0], vl); - src1 = __riscv_vslide1down(src1, extra[1], vl); - src2 = __riscv_vslide1down(src2, extra[2], vl); - src3 = __riscv_vslide1down(src3, extra[3], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl); - src0 = __riscv_vslide1down(src0, extra[4], vl); - src1 = __riscv_vslide1down(src1, extra[5], vl); - src2 = __riscv_vslide1down(src2, extra[6], vl); - src3 = __riscv_vslide1down(src3, extra[7], vl); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - else - { - src0 = __riscv_vslide1down(src0, extra[0], vl); - src1 = __riscv_vslide1down(src1, extra[1], vl); - src2 = __riscv_vslide1down(src2, extra[2], vl); - src3 = __riscv_vslide1down(src3, extra[3], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - src0 = __riscv_vslide1down(src0, extra[4], vl); - src1 = __riscv_vslide1down(src1, extra[5], vl); - src2 = __riscv_vslide1down(src2, extra[6], vl); - src3 = __riscv_vslide1down(src3, extra[7], vl); - 
sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl); - src0 = __riscv_vslide1down(src0, extra[ 8], vl); - src1 = __riscv_vslide1down(src1, extra[ 9], vl); - src2 = __riscv_vslide1down(src2, extra[10], vl); - src3 = __riscv_vslide1down(src3, extra[11], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - src0 = __riscv_vslide1down(src0, extra[12], vl); - src1 = __riscv_vslide1down(src1, extra[13], vl); - src2 = __riscv_vslide1down(src2, extra[14], vl); - src3 = __riscv_vslide1down(src3, extra[15], vl); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - - vuint16m2x4_t dst{}; - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3); - __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl); - } - } - } - - int cur = i - ksize / 2; - if (cur >= start) - { - const ushort* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const ushort* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e16m2(width - j); - vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{}; - sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl); - - auto loadres = [&](const ushort* row) { - auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl); - src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0); - src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1); - src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2); - src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3); - }; - if (row0) - { - loadres(row0); - sum0 = src0; - sum1 = src1; - sum2 = src2; - sum3 = src3; - } - if (row1) - { - loadres(row1); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 
2 : 1, vl), vl); - } - if (row2) - { - loadres(row2); - if (ksize == 5) - { - src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl); - src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl); - src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl); - src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl); - } - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - if (row3) - { - loadres(row3); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - } - if (row4) - { - loadres(row4); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - - vuint8m1x4_t dst{}; - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 
8 : 4, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) -{ - const int type = CV_MAKETYPE(depth, cn); - if ((type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1) || src_data == dst_data) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((ksize != 3 && ksize != 5) || border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - switch (ksize*100 + type) - { - case 300 + CV_8UC1: - return filter::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_8UC1: - return filter::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 300 + CV_16UC1: - return filter::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_16UC1: - return filter::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 300 + CV_8UC4: - return filter::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, 
border_type); - case 500 + CV_8UC4: - return filter::invoke(height, {gaussianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::gaussianBlurBinomial - -namespace medianBlur { -#undef cv_hal_medianBlur -#define cv_hal_medianBlur cv::cv_hal_rvv::medianBlur::medianBlur - -// the algorithm is copied from imgproc/src/median_blur.simd.cpp -// in the function template static void medianBlur_SortNet -template -static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - using T = typename helper::ElemType; - using VT = typename helper::VecType; - - for (int i = start; i < end; i++) - { - const T* row0 = reinterpret_cast(src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step); - const T* row1 = reinterpret_cast(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step); - const T* row2 = reinterpret_cast(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step); - const T* row3 = reinterpret_cast(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step); - const T* row4 = reinterpret_cast(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step); - int vl; - auto vop = [&vl](VT& a, VT& b) { - auto t = a; - a = helper::vmin(a, b, vl); - b = helper::vmax(t, b, vl); - }; - - for (int j = 0; j < width; j += vl) - { - vl = helper::setvl(width - j); - if (ksize == 3) - { - VT p0, p1, p2; - VT p3, p4, p5; - VT p6, p7, p8; - if (j != 0) - { - p0 = helper::vload(row0 + j - 1, vl); - p3 = helper::vload(row1 + j - 1, vl); - p6 = helper::vload(row2 + j - 1, vl); - } - else - { - p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); - p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); - p6 = 
helper::vslide1up(helper::vload(row2, vl), row2[0], vl); - } - p1 = helper::vslide1down(p0, row0[j + vl - 1], vl); - p4 = helper::vslide1down(p3, row1[j + vl - 1], vl); - p7 = helper::vslide1down(p6, row2[j + vl - 1], vl); - p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl); - p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl); - p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p4, vl); - } - else - { - VT p0, p1, p2, p3, p4; - VT p5, p6, p7, p8, p9; - VT p10, p11, p12, p13, p14; - VT p15, p16, p17, p18, p19; - VT p20, p21, p22, p23, p24; - if (j >= 2) - { - p0 = helper::vload(row0 + j - 2, vl); - p5 = helper::vload(row1 + j - 2, vl); - p10 = helper::vload(row2 + j - 2, vl); - p15 = helper::vload(row3 + j - 2, vl); - p20 = helper::vload(row4 + j - 2, vl); - } - else - { - p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); - p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); - p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); - p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl); - p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl); - if (j == 0) - { - p0 = helper::vslide1up(p0, row0[0], vl); - p5 = helper::vslide1up(p5, row1[0], vl); - p10 = helper::vslide1up(p10, row2[0], vl); - p15 = helper::vslide1up(p15, row3[0], vl); - p20 = helper::vslide1up(p20, row4[0], vl); - } - } - p1 = helper::vslide1down(p0, row0[j + vl - 2], vl); - p6 = helper::vslide1down(p5, row1[j + vl - 2], vl); - p11 = helper::vslide1down(p10, row2[j + vl - 2], vl); - p16 = helper::vslide1down(p15, row3[j + vl - 2], vl); - p21 = helper::vslide1down(p20, row4[j + vl - 2], 
vl); - p2 = helper::vslide1down(p1, row0[j + vl - 1], vl); - p7 = helper::vslide1down(p6, row1[j + vl - 1], vl); - p12 = helper::vslide1down(p11, row2[j + vl - 1], vl); - p17 = helper::vslide1down(p16, row3[j + vl - 1], vl); - p22 = helper::vslide1down(p21, row4[j + vl - 1], vl); - p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl); - p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl); - p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl); - p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl); - p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl); - p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl); - p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl); - p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl); - p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl); - p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl); - - vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); - vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); - vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); - vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); - vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); - vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); - vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); - vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); - vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); - vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); - vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); - vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); - vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); - vop(p22, p24); vop(p19, p20); vop(p21, 
p22); vop(p23, p24); vop(p12, p18); - vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16); - vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); - vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); - vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); - vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); - vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); - vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); - vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); - vop(p7, p11); vop(p11, p13); vop(p11, p12); - helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p12, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - for (int i = start; i < end; i++) - { - const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step; - const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step; - const uchar* row2 = src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step; - const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step; - const uchar* row4 = src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step; - int vl; - for (int j = 0; j < width; j += vl) - { - if (ksize == 3) - { - vl = __riscv_vsetvl_e8m1(width - j); - vuint8m1_t p00, p01, p02; - vuint8m1_t p03, p04, p05; - vuint8m1_t p06, p07, p08; - vuint8m1_t p10, p11, p12; - vuint8m1_t p13, p14, p15; - vuint8m1_t p16, p17, p18; - vuint8m1_t p20, p21, p22; - vuint8m1_t p23, p24, p25; - vuint8m1_t p26, p27, p28; - vuint8m1_t p30, p31, p32; - vuint8m1_t p33, p34, p35; - vuint8m1_t p36, p37, p38; - auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, 
vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) { - auto src = __riscv_vlseg4e8_v_u8m1x4(row, vl); - p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0); - p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1); - p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2); - p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3); - }; - if (j != 0) - { - loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30); - loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33); - loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36); - } - else - { - loadsrc(row0, p00, p10, p20, p30); - loadsrc(row1, p03, p13, p23, p33); - loadsrc(row2, p06, p16, p26, p36); - p00 = __riscv_vslide1up(p00, row0[0], vl); - p10 = __riscv_vslide1up(p10, row0[1], vl); - p20 = __riscv_vslide1up(p20, row0[2], vl); - p30 = __riscv_vslide1up(p30, row0[3], vl); - p03 = __riscv_vslide1up(p03, row1[0], vl); - p13 = __riscv_vslide1up(p13, row1[1], vl); - p23 = __riscv_vslide1up(p23, row1[2], vl); - p33 = __riscv_vslide1up(p33, row1[3], vl); - p06 = __riscv_vslide1up(p06, row2[0], vl); - p16 = __riscv_vslide1up(p16, row2[1], vl); - p26 = __riscv_vslide1up(p26, row2[2], vl); - p36 = __riscv_vslide1up(p36, row2[3], vl); - } - p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl); - p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl); - p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl); - p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl); - p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl); - p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl); - p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl); - p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl); - p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl); - p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl); - p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl); - p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl); - p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl); - p12 = 
__riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl); - p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl); - p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl); - p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl); - p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl); - p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl); - p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl); - p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl); - p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl); - p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], vl); - p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl); - - auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) { - auto t = a; - a = __riscv_vminu(a, b, vl); - b = __riscv_vmaxu(t, b, vl); - }; - vuint8m1x4_t dst{}; - vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01); - vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05); - vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07); - vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07); - vop(p04, p02); vop(p06, p04); vop(p04, p02); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04); - vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11); - vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15); - vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17); - vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17); - vop(p14, p12); vop(p16, p14); vop(p14, p12); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14); - vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21); - vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25); - vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27); - vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27); - vop(p24, p22); vop(p26, 
p24); vop(p24, p22); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24); - vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31); - vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35); - vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37); - vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37); - vop(p34, p32); vop(p36, p34); vop(p34, p32); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34); - __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); - } - else - { - vl = __riscv_vsetvl_e8m2(width - j); - vuint8m2_t p00, p01, p02, p03, p04; - vuint8m2_t p05, p06, p07, p08, p09; - vuint8m2_t p010, p011, p012, p013, p014; - vuint8m2_t p015, p016, p017, p018, p019; - vuint8m2_t p020, p021, p022, p023, p024; - vuint8m2_t p10, p11, p12, p13, p14; - vuint8m2_t p15, p16, p17, p18, p19; - vuint8m2_t p110, p111, p112, p113, p114; - vuint8m2_t p115, p116, p117, p118, p119; - vuint8m2_t p120, p121, p122, p123, p124; - vuint8m2_t p20, p21, p22, p23, p24; - vuint8m2_t p25, p26, p27, p28, p29; - vuint8m2_t p210, p211, p212, p213, p214; - vuint8m2_t p215, p216, p217, p218, p219; - vuint8m2_t p220, p221, p222, p223, p224; - vuint8m2_t p30, p31, p32, p33, p34; - vuint8m2_t p35, p36, p37, p38, p39; - vuint8m2_t p310, p311, p312, p313, p314; - vuint8m2_t p315, p316, p317, p318, p319; - vuint8m2_t p320, p321, p322, p323, p324; - auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) { - auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl); - p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0); - p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1); - p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2); - p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3); - }; - if (j >= 2) - { - loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30); - loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35); - loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310); - loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315); - loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320); - } - else - { - loadsrc(row0, 
p00, p10, p20, p30); - loadsrc(row1, p05, p15, p25, p35); - loadsrc(row2, p010, p110, p210, p310); - loadsrc(row3, p015, p115, p215, p315); - loadsrc(row4, p020, p120, p220, p320); - auto slideup = [&] { - p00 = __riscv_vslide1up(p00, row0[0], vl); - p10 = __riscv_vslide1up(p10, row0[1], vl); - p20 = __riscv_vslide1up(p20, row0[2], vl); - p30 = __riscv_vslide1up(p30, row0[3], vl); - p05 = __riscv_vslide1up(p05, row1[0], vl); - p15 = __riscv_vslide1up(p15, row1[1], vl); - p25 = __riscv_vslide1up(p25, row1[2], vl); - p35 = __riscv_vslide1up(p35, row1[3], vl); - p010 = __riscv_vslide1up(p010, row2[0], vl); - p110 = __riscv_vslide1up(p110, row2[1], vl); - p210 = __riscv_vslide1up(p210, row2[2], vl); - p310 = __riscv_vslide1up(p310, row2[3], vl); - p015 = __riscv_vslide1up(p015, row3[0], vl); - p115 = __riscv_vslide1up(p115, row3[1], vl); - p215 = __riscv_vslide1up(p215, row3[2], vl); - p315 = __riscv_vslide1up(p315, row3[3], vl); - p020 = __riscv_vslide1up(p020, row4[0], vl); - p120 = __riscv_vslide1up(p120, row4[1], vl); - p220 = __riscv_vslide1up(p220, row4[2], vl); - p320 = __riscv_vslide1up(p320, row4[3], vl); - }; - slideup(); - if (j == 0) - { - slideup(); - } - } - p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl); - p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl); - p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl); - p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl); - p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl); - p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl); - p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl); - p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl); - p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl); - p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl); - p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl); - p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl); - p016 = 
__riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl); - p116 = __riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl); - p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl); - p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl); - p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl); - p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl); - p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl); - p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl); - p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl); - p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl); - p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl); - p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl); - p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl); - p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl); - p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl); - p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl); - p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl); - p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl); - p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl); - p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl); - p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl); - p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl); - p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl); - p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl); - p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl); - p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl); - p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl); - p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl); - p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, j + vl) * 4 ], vl); - p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + 
vl) * 4 + 1], vl); - p23 = __riscv_vslide1down(p22, row0[std::min(width - 1, j + vl) * 4 + 2], vl); - p33 = __riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl); - p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl); - p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl); - p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl); - p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl); - p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl); - p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl); - p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl); - p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl); - p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl); - p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl); - p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl); - p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl); - p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl); - p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl); - p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl); - p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl); - p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl); - p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl); - p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p29 
= __riscv_vslide1down(p28, row1[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl); - p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl); - p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl); - p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - - auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) { - auto t = a; - a = __riscv_vminu(a, b, vl); - b = __riscv_vmaxu(t, b, vl); - }; - vuint8m2x4_t dst{}; - vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04); - vop(p04, p05); vop(p00, p03); vop(p02, p05); vop(p02, p03); vop(p01, p04); - vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08); - vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, p011); - vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06); - vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08); - vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05); - vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08); - vop(p09, p010); 
vop(p013, p014); vop(p012, p013); vop(p013, p014); vop(p016, p017); - vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015); - vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019); - vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024); - vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022); - vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018); - vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016); - vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019); - vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016); - vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012); - vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016); - vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010); - vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017); - vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019); - vop(p07, p011); vop(p011, p013); vop(p011, p012); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012); - vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14); - vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14); - vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18); - vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111); - vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16); - vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18); - vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15); - vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18); - vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117); 
- vop(p115, p116); vop(p116, p117); vop(p112, p115); vop(p114, p117); vop(p114, p115); - vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119); - vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124); - vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122); - vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118); - vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116); - vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119); - vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116); - vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112); - vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116); - vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110); - vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, p117); - vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119); - vop(p17, p111); vop(p111, p113); vop(p111, p112); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, p112); - vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24); - vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24); - vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28); - vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211); - vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26); - vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28); - vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25); - vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28); - vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217); - vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, 
p217); vop(p214, p215); - vop(p213, p216); vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219); - vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224); - vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222); - vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218); - vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216); - vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219); - vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216); - vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212); - vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216); - vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210); - vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217); - vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219); - vop(p27, p211); vop(p211, p213); vop(p211, p212); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212); - vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34); - vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34); - vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38); - vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311); - vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36); - vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38); - vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35); - vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38); - vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317); - vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315); - vop(p313, p316); vop(p313, p314); 
vop(p315, p316); vop(p319, p320); vop(p318, p319); - vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324); - vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322); - vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318); - vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316); - vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319); - vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316); - vop(p317, p318); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312); - vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316); - vop(p32, p314); vop(p310, p322); vop(p310, p314); vop(p36, p318); vop(p36, p310); - vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317); - vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319); - vop(p37, p311); vop(p311, p313); vop(p311, p312); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312); - __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize) -{ - const int type = CV_MAKETYPE(depth, cn); - if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((ksize != 3 && ksize != 5) || src_data == dst_data) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - switch (ksize*100 + type) - { - case 300 + CV_8UC1: - return filter::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_16UC1: - return filter::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_16SC1: - return 
filter::invoke(height, {medianBlurC1<3, RVV_I16M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_32FC1: - return filter::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_8UC1: - return filter::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_16UC1: - return filter::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_16SC1: - return filter::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_32FC1: - return filter::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height); - - case 300 + CV_8UC4: - return filter::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_8UC4: - return filter::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height); - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::medianBlur - -namespace boxFilter { -#undef cv_hal_boxFilter -#define cv_hal_boxFilter cv::cv_hal_rvv::boxFilter::boxFilter - -template struct rvv; -template<> struct rvv -{ - static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); } - static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); } - static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); } - static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); } - static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, 
c); } -}; -template<> struct rvv -{ - static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; } - static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { return a; } - static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } - static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } - static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); } -}; - -// the algorithm is same as cv_hal_sepFilter -template -static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - using T = typename helperT::ElemType; - using WT = typename helperWT::ElemType; - - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); - return pj < 0 ? 
noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - std::vector res(width * ksize); - auto process = [&](int x, int y) { - WT sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += reinterpret_cast(src_data + x * src_step)[p]; - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = anchor_x, right = width - (ksize - 1 - anchor_x); - for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = helperT::setvl(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - anchor_x; - auto src = rvv::vcvt0(helperT::vload(extra, vl), vl); - - extra += vl; - auto sum = src; - src = helperWT::vslide1down(src, extra[0], vl); - sum = helperWT::vadd(sum, src, vl); - src = helperWT::vslide1down(src, extra[1], vl); - sum = helperWT::vadd(sum, src, vl); - if (ksize == 5) - { - src = helperWT::vslide1down(src, extra[2], vl); - sum = helperWT::vadd(sum, src, vl); - src = helperWT::vslide1down(src, extra[3], vl); - sum = helperWT::vadd(sum, src, vl); - } - helperWT::vstore(res.data() + p2idx(i, j), sum, vl); - } - } - } - - int cur = i - (ksize - 1 - anchor_y); - if (cur >= start) - { - const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const WT* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = helperWT::setvl(width - j); - auto sum = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); - if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl); - if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl); - if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl); - if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl); - if (normalize) sum = rvv::vdiv(sum, ksize * ksize, vl); - - if (cast) - { - helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, rvv::vcvt1(sum, vl), vl); - } - else - { - helperWT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); - return pj < 0 ? 
noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; }; - - std::vector res(width * ksize * 3); - auto process = [&](int x, int y) { - float sum0, sum1, sum2; - sum0 = sum1 = sum2 = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum0 += reinterpret_cast(src_data + x * src_step)[p * 3 ]; - sum1 += reinterpret_cast(src_data + x * src_step)[p * 3 + 1]; - sum2 += reinterpret_cast(src_data + x * src_step)[p * 3 + 2]; - } - } - res[p2idx(x, y) ] = sum0; - res[p2idx(x, y) + 1] = sum1; - res[p2idx(x, y) + 2] = sum2; - }; - - const int left = anchor_x, right = width - (ksize - 1 - anchor_x); - for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e32m2(right - j); - const float* extra = reinterpret_cast(src_data + i * src_step) + (j - anchor_x) * 3; - auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - extra += vl * 3; - auto sum0 = src0, sum1 = src1, sum2 = src2; - src0 = __riscv_vfslide1down(src0, extra[0], vl); - src1 = __riscv_vfslide1down(src1, extra[1], vl); - src2 = __riscv_vfslide1down(src2, extra[2], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - src0 = __riscv_vfslide1down(src0, extra[3], vl); - src1 = __riscv_vfslide1down(src1, extra[4], vl); - src2 = __riscv_vfslide1down(src2, extra[5], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = 
__riscv_vfadd(sum2, src2, vl); - if (ksize == 5) - { - src0 = __riscv_vfslide1down(src0, extra[6], vl); - src1 = __riscv_vfslide1down(src1, extra[7], vl); - src2 = __riscv_vfslide1down(src2, extra[8], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - src0 = __riscv_vfslide1down(src0, extra[ 9], vl); - src1 = __riscv_vfslide1down(src1, extra[10], vl); - src2 = __riscv_vfslide1down(src2, extra[11], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - } - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); - __riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl); - } - } - } - - int cur = i - (ksize - 1 - anchor_y); - if (cur >= start) - { - const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const float* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - vfloat32m2_t sum0, sum1, sum2; - sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl); - auto loadres = [&](const float* row) { - if (!row) return; - auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl); - sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl); - sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl); - sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl); - }; - loadres(row0); - loadres(row1); - loadres(row2); - loadres(row3); - loadres(row4); - if (normalize) - { - sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl); - sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl); - sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl); - } - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); - __riscv_vsseg3e32(reinterpret_cast(dst_data + cur * dst_step) + j * 3, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn); - if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5)) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - uchar* _dst_data = dst_data; - size_t _dst_step = dst_step; - const size_t size = CV_ELEM_SIZE(dst_type); - std::vector dst; - if (src_data == _dst_data) - { - dst = std::vector(width * height * size); - 
dst_data = dst.data(); - dst_step = width * size; - } - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? ksize_height / 2 : anchor_y; - if (src_type != dst_type) - { - if (src_type == CV_8UC1 && dst_type == CV_16UC1) - { - if (ksize_width == 3) - { - res = filter::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - } - if (ksize_width == 5) - { - res = filter::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - } - } - } - else - { - switch (ksize_width*100 + src_type) - { - case 300 + CV_8UC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_8UC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_16SC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_16SC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, 
margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32SC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32SC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32FC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32FC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32FC3: - res = filter::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32FC3: - res = filter::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - } - } - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - if (src_data == 
_dst_data) - { - for (int i = 0; i < height; i++) - memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); - } - - return res; -} -} // cv::cv_hal_rvv::boxFilter - -namespace bilateralFilter { -#undef cv_hal_bilateralFilter -#define cv_hal_bilateralFilter cv::cv_hal_rvv::bilateralFilter::bilateralFilter - -// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp -// in the functor BilateralFilter_8u_Invoker -static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) -{ - constexpr int align = 31; - std::vector _sum(width + align), _wsum(width + align); - float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const uchar* sptr = src_data + (i+radius) * src_step + radius; - memset(sum, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto src = __riscv_vle8_v_u8m2(sptr + j, vl); - auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl); - auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl); - auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto dst = 
__riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) -{ - constexpr int align = 31; - std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); - float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); - float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); - float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const uchar* sptr = src_data + (i+radius) * src_step + radius*3; - memset(sum_b, 0, sizeof(float) * width); - memset(sum_g, 0, sizeof(float) * width); - memset(sum_r, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); - auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); - auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); - src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl); - auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); - auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); - auto ksrc2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); - - auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl); - auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl); - auto diff2 = 
__riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl); - auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl); - __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl); - __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl); - vuint8m2x3_t dst{}; - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl)); - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl)); - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl)); - __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp -// in the functor BilateralFilter_32f_Invoker -static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) -{ - constexpr 
int align = 31; - std::vector _sum(width + align), _wsum(width + align); - float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius; - memset(sum, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto src = __riscv_vle32_v_f32m4(sptr + j, vl); - auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl); - auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl); - auto idx = __riscv_vfcvt_rtz_x(diff, vl); - auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); - - auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl); - auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m4(wsum + j, vl), vl), vl); - __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto src = __riscv_vle32_v_f32m4(sptr + j, vl); - auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl); - __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int 
width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) -{ - constexpr int align = 31; - std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); - float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); - float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); - float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius*3; - memset(sum_b, 0, sizeof(float) * width); - memset(sum_g, 0, sizeof(float) * width); - memset(sum_r, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl); - auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl); - auto idx = __riscv_vfcvt_rtz_x(diff, vl); - auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); - - auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl); - auto w = __riscv_vfmadd(alpha, 
__riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl); - __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl); - __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + j, vl), vl), vl); - __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl); - auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl)); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl)); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl)); - __riscv_vsseg3e32(reinterpret_cast(dst_data + i * dst_step) + j * 3, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp -// in the function static void bilateralFilter_8u and bilateralFilter_32f -inline int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type) -{ - const int type = CV_MAKETYPE(depth, cn); - if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && 
type != CV_32FC3) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (type == CV_32FC1 && width * height > 1 << 20) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_data == dst_data || border_type & BORDER_ISOLATED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - sigma_color = sigma_color <= 0 ? 1 : sigma_color; - sigma_space = sigma_space <= 0 ? 1 : sigma_space; - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2; - radius = std::max(radius, 1); - d = radius*2 + 1; - - const int size = depth == CV_32F ? cn * sizeof(float) : cn; - const int temp_step = (width + radius * 2) * size; - std::vector _temp((width + radius * 2) * (height + radius * 2) * size, 0); - uchar* temp = _temp.data(); - std::vector width_interpolate(radius * 2); - for (int j = 0; j < radius; j++) - { - width_interpolate[j] = filter::borderInterpolate(j - radius, width, border_type); - width_interpolate[j + radius] = filter::borderInterpolate(width + j, width, border_type); - } - for (int i = 0; i < height + radius * 2; i++) - { - int x = filter::borderInterpolate(i - radius, height, border_type); - if (x != -1) - { - for (int j = 0; j < radius; j++) - { - int y = width_interpolate[j]; - if (y != -1) - memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size); - y = width_interpolate[j + radius]; - if (y != -1) - memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size); - } - memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size); - } - } - - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = _space_weight.data(); - int* space_ofs = _space_ofs.data(); - int maxk = 0; - for (int i = -radius; i <= radius; i++) - { - for (int j = -radius; j <= radius; j++) - { - double r = std::sqrt((double)i*i + (double)j*j); - if (r <= radius && (depth == CV_8U || i != 0 
|| j != 0)) - { - space_weight[maxk] = static_cast(r*r*gauss_space_coeff); - space_ofs[maxk++] = (i * (temp_step / size) + j) * cn; - } - } - } - cv::cv_hal_rvv::exp32f(space_weight, space_weight, maxk); - - if (depth == CV_8U) - { - std::vector _color_weight(cn*256); - float* color_weight = _color_weight.data(); - for (int i = 0; i < 256*cn; i++) - color_weight[i] = static_cast(i*i*gauss_color_coeff); - cv::cv_hal_rvv::exp32f(color_weight, color_weight, 256*cn); - - switch (cn) - { - case 1: - return filter::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); - case 3: - return filter::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); - } - } - else - { - double minValSrc = -1, maxValSrc = 1; - cv::cv_hal_rvv::minmax::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - for (int i = 0; i < width; i++) - memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size); - return CV_HAL_ERROR_OK; - } - - const int kExpNumBinsPerChannel = 1 << 12; - const int kExpNumBins = kExpNumBinsPerChannel * cn; - const float scale_index = kExpNumBins / static_cast((maxValSrc - minValSrc) * cn); - std::vector _expLUT(kExpNumBins+2, 0); - float* expLUT = _expLUT.data(); - for (int i = 0; i < kExpNumBins+2; i++) - { - double val = i / scale_index; - expLUT[i] = static_cast(val * val * gauss_color_coeff); - } - cv::cv_hal_rvv::exp32f(expLUT, expLUT, kExpNumBins+2); - - switch (cn) - { - case 1: - return filter::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); - case 3: - return filter::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, 
space_weight, expLUT, scale_index); - } - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::bilateralFilter - -}} - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp b/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp deleted file mode 100644 index feab2047e5..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED -#define OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED - -#include -#include "hal_rvv_1p0/sincos.hpp" -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_polarToCart32f -#define cv_hal_polarToCart32f cv::cv_hal_rvv::polarToCart -#undef cv_hal_polarToCart64f -#define cv_hal_polarToCart64f cv::cv_hal_rvv::polarToCart - -template -inline int - polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) -{ - using T = RVV_F32M4; - const auto sincos_scale = angleInDegrees ? 
detail::sincos_deg_scale : detail::sincos_rad_scale; - - size_t vl; - auto cos_p2 = T::vmv(detail::sincos_cos_p2, T::setvlmax()); - auto cos_p0 = T::vmv(detail::sincos_cos_p0, T::setvlmax()); - for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) - { - vl = RVV_T::setvl(len); - auto vangle = T::cast(RVV_T::vload(angle, vl), vl); - T::VecType vsin, vcos; - detail::SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); - if (mag) - { - auto vmag = T::cast(RVV_T::vload(mag, vl), vl); - vsin = __riscv_vfmul(vsin, vmag, vl); - vcos = __riscv_vfmul(vcos, vmag, vl); - mag += vl; - } - RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); - RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif // OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp b/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp deleted file mode 100644 index b87998d637..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp +++ /dev/null @@ -1,131 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED -#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED - -#include -#include -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_sqrt32f -#undef cv_hal_sqrt64f -#undef cv_hal_invSqrt32f -#undef cv_hal_invSqrt64f - -#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt> -#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt> - -#ifdef __clang__ -// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. -// So a smaller LMUL is used here. 
-# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> -# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> -#else -# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> -# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> -#endif - -namespace detail { - -// Newton-Raphson method -// Use 4 LMUL registers -template -inline VEC_T sqrt(VEC_T x, size_t vl) -{ - auto x2 = __riscv_vfmul(x, 0.5, vl); - auto y = __riscv_vfrsqrt7(x, vl); -#ifdef __clang__ -#pragma unroll -#endif - for (size_t i = 0; i < iter_times; i++) - { - auto t = __riscv_vfmul(y, y, vl); - t = __riscv_vfmul(t, x2, vl); - t = __riscv_vfrsub(t, 1.5, vl); - y = __riscv_vfmul(t, y, vl); - } - // just to prevent the compiler from calculating mask before the iteration, which will run out - // of registers and cause memory access. - asm volatile("" ::: "memory"); - auto classified = __riscv_vfclass(x, vl); - // block -0, +0, positive subnormal number, +inf - auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); - return __riscv_vfmul_mu(mask, x, x, y, vl); -} - -// Newton-Raphson method -// Use 3 LMUL registers and 1 mask register -template -inline VEC_T invSqrt(VEC_T x, size_t vl) -{ - auto classified = __riscv_vfclass(x, vl); - // block -0, +0, positive subnormal number, +inf - auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); - auto x2 = __riscv_vfmul(x, 0.5, vl); - auto y = __riscv_vfrsqrt7(x, vl); -#ifdef __clang__ -#pragma unroll -#endif - for (size_t i = 0; i < iter_times; i++) - { - auto t = __riscv_vfmul(y, y, vl); - t = __riscv_vfmul(t, x2, vl); - t = __riscv_vfrsub(t, 1.5, vl); - y = __riscv_vfmul_mu(mask, y, t, y, vl); - } - return y; -} - -} // namespace detail - -template -struct Sqrt32f -{ - using T = RVV_T; - static constexpr size_t iter_times = 2; -}; - -template -struct Sqrt64f -{ - using T = RVV_T; - static constexpr size_t iter_times = 3; -}; - -template -inline int sqrt(const Elem* src, Elem* dst, int _len) -{ - size_t vl; - for (size_t len = 
_len; len > 0; len -= vl, src += vl, dst += vl) - { - vl = SQRT_T::T::setvl(len); - auto x = SQRT_T::T::vload(src, vl); - SQRT_T::T::vstore(dst, detail::sqrt(x, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -template -inline int invSqrt(const Elem* src, Elem* dst, int _len) -{ - size_t vl; - for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) - { - vl = SQRT_T::T::setvl(len); - auto x = SQRT_T::T::vload(src, vl); - SQRT_T::T::vstore(dst, detail::invSqrt(x, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED diff --git a/hal/riscv-rvv/include/core.hpp b/hal/riscv-rvv/include/core.hpp new file mode 100644 index 0000000000..b800420d42 --- /dev/null +++ b/hal/riscv-rvv/include/core.hpp @@ -0,0 +1,332 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_RVV_HAL_CORE_HPP +#define OPENCV_RVV_HAL_CORE_HPP + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +/* ############ merge ############ */ + +int merge8u(const uchar** src, uchar* dst, int len, int cn); +int merge16u(const ushort** src, ushort* dst, int len, int cn); +int merge32s(const int** src, int* dst, int len, int cn); +int merge64s(const int64** src, int64* dst, int len, int cn); + +#undef cv_hal_merge8u +#define cv_hal_merge8u cv::rvv_hal::core::merge8u +#undef cv_hal_merge16u +#define cv_hal_merge16u cv::rvv_hal::core::merge16u +#undef cv_hal_merge32s +#define cv_hal_merge32s cv::rvv_hal::core::merge32s +#undef cv_hal_merge64s +#define cv_hal_merge64s cv::rvv_hal::core::merge64s + +/* ############ meanStdDev ############ */ + +int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type, + double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); + +#undef cv_hal_meanStdDev +#define 
cv_hal_meanStdDev cv::rvv_hal::core::meanStdDev + +/* ############ dft ############ */ + +int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, + int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute); + +#undef cv_hal_dft +#define cv_hal_dft cv::rvv_hal::core::dft + +/* ############ norm ############ */ + +int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result); + +#undef cv_hal_norm +#define cv_hal_norm cv::rvv_hal::core::norm + +/* ############ normDiff ############ */ + +int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, + const uchar* mask, size_t mask_step, int width, int height, int type, + int norm_type, double* result); + +#undef cv_hal_normDiff +#define cv_hal_normDiff cv::rvv_hal::core::normDiff + +/* ############ normHamming ############ */ + +int normHamming8u(const uchar* a, int n, int cellSize, int* result); +int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result); + +#undef cv_hal_normHamming8u +#define cv_hal_normHamming8u cv::rvv_hal::core::normHamming8u +#undef cv_hal_normHammingDiff8u +#define cv_hal_normHammingDiff8u cv::rvv_hal::core::normHammingDiff8u + +/* ############ convertScale ############ */ + +int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, + int width, int height, int sdepth, int ddepth, double alpha, double beta); + +#undef cv_hal_convertScale +#define cv_hal_convertScale cv::rvv_hal::core::convertScale + +/* ############ minMaxIdx ############ */ + +int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, + double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0); + +#undef cv_hal_minMaxIdx +#define cv_hal_minMaxIdx cv::rvv_hal::core::minMaxIdx +#undef cv_hal_minMaxIdxMaskStep +#define cv_hal_minMaxIdxMaskStep 
cv::rvv_hal::core::minMaxIdx + +/* ############ fastAtan ############ */ + +int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg); +int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg); + +#undef cv_hal_fastAtan32f +#define cv_hal_fastAtan32f cv::rvv_hal::core::fast_atan_32 +#undef cv_hal_fastAtan64f +#define cv_hal_fastAtan64f cv::rvv_hal::core::fast_atan_64 + +/* ############ split ############ */ + +int split8u(const uchar* src, uchar** dst, int len, int cn); + +#undef cv_hal_split8u +#define cv_hal_split8u cv::rvv_hal::core::split8u + +/* ############ sqrt ############ */ + +int sqrt32f(const float* src, float* dst, int _len); +int sqrt64f(const double* src, double* dst, int _len); + +#undef cv_hal_sqrt32f +#define cv_hal_sqrt32f cv::rvv_hal::core::sqrt32f +#undef cv_hal_sqrt64f +#define cv_hal_sqrt64f cv::rvv_hal::core::sqrt64f + +int invSqrt32f(const float* src, float* dst, int _len); +int invSqrt64f(const double* src, double* dst, int _len); + +#undef cv_hal_invSqrt32f +#define cv_hal_invSqrt32f cv::rvv_hal::core::invSqrt32f +#undef cv_hal_invSqrt64f +#define cv_hal_invSqrt64f cv::rvv_hal::core::invSqrt64f + +/* ############ magnitude ############ */ + +int magnitude32f(const float *x, const float *y, float *dst, int len); +int magnitude64f(const double *x, const double *y, double *dst, int len); + +#undef cv_hal_magnitude32f +#define cv_hal_magnitude32f cv::rvv_hal::core::magnitude32f +#undef cv_hal_magnitude64f +#define cv_hal_magnitude64f cv::rvv_hal::core::magnitude64f + +/* ############ cartToPolar ############ */ + +int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees); +int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees); + +#undef cv_hal_cartToPolar32f +#define cv_hal_cartToPolar32f cv::rvv_hal::core::cartToPolar32f +#undef cv_hal_cartToPolar64f +#define 
cv_hal_cartToPolar64f cv::rvv_hal::core::cartToPolar64f + +/* ############ polarToCart ############ */ + +int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees); +int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees); + +#undef cv_hal_polarToCart32f +#define cv_hal_polarToCart32f cv::rvv_hal::core::polarToCart32f +#undef cv_hal_polarToCart64f +#define cv_hal_polarToCart64f cv::rvv_hal::core::polarToCart64f + +/* ############ polarToCart ############ */ + +int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, + uchar* dst_data, size_t dst_step, int flip_mode); + +#undef cv_hal_flip +#define cv_hal_flip cv::rvv_hal::core::flip + +/* ############ lut ############ */ + +int lut(const uchar* src_data, size_t src_step, size_t src_type, + const uchar* lut_data, size_t lut_channel_size, size_t lut_channels, + uchar* dst_data, size_t dst_step, int width, int height); + +#undef cv_hal_lut +#define cv_hal_lut cv::rvv_hal::core::lut + +/* ############ exp ############ */ + +int exp32f(const float* src, float* dst, int _len); +int exp64f(const double* src, double* dst, int _len); + +#undef cv_hal_exp32f +#define cv_hal_exp32f cv::rvv_hal::core::exp32f +#undef cv_hal_exp64f +#define cv_hal_exp64f cv::rvv_hal::core::exp64f + +/* ############ log ############ */ + +int log32f(const float* src, float* dst, int _len); +int log64f(const double* src, double* dst, int _len); + +#undef cv_hal_log32f +#define cv_hal_log32f cv::rvv_hal::core::log32f +#undef cv_hal_log64f +#define cv_hal_log64f cv::rvv_hal::core::log64f + +/* ############ lu ############ */ + +int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info); +int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info); + +#undef cv_hal_LU32f +#define cv_hal_LU32f cv::rvv_hal::core::LU32f +#undef 
cv_hal_LU64f +#define cv_hal_LU64f cv::rvv_hal::core::LU64f + +/* ############ cholesky ############ */ + +int Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info); +int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info); + +#undef cv_hal_Cholesky32f +#define cv_hal_Cholesky32f cv::rvv_hal::core::Cholesky32f +#undef cv_hal_Cholesky64f +#define cv_hal_Cholesky64f cv::rvv_hal::core::Cholesky64f + +/* ############ qr ############ */ + +int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info); +int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info); + +#undef cv_hal_QR32f +#define cv_hal_QR32f cv::rvv_hal::core::QR32f +#undef cv_hal_QR64f +#define cv_hal_QR64f cv::rvv_hal::core::QR64f + +/* ############ SVD ############ */ + +int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags); +int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags); + +#undef cv_hal_SVD32f +#define cv_hal_SVD32f cv::rvv_hal::core::SVD32f +#undef cv_hal_SVD64f +#define cv_hal_SVD64f cv::rvv_hal::core::SVD64f + +/* ############ copyToMasked ############ */ + +int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, + int type, const uchar *mask_data, size_t mask_step, int mask_type); + +#undef cv_hal_copyToMasked +#define cv_hal_copyToMasked cv::rvv_hal::core::copyToMasked + +/* ############ div, recip ############ */ + +int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale); +int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar 
*dst_data, size_t dst_step, int width, int height, double scale); +int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale); +int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale); +int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale); +int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale); +// int div64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale); + +#undef cv_hal_div8u +#define cv_hal_div8u cv::rvv_hal::core::div8u +#undef cv_hal_div8s +#define cv_hal_div8s cv::rvv_hal::core::div8s +#undef cv_hal_div16u +#define cv_hal_div16u cv::rvv_hal::core::div16u +#undef cv_hal_div16s +#define cv_hal_div16s cv::rvv_hal::core::div16s +#undef cv_hal_div32s +#define cv_hal_div32s cv::rvv_hal::core::div32s +#undef cv_hal_div32f +#define cv_hal_div32f cv::rvv_hal::core::div32f +// #undef cv_hal_div64f +// #define cv_hal_div64f cv::rvv_hal::core::div64f + +int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale); +int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale); +int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale); +int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale); +int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, 
int width, int height, double scale); +int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale); +// int recip64f(const double *src_data, size_t src_step, double *dst_data, size_t dst_step, int width, int height, double scale); + +#undef cv_hal_recip8u +#define cv_hal_recip8u cv::rvv_hal::core::recip8u +#undef cv_hal_recip8s +#define cv_hal_recip8s cv::rvv_hal::core::recip8s +#undef cv_hal_recip16u +#define cv_hal_recip16u cv::rvv_hal::core::recip16u +#undef cv_hal_recip16s +#define cv_hal_recip16s cv::rvv_hal::core::recip16s +#undef cv_hal_recip32s +#define cv_hal_recip32s cv::rvv_hal::core::recip32s +#undef cv_hal_recip32f +#define cv_hal_recip32f cv::rvv_hal::core::recip32f +// #undef cv_hal_recip64f +// #define cv_hal_recip64f cv::rvv_hal::core::recip64f + +/* ############ dotProduct ############ */ + +int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, + int width, int height, int type, double *dot_val); + +#undef cv_hal_dotProduct +#define cv_hal_dotProduct cv::rvv_hal::core::dotprod + +/* ############ compare ############ */ + +int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int 
cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +// int cmp64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u cv::rvv_hal::core::cmp8u +#undef cv_hal_cmp8s +#define cv_hal_cmp8s cv::rvv_hal::core::cmp8s +#undef cv_hal_cmp16u +#define cv_hal_cmp16u cv::rvv_hal::core::cmp16u +#undef cv_hal_cmp16s +#define cv_hal_cmp16s cv::rvv_hal::core::cmp16s +#undef cv_hal_cmp32s +#define cv_hal_cmp32s cv::rvv_hal::core::cmp32s +#undef cv_hal_cmp32f +#define cv_hal_cmp32f cv::rvv_hal::core::cmp32f +// #undef cv_hal_cmp64f +// #define cv_hal_cmp64f cv::rvv_hal::core::cmp64f + +/* ############ transpose2d ############ */ + +int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int src_width, int src_height, int element_size); + +#undef cv_hal_transpose2d +#define cv_hal_transpose2d cv::rvv_hal::core::transpose2d + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core + +#endif // OPENCV_RVV_HAL_CORE_HPP diff --git a/hal/riscv-rvv/include/imgproc.hpp b/hal/riscv-rvv/include/imgproc.hpp new file mode 100644 index 0000000000..66c75786a0 --- /dev/null +++ b/hal/riscv-rvv/include/imgproc.hpp @@ -0,0 +1,249 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_RVV_HAL_IMGPROC_HPP +#define OPENCV_RVV_HAL_IMGPROC_HPP + +struct cvhalFilter2D; + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +/* ############ imageMoments ############ */ + +int imageMoments(const uchar* src_data, size_t src_step, int src_type, + int width, int height, bool binary, double m[10]); + +#undef cv_hal_imageMoments +#define cv_hal_imageMoments cv::rvv_hal::imgproc::imageMoments + +/* ############ filter ############ */ + +int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/); +int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y); +int filterFree(cvhalFilter2D* context); + +#undef cv_hal_filterInit +#define cv_hal_filterInit cv::rvv_hal::imgproc::filterInit +#undef cv_hal_filter +#define cv_hal_filter cv::rvv_hal::imgproc::filter +#undef cv_hal_filterFree +#define cv_hal_filterFree cv::rvv_hal::imgproc::filterFree + +/* ############ sepFilter ############ */ + +int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType); +int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y); +int sepFilterFree(cvhalFilter2D* context); + +#undef cv_hal_sepFilterInit +#define cv_hal_sepFilterInit cv::rvv_hal::imgproc::sepFilterInit +#undef cv_hal_sepFilter +#define cv_hal_sepFilter cv::rvv_hal::imgproc::sepFilter +#undef 
cv_hal_sepFilterFree +#define cv_hal_sepFilterFree cv::rvv_hal::imgproc::sepFilterFree + +/* ############ morph ############ */ + +int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/); +int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/); +int morphFree(cvhalFilter2D* context); + +#undef cv_hal_morphInit +#undef cv_hal_morph +#undef cv_hal_morphFree +#define cv_hal_morphInit cv::rvv_hal::imgproc::morphInit +#define cv_hal_morph cv::rvv_hal::imgproc::morph +#define cv_hal_morphFree cv::rvv_hal::imgproc::morphFree + +/* ############ gaussianBlur ############ */ + +int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type); + +#undef cv_hal_gaussianBlurBinomial +#define cv_hal_gaussianBlurBinomial cv::rvv_hal::imgproc::gaussianBlurBinomial + +/* ############ medianBlur ############ */ + +int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize); + +#undef cv_hal_medianBlur +#define cv_hal_medianBlur cv::rvv_hal::imgproc::medianBlur + +/* ############ boxFilter ############ */ + +int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, 
int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type); + +#undef cv_hal_boxFilter +#define cv_hal_boxFilter cv::rvv_hal::imgproc::boxFilter + +/* ############ bilateralFilter ############ */ + +int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int depth, int cn, int d, double sigma_color, + double sigma_space, int border_type); + +#undef cv_hal_bilateralFilter +#define cv_hal_bilateralFilter cv::rvv_hal::imgproc::bilateralFilter + +/* ############ pyramid ############ */ + +int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type); +int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type); + +#undef cv_hal_pyrdown +#define cv_hal_pyrdown cv::rvv_hal::imgproc::pyrDown +#undef cv_hal_pyrup +#define cv_hal_pyrup cv::rvv_hal::imgproc::pyrUp + +/* ############ cvtColor ############ */ + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn); +int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue); +int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits); +int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits); +int cvtBGR5x5toGray(const uchar * src_data, 
size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits); +int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits); +int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr); +int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr); +int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx); +int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx); +int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx); +int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx); +int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV); +int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV); +int cvtXYZtoBGR(const uchar * src_data, size_t 
src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue); +int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue); +int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb); +int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb); + +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR +#undef cv_hal_cvtGraytoBGR +#define cv_hal_cvtGraytoBGR cv::rvv_hal::imgproc::cvtGraytoBGR +#undef cv_hal_cvtBGRtoGray +#define cv_hal_cvtBGRtoGray cv::rvv_hal::imgproc::cvtBGRtoGray +#undef cv_hal_cvtBGR5x5toBGR +#define cv_hal_cvtBGR5x5toBGR cv::rvv_hal::imgproc::cvtBGR5x5toBGR +#undef cv_hal_cvtBGRtoBGR5x5 +#define cv_hal_cvtBGRtoBGR5x5 cv::rvv_hal::imgproc::cvtBGRtoBGR5x5 +#undef cv_hal_cvtBGR5x5toGray +#define cv_hal_cvtBGR5x5toGray cv::rvv_hal::imgproc::cvtBGR5x5toGray +#undef cv_hal_cvtGraytoBGR5x5 +#define cv_hal_cvtGraytoBGR5x5 cv::rvv_hal::imgproc::cvtGraytoBGR5x5 +#undef cv_hal_cvtYUVtoBGR +#define cv_hal_cvtYUVtoBGR cv::rvv_hal::imgproc::cvtYUVtoBGR +#undef cv_hal_cvtBGRtoYUV +#define cv_hal_cvtBGRtoYUV cv::rvv_hal::imgproc::cvtBGRtoYUV +#undef cv_hal_cvtOnePlaneYUVtoBGR +#define cv_hal_cvtOnePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtOnePlaneYUVtoBGR +#undef cv_hal_cvtTwoPlaneYUVtoBGR +#define cv_hal_cvtTwoPlaneYUVtoBGR cv::rvv_hal::imgproc::cvtTwoPlaneYUVtoBGR +#undef cv_hal_cvtThreePlaneYUVtoBGR +#define cv_hal_cvtThreePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtThreePlaneYUVtoBGR +#undef cv_hal_cvtOnePlaneBGRtoYUV +#define cv_hal_cvtOnePlaneBGRtoYUV cv::rvv_hal::imgproc::cvtOnePlaneBGRtoYUV +#undef cv_hal_cvtBGRtoTwoPlaneYUV +#define cv_hal_cvtBGRtoTwoPlaneYUV 
cv::rvv_hal::imgproc::cvtBGRtoTwoPlaneYUV +#undef cv_hal_cvtBGRtoThreePlaneYUV +#define cv_hal_cvtBGRtoThreePlaneYUV cv::rvv_hal::imgproc::cvtBGRtoThreePlaneYUV +#undef cv_hal_cvtHSVtoBGR +#define cv_hal_cvtHSVtoBGR cv::rvv_hal::imgproc::cvtHSVtoBGR +#undef cv_hal_cvtBGRtoHSV +#define cv_hal_cvtBGRtoHSV cv::rvv_hal::imgproc::cvtBGRtoHSV +#undef cv_hal_cvtXYZtoBGR +#define cv_hal_cvtXYZtoBGR cv::rvv_hal::imgproc::cvtXYZtoBGR +#undef cv_hal_cvtBGRtoXYZ +#define cv_hal_cvtBGRtoXYZ cv::rvv_hal::imgproc::cvtBGRtoXYZ +#undef cv_hal_cvtLabtoBGR +#define cv_hal_cvtLabtoBGR cv::rvv_hal::imgproc::cvtLabtoBGR +#undef cv_hal_cvtBGRtoLab +#define cv_hal_cvtBGRtoLab cv::rvv_hal::imgproc::cvtBGRtoLab + +/* ############ warp ############ */ + +int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); +int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]); +int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); + +#undef cv_hal_remap32f +#define cv_hal_remap32f cv::rvv_hal::imgproc::remap32f +#undef cv_hal_remap32fc2 +#define cv_hal_remap32fc2 cv::rvv_hal::imgproc::remap32fc2 +#undef cv_hal_remap16s +#define cv_hal_remap16s cv::rvv_hal::imgproc::remap16s + +int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int 
dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]); +int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]); + +#undef cv_hal_warpAffine +#define cv_hal_warpAffine cv::rvv_hal::imgproc::warpAffine +#undef cv_hal_warpPerspective +#define cv_hal_warpPerspective cv::rvv_hal::imgproc::warpPerspective + +/* ############ threshold ############ */ + +int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType); +int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh); +int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C); + +// disabled since UI is fast enough, only called in threshold_otsu +// #undef cv_hal_threshold +// #define cv_hal_threshold cv::rvv_hal::imgproc::threshold +#undef cv_hal_threshold_otsu +#define cv_hal_threshold_otsu cv::rvv_hal::imgproc::threshold_otsu +#undef cv_hal_adaptiveThreshold +#define cv_hal_adaptiveThreshold cv::rvv_hal::imgproc::adaptiveThreshold + +/* ############ histogram ############ */ + +int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height); + +#undef cv_hal_equalize_hist +#define cv_hal_equalize_hist cv::rvv_hal::imgproc::equalize_hist + +/* ############ resize ############ */ + +int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, 
double inv_scale_x, double inv_scale_y, int interpolation); + +#undef cv_hal_resize +#define cv_hal_resize cv::rvv_hal::imgproc::resize + +/* ############ integral ############ */ + +int integral(int depth, int sdepth, int sqdepth, + const uchar* src_data, size_t src_step, + uchar* sum_data, size_t sum_step, + uchar* sqsum_data, size_t sqsum_step, + uchar* tilted_data, [[maybe_unused]] size_t tilted_step, + int width, int height, int cn); + +#undef cv_hal_integral +#define cv_hal_integral cv::rvv_hal::imgproc::integral + +#endif // CV_HAL_RVV_1P0_ENABLED + +#if CV_HAL_RVV_071_ENABLED + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR + +#endif // CV_HAL_RVV_071_ENABLED + +}}} // cv::rvv_hal::imgproc + +#endif // OPENCV_RVV_HAL_IMGPROC_HPP diff --git a/hal/riscv-rvv/hal_rvv_1p0/types.hpp b/hal/riscv-rvv/include/types.hpp similarity index 99% rename from hal/riscv-rvv/hal_rvv_1p0/types.hpp rename to hal/riscv-rvv/include/types.hpp index 6613a018fc..948bbfbd30 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/types.hpp +++ b/hal/riscv-rvv/include/types.hpp @@ -4,13 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_TYPES_HPP_INCLUDED -#define OPENCV_HAL_RVV_TYPES_HPP_INCLUDED +#ifndef OPENCV_RVV_HAL_TYPES_HPP +#define OPENCV_RVV_HAL_TYPES_HPP #include #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { + +#if CV_HAL_RVV_1P0_ENABLED enum RVV_LMUL { @@ -869,6 +871,8 @@ HAL_RVV_GROUP(RVV_F64M1, RVV_F64M8, f64, m1, m8) #undef HAL_RVV_GROUP -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_TYPES_HPP_INCLUDED +}} // namespace cv::rvv_hal + +#endif //OPENCV_RVV_HAL_TYPES_HPP diff --git a/hal/riscv-rvv/rvv_hal.hpp b/hal/riscv-rvv/rvv_hal.hpp new file mode 100644 index 0000000000..88989aaeb8 --- /dev/null +++ b/hal/riscv-rvv/rvv_hal.hpp @@ -0,0 +1,31 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_RVV_HPP_INCLUDED +#define OPENCV_HAL_RVV_HPP_INCLUDED + +#include "opencv2/core/base.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/core/hal/interface.h" + +#if defined(__riscv_v) && __riscv_v == 1000000 +#define CV_HAL_RVV_1P0_ENABLED 1 +#else +#define CV_HAL_RVV_1P0_ENABLED 0 +#endif + +#if defined(__riscv_v) && __riscv_v == 7000 && defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) +#define CV_HAL_RVV_071_ENABLED 1 +#else +#define CV_HAL_RVV_071_ENABLED 0 +#endif + +#if CV_HAL_RVV_1P0_ENABLED || CV_HAL_RVV_071_ENABLED +#include <riscv_vector.h> +#endif +#include "include/types.hpp" +#include "include/core.hpp" +#include "include/imgproc.hpp" + +#endif // OPENCV_HAL_RVV_HPP_INCLUDED diff --git a/hal/riscv-rvv/src/core/atan.cpp b/hal/riscv-rvv/src/core/atan.cpp new file mode 100644 index 0000000000..e2b0d5c314 --- /dev/null +++ b/hal/riscv-rvv/src/core/atan.cpp @@ -0,0 +1,64 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) +{ + auto atan_params = angle_in_deg ? common::atan_params_deg : common::atan_params_rad; + + for (size_t vl = 0; n > 0; n -= vl) + { + vl = __riscv_vsetvl_e32m4(n); + + auto vy = __riscv_vle32_v_f32m4(y, vl); + auto vx = __riscv_vle32_v_f32m4(x, vl); + + auto a = common::rvv_atan(vy, vx, vl, atan_params); + + __riscv_vse32(dst, a, vl); + + x += vl; + y += vl; + dst += vl; + } + + return CV_HAL_ERROR_OK; +} + +int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) +{ + // this also uses float32 version, ref: mathfuncs_core.simd.hpp + + auto atan_params = angle_in_deg ? 
common::atan_params_deg : common::atan_params_rad; + + for (size_t vl = 0; n > 0; n -= vl) + { + vl = __riscv_vsetvl_e64m8(n); + + auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); + auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); + + auto a = common::rvv_atan(vy, vx, vl, atan_params); + + __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); + + x += vl; + y += vl; + dst += vl; + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp b/hal/riscv-rvv/src/core/cart_to_polar.cpp similarity index 53% rename from hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp rename to hal/riscv-rvv/src/core/cart_to_polar.cpp index 676133b668..56ee0fcefc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp +++ b/hal/riscv-rvv/src/core/cart_to_polar.cpp @@ -4,27 +4,20 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED -#define OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/atan.hpp" -#include "hal_rvv_1p0/sqrt.hpp" -#include "hal_rvv_1p0/types.hpp" +#if CV_HAL_RVV_1P0_ENABLED -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_cartToPolar32f -#define cv_hal_cartToPolar32f cv::cv_hal_rvv::cartToPolar -#undef cv_hal_cartToPolar64f -#define cv_hal_cartToPolar64f cv::cv_hal_rvv::cartToPolar +namespace { template inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool angleInDegrees) { using CalType = RVV_SameLen; - auto atan_params = angleInDegrees ? detail::atan_params_deg : detail::atan_params_rad; + auto atan_params = angleInDegrees ? 
common::atan_params_deg : common::atan_params_rad; size_t vl; for (; len > 0; len -= (int)vl, x += vl, y += vl, mag += vl, angle += vl) { @@ -33,16 +26,25 @@ inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool a auto vx = CalType::cast(RVV_T::vload(x, vl), vl); auto vy = CalType::cast(RVV_T::vload(y, vl), vl); - auto vmag = detail::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + auto vmag = common::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); RVV_T::vstore(mag, RVV_T::cast(vmag, vl), vl); - auto vangle = detail::rvv_atan(vy, vx, vl, atan_params); + auto vangle = common::rvv_atan(vy, vx, vl, atan_params); RVV_T::vstore(angle, RVV_T::cast(vangle, vl), vl); } return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +} // anonymous -#endif // OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees) { + return cartToPolar(x, y, mag, angle, len, angleInDegrees); +} +int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees) { + return cartToPolar(x, y, mag, angle, len, angleInDegrees); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp b/hal/riscv-rvv/src/core/cholesky.cpp similarity index 88% rename from hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp rename to hal/riscv-rvv/src/core/cholesky.cpp index b5d9d3e891..995e7eb5be 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp +++ b/hal/riscv-rvv/src/core/cholesky.cpp @@ -4,20 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED -#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace cholesky { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_Cholesky32f -#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky -#undef cv_hal_Cholesky64f -#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::CholImpl @@ -119,6 +114,15 @@ inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info) { + return Cholesky(src1, src1_step, m, src2, src2_step, n, info); +} +int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info) { + return Cholesky(src1, src1_step, m, src2, src2_step, n, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/common.hpp b/hal/riscv-rvv/src/core/common.hpp new file mode 100644 index 0000000000..37ef0194d4 --- /dev/null +++ b/hal/riscv-rvv/src/core/common.hpp @@ -0,0 +1,183 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Third party copyrights are property of their respective owners. 
+ +#ifndef OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED +#define OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED + +#include +#include +#include + +namespace cv { namespace rvv_hal { namespace core { namespace common { + +#if CV_HAL_RVV_1P0_ENABLED + +#define CV_HAL_RVV_NOOP(a) (a) + +// ############ abs ############ + +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ + inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ + _Tpvs mask = __riscv_vsra(v, shift, vl); \ + _Tpvs v_xor = __riscv_vxor(v, mask, vl); \ + return __riscv_vreinterpret_##suffix( \ + __riscv_vsub(v_xor, mask, vl) \ + ); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) + +// ############ absdiff ############ + +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ + inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ + return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) 
+CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) + +// ############ atan ############ + +// ref: mathfuncs_core.simd.hpp +static constexpr float pi = CV_PI; + +struct AtanParams +{ + float p1, p3, p5, p7, angle_90; +}; + +static constexpr AtanParams atan_params_rad { + 0.9997878412794807F, + -0.3258083974640975F, + 0.1555786518463281F, + -0.04432655554792128F, + 90.F * (pi / 180.F)}; +static constexpr AtanParams atan_params_deg { + atan_params_rad.p1 * (180 / pi), + atan_params_rad.p3 * (180 / pi), + atan_params_rad.p5 * (180 / pi), + atan_params_rad.p7 * (180 / pi), + 90.F}; + +template +__attribute__((always_inline)) inline VEC_T + rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params) +{ + const auto ax = __riscv_vfabs(vx, vl); + const auto ay = __riscv_vfabs(vy, vl); + // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. + const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), + __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), + vl); + const auto c2 = __riscv_vfmul(c, c, vl); + + // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional + // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased + // from 5.952ms to 5.805ms on Muse Pi) + // Additionally, when registers are nearly fully utilized (though not yet exhausted), the + // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in + // cv::rvv_hal::fast_atan_64). + // Saving registers can also make this function more reusable in other contexts. 
+ // Therefore, vfmadd is not used here. + auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); + a = __riscv_vfmul(a, c, vl); + + a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); + + return a; +} + +// ############ sqrt ############ + +template +struct Sqrt32f +{ + using T = RVV_T; + static constexpr size_t iter_times = 2; +}; + +template +struct Sqrt64f +{ + using T = RVV_T; + static constexpr size_t iter_times = 3; +}; + +// Newton-Raphson method +// Use 4 LMUL registers +template +inline VEC_T sqrt(VEC_T x, size_t vl) +{ + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#ifdef __clang__ +#pragma unroll +#endif + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul(t, y, vl); + } + // just to prevent the compiler from calculating mask before the iteration, which will run out + // of registers and cause memory access. 
+ asm volatile("" ::: "memory"); + auto classified = __riscv_vfclass(x, vl); + // block -0, +0, positive subnormal number, +inf + auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); + return __riscv_vfmul_mu(mask, x, x, y, vl); +} + +// Newton-Raphson method +// Use 3 LMUL registers and 1 mask register +template +inline VEC_T invSqrt(VEC_T x, size_t vl) +{ + auto classified = __riscv_vfclass(x, vl); + // block -0, +0, positive subnormal number, +inf + auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#ifdef __clang__ +#pragma unroll +#endif + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul_mu(mask, y, t, y, vl); + } + return y; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}}} // cv::rvv_hal::core::common + +#endif // OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/compare.hpp b/hal/riscv-rvv/src/core/compare.cpp similarity index 76% rename from hal/riscv-rvv/hal_rvv_1p0/compare.hpp rename to hal/riscv-rvv/src/core/compare.cpp index 6efd92e18a..ccf0151afb 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/compare.hpp +++ b/hal/riscv-rvv/src/core/compare.cpp @@ -5,12 +5,11 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED -#define OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED +#include "rvv_hal.hpp" -#include "types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace compare { +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -90,23 +89,6 @@ int compare_impl(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, return CV_HAL_ERROR_OK; } -} // anonymous - -#undef cv_hal_cmp8u -#define cv_hal_cmp8u cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp8s -#define cv_hal_cmp8s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp16u -#define cv_hal_cmp16u cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp16s -#define cv_hal_cmp16s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp32s -#define cv_hal_cmp32s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp32f -#define cv_hal_cmp32f cv::cv_hal_rvv::compare::compare -// #undef cv_hal_cmp64f -// #define cv_hal_cmp64f cv::cv_hal_rvv::compare::compare - template inline int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { @@ -121,6 +103,27 @@ int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size } } -}}} // cv::cv_hal_rvv::compare +} // namespace anonymous -#endif // OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED +int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp16u(const ushort *src1_data, size_t src1_step, const 
ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp b/hal/riscv-rvv/src/core/convert_scale.cpp similarity index 89% rename from hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp rename to hal/riscv-rvv/src/core/convert_scale.cpp index 2f28f20bfd..8c5f83a677 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp +++ b/hal/riscv-rvv/src/core/convert_scale.cpp @@ -4,15 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED -#define OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_convertScale -#define cv_hal_convertScale cv::cv_hal_rvv::convertScale +#if CV_HAL_RVV_1P0_ENABLED inline int convertScale_8U8U(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta) { @@ -89,8 +85,8 @@ inline int convertScale_32F32F(const uchar* src, size_t src_step, uchar* dst, si return CV_HAL_ERROR_OK; } -inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, - int sdepth, int ddepth, double alpha, double beta) +int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, + int width, int height, int sdepth, int ddepth, double alpha, double beta) { if (!dst) return CV_HAL_ERROR_OK; @@ -118,6 +114,6 @@ inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t ds return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp b/hal/riscv-rvv/src/core/copy_mask.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp rename to hal/riscv-rvv/src/core/copy_mask.cpp index f13b8bc22e..dd49cfdeab 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp +++ b/hal/riscv-rvv/src/core/copy_mask.cpp @@ -5,21 +5,17 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED -#define OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_copyToMasked -#define cv_hal_copyToMasked cv::cv_hal_rvv::copyToMasked +#if CV_HAL_RVV_1P0_ENABLED namespace { #define CV_HAL_RVV_COPY_MASK_eXc1(X, mask_lmul) \ static int copyToMasked_e##X##c1(const uchar *src_data, size_t src_step, const uchar *mask_data, size_t mask_step, \ - uchar *dst_data, size_t dst_step, int width, int height) { \ + uchar *dst_data, size_t dst_step, int width, int height) { \ for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { \ const uint##X##_t *src = (const uint##X##_t*)src_data; \ uint##X##_t *dst = (uint##X##_t*)dst_data; \ @@ -41,7 +37,7 @@ CV_HAL_RVV_COPY_MASK_eXc1(64, 1) #define CV_HAL_RVV_COPY_MASK_eXc3(X, mask_lmul) \ static int copyToMasked_e##X##c3(const uchar *src_data, size_t src_step, const uchar *mask_data, size_t mask_step, \ - uchar *dst_data, size_t dst_step, int width, int height) { \ + uchar *dst_data, size_t dst_step, int width, int height) { \ for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { \ const uint##X##_t *src = (const uint##X##_t*)src_data; \ uint##X##_t *dst = (uint##X##_t*)dst_data; \ @@ -62,9 +58,9 @@ CV_HAL_RVV_COPY_MASK_eXc3(32, f2) CV_HAL_RVV_COPY_MASK_eXc3(64, f4) static int copyToMasked_e64c2(const uchar *src_data, size_t src_step, - const uchar *mask_data, size_t mask_step, - uchar *dst_data, size_t dst_step, int width, - int height) { + const uchar *mask_data, size_t mask_step, + uchar *dst_data, size_t dst_step, int width, + int height) { for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { const uint64_t *src = (const uint64_t *)src_data; uint64_t *dst = (uint64_t *)dst_data; @@ -80,9 +76,9 @@ static int copyToMasked_e64c2(const uchar *src_data, size_t src_step, 
} static int copyToMasked_e64c4(const uchar *src_data, size_t src_step, - const uchar *mask_data, size_t mask_step, - uchar *dst_data, size_t dst_step, int width, - int height) { + const uchar *mask_data, size_t mask_step, + uchar *dst_data, size_t dst_step, int width, + int height) { for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { const uint64_t *src = (const uint64_t *)src_data; uint64_t *dst = (uint64_t *)dst_data; @@ -100,8 +96,8 @@ static int copyToMasked_e64c4(const uchar *src_data, size_t src_step, } // anonymous using CopyToMaskedFunc = int (*)(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int); -inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, - int type, const uchar *mask_data, size_t mask_step, int mask_type) { +int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, + int type, const uchar *mask_data, size_t mask_step, int mask_type) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int mdepth = CV_MAT_DEPTH(mask_type), mcn = CV_MAT_CN(mask_type); @@ -189,6 +185,6 @@ inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_OK; } -}} // cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/div.cpp b/hal/riscv-rvv/src/core/div.cpp new file mode 100644 index 0000000000..e12e3775f5 --- /dev/null +++ b/hal/riscv-rvv/src/core/div.cpp @@ -0,0 +1,276 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+ +#include "rvv_hal.hpp" +#include + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); } + +inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); } +inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); } +inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); } +inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); } +inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); } +inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); } + +inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } +inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } +inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } +inline void vse(int16_t *p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } +inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } +inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } + +inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } +inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } +inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } +inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } + +inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } +inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } +inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } +inline 
vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } + +template inline +VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) { + return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl); +} +template <> inline +vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfcvt_f(v1, vl); + auto f2 = __riscv_vfcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); + return __riscv_vfcvt_x(res, vl); +} +template <> inline +vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfcvt_f(v1, vl); + auto f2 = __riscv_vfcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); + return __riscv_vfcvt_xu(res, vl); +} + +template inline +VT recip_sat(const VT &v, const float scale, const int vl) { + return nclip(recip_sat(ext(v, vl), scale, vl), vl); +} +template <> inline +vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) { + auto f = __riscv_vfcvt_f(v, vl); + auto res = __riscv_vfrdiv(f, scale, vl); + return __riscv_vfcvt_x(res, vl); +} +template <> inline +vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) { + auto f = __riscv_vfcvt_f(v, vl); + auto res = __riscv_vfrdiv(f, scale, vl); + return __riscv_vfcvt_xu(res, vl); +} + +// Implementation + +template inline +int div(const ST *src1, size_t step1, const ST *src2, size_t step2, + ST *dst, size_t step, int width, int height, float scale) { + float max_fval = static_cast(std::numeric_limits::max()); + if (scale == 0.f || ((scale * max_fval) < 1.f && (scale * max_fval) > -1.f)) { + for (int h = 0; h < height; h++) { + ST *dst_h = reinterpret_cast((uchar*)dst + h * step); + std::memset(dst_h, 0, sizeof(ST) * width); + } + return CV_HAL_ERROR_OK; + } + + for (int h = 0; h < height; h++) { + const ST *src1_h = 
reinterpret_cast((const uchar*)src1 + h * step1); + const ST *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + ST *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + auto mask = __riscv_vmseq(v2, 0, vl); + vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template <> +int div(const float *src1, size_t step1, const float *src2, size_t step2, + float *dst, size_t step, int width, int height, float scale) { + if (scale == 0.f) { + for (int h = 0; h < height; h++) { + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + std::memset(dst_h, 0, sizeof(float) * width); + } + return CV_HAL_ERROR_OK; + } + + if (std::fabs(scale - 1.f) < FLT_EPSILON) { + for (int h = 0; h < height; h++) { + const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl); + } + } + } else { + for (int h = 0; h < height; h++) { + const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template inline +int recip(const ST *src_data, size_t src_step, ST *dst_data, size_t dst_step, + int width, int 
height, float scale) { + if (scale == 0.f || (scale < 1.f && scale > -1.f)) { + for (int h = 0; h < height; h++) { + ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + std::memset(dst_h, 0, sizeof(ST) * width); + } + return CV_HAL_ERROR_OK; + } + + for (int h = 0; h < height; h++) { + const ST *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + auto mask = __riscv_vmseq(v, 0, vl); + vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template <> +int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, + int width, int height, float scale) { + if (scale == 0.f) { + for (int h = 0; h < height; h++) { + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + std::memset(dst_h, 0, sizeof(float) * width); + } + return CV_HAL_ERROR_OK; + } + + if (std::fabs(scale - 1.f) < FLT_EPSILON) { + for (int h = 0; h < height; h++) { + const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl); + } + } + } else { + for (int h = 0; h < height; h++) { + const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t 
dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} + +int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, 
size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp b/hal/riscv-rvv/src/core/dotprod.cpp similarity index 87% rename from hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp rename to hal/riscv-rvv/src/core/dotprod.cpp index e16a97cf6a..2630ca198d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp +++ b/hal/riscv-rvv/src/core/dotprod.cpp @@ -5,21 +5,16 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
- -#ifndef OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED -#define OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" #include -namespace cv { namespace cv_hal_rvv { namespace dotprod { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_dotProduct -#define cv_hal_dotProduct cv::cv_hal_rvv::dotprod::dotprod +#if CV_HAL_RVV_1P0_ENABLED namespace { -double dotProd_8u(const uchar *a, const uchar *b, int len) { +static inline double dotProd_8u(const uchar *a, const uchar *b, int len) { constexpr int block_size0 = (1 << 15); double r = 0; @@ -47,7 +42,7 @@ double dotProd_8u(const uchar *a, const uchar *b, int len) { return r; } -double dotProd_8s(const schar *a, const schar *b, int len) { +static inline double dotProd_8s(const schar *a, const schar *b, int len) { constexpr int block_size0 = (1 << 14); double r = 0; @@ -75,7 +70,7 @@ double dotProd_8s(const schar *a, const schar *b, int len) { return r; } -double dotProd_16u(const ushort *a, const ushort *b, int len) { +static inline double dotProd_16u(const ushort *a, const ushort *b, int len) { constexpr int block_size0 = (1 << 24); double r = 0; @@ -103,7 +98,7 @@ double dotProd_16u(const ushort *a, const ushort *b, int len) { return r; } -double dotProd_16s(const short *a, const short *b, int len) { +static inline double dotProd_16s(const short *a, const short *b, int len) { constexpr int block_size0 = (1 << 24); double r = 0; @@ -131,7 +126,7 @@ double dotProd_16s(const short *a, const short *b, int len) { return r; } -double dotProd_32s(const int *a, const int *b, int len) { +static inline double dotProd_32s(const int *a, const int *b, int len) { double r = 0; vfloat64m8_t s = __riscv_vfmv_v_f_f64m8(0.f, __riscv_vsetvlmax_e64m8()); @@ -149,7 +144,7 @@ double dotProd_32s(const int *a, const int *b, int len) { return r; } -double dotProd_32f(const float *a, const float *b, int len) { +static inline double dotProd_32f(const float *a, const float *b, int len) { constexpr int block_size0 = (1 
<< 11); double r = 0.f; @@ -180,8 +175,8 @@ double dotProd_32f(const float *a, const float *b, int len) { } // anonymous using DotProdFunc = double (*)(const uchar *a, const uchar *b, int len); -inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, - int width, int height, int type, double *dot_val) { +int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, + int width, int height, int type, double *dot_val) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); static DotProdFunc dotprod_tab[CV_DEPTH_MAX] = { @@ -228,6 +223,6 @@ inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::dotprod +#endif // CV_HAL_RVV_1P0_ENABLED -#endif // OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/dxt.hpp b/hal/riscv-rvv/src/core/dxt.cpp similarity index 97% rename from hal/riscv-rvv/hal_rvv_1p0/dxt.hpp rename to hal/riscv-rvv/src/core/dxt.cpp index 25f4879532..fa0c464e88 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/dxt.hpp +++ b/hal/riscv-rvv/src/core/dxt.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_DXT_HPP_INCLUDED -#define OPENCV_HAL_RVV_DXT_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" -#include "opencv2/core/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace dxt { - -#undef cv_hal_dft -#define cv_hal_dft cv::cv_hal_rvv::dxt::dft +#if CV_HAL_RVV_1P0_ENABLED template struct rvv; @@ -42,7 +36,7 @@ template<> struct rvv : RVV_F64M1 // in the function template static void cv::DFT and cv::DFT_R2, cv::DFT_R3, cv::DFT_R5 template inline int dft(const Complex* src, Complex* dst, int nf, int *factors, T scale, int* itab, - const Complex* wave, int tab_size, int len, bool isInverse, bool noPermute) + const Complex* wave, int tab_size, int len, bool isInverse, bool noPermute) { int n = len; int f_idx, nx; @@ -545,8 +539,8 @@ inline int dft(const Complex* src, Complex* dst, int nf, int *factors, T s return CV_HAL_ERROR_OK; } -inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, int* itab, void* wave, - int tab_size, int n, bool isInverse, bool noPermute) +int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, + int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute) { if( n == 0 ) return CV_HAL_ERROR_OK; @@ -563,6 +557,6 @@ inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, do return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/exp.hpp b/hal/riscv-rvv/src/core/exp.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/exp.hpp rename to hal/riscv-rvv/src/core/exp.cpp index 82690fb321..552fdc0e3f 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/exp.hpp +++ b/hal/riscv-rvv/src/core/exp.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_EXP_HPP_INCLUDED -#define OPENCV_HAL_RVV_EXP_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_exp32f -#define cv_hal_exp32f cv::cv_hal_rvv::exp32f -#undef cv_hal_exp64f -#define cv_hal_exp64f cv::cv_hal_rvv::exp64f +#if CV_HAL_RVV_1P0_ENABLED namespace detail { @@ -116,7 +110,7 @@ static constexpr double exp_tab_64f[exp_tab_size] = EXP_TAB_VALUE; } // namespace detail -inline int exp32f(const float* src, float* dst, int _len) +int exp32f(const float* src, float* dst, int _len) { size_t vl = __riscv_vsetvlmax_e32m4(); auto exp_a2 = __riscv_vfmv_v_f_f32m4(detail::exp32f_a2, vl); @@ -158,7 +152,7 @@ inline int exp32f(const float* src, float* dst, int _len) return CV_HAL_ERROR_OK; } -inline int exp64f(const double* src, double* dst, int _len) +int exp64f(const double* src, double* dst, int _len) { size_t vl = __riscv_vsetvlmax_e64m4(); // all vector registers are used up, so not load more constants @@ -203,6 +197,6 @@ inline int exp64f(const double* src, double* dst, int _len) return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_EXP_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/flip.hpp b/hal/riscv-rvv/src/core/flip.cpp similarity index 96% rename from hal/riscv-rvv/hal_rvv_1p0/flip.hpp rename to hal/riscv-rvv/src/core/flip.cpp index 02abeb6e93..6f4c577c25 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/flip.hpp +++ b/hal/riscv-rvv/src/core/flip.cpp @@ -5,13 +5,7 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. 
-#ifndef OPENCV_HAL_RVV_FLIP_HPP_INCLUDED -#define OPENCV_HAL_RVV_FLIP_HPP_INCLUDED - - -#include -#include -#include "hal_rvv_1p0/types.hpp" +#include "rvv_hal.hpp" #if defined (__clang__) && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \ @@ -24,10 +18,9 @@ #define __riscv_vcreate_v_u64m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u64, 2, v0, v1, v2) #endif -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_flip -#define cv_hal_flip cv::cv_hal_rvv::flip +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -73,6 +66,13 @@ CV_HAL_RVV_FLIP_INPLACE_C1(16UC1, ushort, RVV_U16M8) CV_HAL_RVV_FLIP_INPLACE_C1(32UC1, unsigned, RVV_U32M8) CV_HAL_RVV_FLIP_INPLACE_C1(64UC1, uint64_t, RVV_U64M8) +// Suppress warnings of "ignoring attributes applied to VecType after definition", +// VecType is vuint8m2x3_t, vuint16m2x3_t, vuint32m2x3_t or vuint64m2x3_t +#if defined (__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif + #define CV_HAL_RVV_FLIP_C3_TYPES(width) \ struct RVV_C3_U##width##M2 : RVV_U##width##M2 { \ static inline vuint##width##m2x3_t vload3(const uint##width##_t *base, size_t vl) { return __riscv_vlseg3e##width##_v_u##width##m2x3(base, vl); } \ @@ -90,6 +90,10 @@ CV_HAL_RVV_FLIP_C3_TYPES(16) CV_HAL_RVV_FLIP_C3_TYPES(32) CV_HAL_RVV_FLIP_C3_TYPES(64) +#if defined (__GNUC__) +#pragma GCC diagnostic pop +#endif + #define CV_HAL_RVV_FLIP_C3(name, _Tps, RVV) \ inline void flip_##name(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int src_width, int src_height, int flip_mode) { \ for (int h = 0; h < src_height; h++) { \ @@ -311,7 +315,7 @@ inline int flip_inplace(int esz, uchar* data, size_t step, int width, int height return CV_HAL_ERROR_OK; } -inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, +int flip(int src_type, const uchar* src_data, size_t src_step, int 
src_width, int src_height, uchar* dst_data, size_t dst_step, int flip_mode) { int esz = CV_ELEM_SIZE(src_type); @@ -368,6 +372,6 @@ inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_wi return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_FLIP_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/log.hpp b/hal/riscv-rvv/src/core/log.cpp similarity index 98% rename from hal/riscv-rvv/hal_rvv_1p0/log.hpp rename to hal/riscv-rvv/src/core/log.cpp index 8df0761861..0783e3be54 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/log.hpp +++ b/hal/riscv-rvv/src/core/log.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_LOG_HPP_INCLUDED -#define OPENCV_HAL_RVV_LOG_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_log32f -#define cv_hal_log32f cv::cv_hal_rvv::log32f -#undef cv_hal_log64f -#define cv_hal_log64f cv::cv_hal_rvv::log64f +#if CV_HAL_RVV_1P0_ENABLED namespace detail { @@ -306,7 +300,7 @@ static constexpr double log_tab_64f[log_tab_size] = LOG_TAB_VALUE; } // namespace detail -inline int log32f(const float* src, float* dst, int _len) +int log32f(const float* src, float* dst, int _len) { size_t vl = __riscv_vsetvlmax_e32m4(); auto log_a2 = __riscv_vfmv_v_f_f32m4(detail::log32f_a2, vl); @@ -340,7 +334,7 @@ inline int log32f(const float* src, float* dst, int _len) return CV_HAL_ERROR_OK; } -inline int log64f(const double* src, double* dst, int _len) +int log64f(const double* src, double* dst, int _len) { size_t vl = __riscv_vsetvlmax_e64m4(); // all vector registers are used up, so not load more constants @@ -382,6 +376,6 @@ inline int log64f(const double* src, double* dst, int _len) return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif 
//OPENCV_HAL_RVV_LOG_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/lu.hpp b/hal/riscv-rvv/src/core/lu.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/lu.hpp rename to hal/riscv-rvv/src/core/lu.cpp index 6de137fe82..d4579caa47 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/lu.hpp +++ b/hal/riscv-rvv/src/core/lu.cpp @@ -4,21 +4,16 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED -#define OPENCV_HAL_RVV_LU_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace lu { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_LU32f -#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU -#undef cv_hal_LU64f -#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::LUImpl @@ -167,6 +162,15 @@ inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info) { + return LU(src1, src1_step, m, src2, src2_step, n, info); +} +int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info) { + return LU(src1, src1_step, m, src2, src2_step, n, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/lut.hpp b/hal/riscv-rvv/src/core/lut.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/lut.hpp rename to hal/riscv-rvv/src/core/lut.cpp index c13a5b2f0a..a90afd2604 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/lut.hpp +++ b/hal/riscv-rvv/src/core/lut.cpp @@ -4,19 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_LUT_HPP_INCLUDED -#define OPENCV_HAL_RVV_LUT_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_lut -#define cv_hal_lut cv::cv_hal_rvv::lut +#if CV_HAL_RVV_1P0_ENABLED // need vlen >= 256 struct LUTCacheU8 : RVV_U8M8 @@ -135,7 +127,7 @@ private: LUTParallelBody& operator=(const LUTParallelBody&); }; -inline int lut(const uchar* src_data, +int lut(const uchar* src_data, size_t src_step, size_t src_type, const uchar* lut_data, @@ -191,6 +183,6 @@ inline int lut(const uchar* src_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_LUT_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp b/hal/riscv-rvv/src/core/magnitude.cpp similarity index 54% rename from hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp rename to hal/riscv-rvv/src/core/magnitude.cpp index eb814c1b77..8630b717da 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp +++ b/hal/riscv-rvv/src/core/magnitude.cpp @@ -4,20 +4,14 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED -#define OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/sqrt.hpp" -#include "hal_rvv_1p0/types.hpp" +#if CV_HAL_RVV_1P0_ENABLED -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_magnitude32f -#define cv_hal_magnitude32f cv::cv_hal_rvv::magnitude> -#undef cv_hal_magnitude64f -#define cv_hal_magnitude64f cv::cv_hal_rvv::magnitude> +namespace { template inline int magnitude(const T* x, const T* y, T* dst, int len) @@ -30,13 +24,22 @@ inline int magnitude(const T* x, const T* y, T* dst, int len) auto vx = SQRT_T::T::vload(x, vl); auto vy = SQRT_T::T::vload(y, vl); - auto vmag = detail::sqrt(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + auto vmag = common::sqrt(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); SQRT_T::T::vstore(dst, vmag, vl); } return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +} // anonymous -#endif // OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED +int magnitude32f(const float *x, const float *y, float *dst, int len) { + return magnitude>(x, y, dst, len); +} +int magnitude64f(const double *x, const double *y, double *dst, int len) { + return magnitude>(x, y, dst, len); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/mean.hpp b/hal/riscv-rvv/src/core/mean.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/mean.hpp rename to hal/riscv-rvv/src/core/mean.cpp index e8156371b3..2fc2f98f65 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/mean.hpp +++ b/hal/riscv-rvv/src/core/mean.cpp @@ -4,15 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED -#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_meanStdDev -#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev +#if CV_HAL_RVV_1P0_ENABLED inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); @@ -21,8 +17,8 @@ inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, in inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); -inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, - int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) { +int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type, + double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) { switch (src_type) { case CV_8UC1: @@ -226,6 +222,6 @@ inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, i return CV_HAL_ERROR_OK; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/merge.hpp b/hal/riscv-rvv/src/core/merge.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/merge.hpp rename to hal/riscv-rvv/src/core/merge.cpp index b1da204b39..9dcc6b67e2 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/merge.hpp +++ b/hal/riscv-rvv/src/core/merge.cpp @@ -4,21 +4,7 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED -#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_merge8u -#define cv_hal_merge8u cv::cv_hal_rvv::merge8u -#undef cv_hal_merge16u -#define cv_hal_merge16u cv::cv_hal_rvv::merge16u -#undef cv_hal_merge32s -#define cv_hal_merge32s cv::cv_hal_rvv::merge32s -#undef cv_hal_merge64s -#define cv_hal_merge64s cv::cv_hal_rvv::merge64s +#include "rvv_hal.hpp" #if defined __clang__ && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \ @@ -44,7 +30,11 @@ namespace cv { namespace cv_hal_rvv { #define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3) #endif // clang < 18 -inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) { +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +int merge8u(const uchar** src, uchar* dst, int len, int cn ) { int vl = 0; if (cn == 1) { @@ -129,7 +119,7 @@ inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) { return CV_HAL_ERROR_OK; } -inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) { +int merge16u(const ushort** src, ushort* dst, int len, int cn ) { int vl = 0; if (cn == 1) { @@ -217,7 +207,7 @@ inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) { #if defined __GNUC__ && !defined(__clang__) __attribute__((optimize("no-tree-vectorize"))) #endif -inline int merge32s(const int** src, int* dst, int len, int cn ) { +int merge32s(const int** src, int* dst, int len, int cn ) { int k = cn % 4 ? cn % 4 : 4; int i, j; if( k == 1 ) @@ -287,7 +277,7 @@ inline int merge32s(const int** src, int* dst, int len, int cn ) { #if defined __GNUC__ && !defined(__clang__) __attribute__((optimize("no-tree-vectorize"))) #endif -inline int merge64s(const int64** src, int64* dst, int len, int cn ) { +int merge64s(const int64** src, int64* dst, int len, int cn ) { int k = cn % 4 ? 
cn % 4 : 4; int i, j; if( k == 1 ) @@ -354,6 +344,6 @@ inline int merge64s(const int64** src, int64* dst, int len, int cn ) { return CV_HAL_ERROR_OK; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/minmax.hpp b/hal/riscv-rvv/src/core/minmax.cpp similarity index 94% rename from hal/riscv-rvv/hal_rvv_1p0/minmax.hpp rename to hal/riscv-rvv/src/core/minmax.cpp index c07a1ff6f7..5fbc3a0f50 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/minmax.hpp +++ b/hal/riscv-rvv/src/core/minmax.cpp @@ -4,19 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED -#define OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace minmax { - -#undef cv_hal_minMaxIdx -#define cv_hal_minMaxIdx cv::cv_hal_rvv::minmax::minMaxIdx -#undef cv_hal_minMaxIdxMaskStep -#define cv_hal_minMaxIdxMaskStep cv::cv_hal_rvv::minmax::minMaxIdx +#if CV_HAL_RVV_1P0_ENABLED template inline int minMaxIdxReadTwice(const uchar* src_data, size_t src_step, int width, int height, double* minVal, double* maxVal, @@ -257,8 +249,8 @@ inline int minMaxIdxReadOnce(const uchar* src_data, size_t src_step, int width, return CV_HAL_ERROR_OK; } -inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal, - int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0) +int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, + double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step) { if (!mask_step) mask_step = src_step; @@ -284,6 +276,6 @@ inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int heig return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // 
cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm.hpp b/hal/riscv-rvv/src/core/norm.cpp similarity index 96% rename from hal/riscv-rvv/hal_rvv_1p0/norm.hpp rename to hal/riscv-rvv/src/core/norm.cpp index c35c0a3bd5..b2deb3f4fc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm.hpp +++ b/hal/riscv-rvv/src/core/norm.cpp @@ -6,15 +6,12 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_NORM_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_HPP_INCLUDED - +#include "rvv_hal.hpp" #include "common.hpp" -namespace cv { namespace cv_hal_rvv { namespace norm { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_norm -#define cv_hal_norm cv::cv_hal_rvv::norm::norm +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -76,7 +73,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); auto v = __riscv_vle8_v_i8m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); } @@ -106,7 +103,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e16m8(n - i); auto v = __riscv_vle16_v_i16m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); } @@ -121,7 +118,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e32m8(n - i); auto v = __riscv_vle32_v_i32m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); } @@ 
-180,7 +177,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -208,7 +205,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e16m8(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -223,7 +220,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e32m4(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -544,7 +541,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -560,7 +557,7 @@ struct MaskedNormL1_RVV { int vl; for (int i = 0; i < len; i += vl) { vl = __riscv_vsetvl_e8m8(len - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); + auto v = common::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = 
__riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -657,7 +654,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -672,7 +669,7 @@ struct MaskedNormL1_RVV { int vl; for (int i = 0; i < len; i += vl) { vl = __riscv_vsetvl_e8m4(len - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); + auto v = common::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -714,7 +711,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse32_v_i32m8(src + cn * i + cn_index, sizeof(int) * cn, vl); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); @@ -733,7 +730,7 @@ struct MaskedNormL1_RVV { auto v = __riscv_vlse32_v_i32m4(src + cn * i + cn_index, sizeof(int) * cn, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, custom_intrin::__riscv_vabs(v, vl), vl), vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, common::__riscv_vabs(v, vl), vl), vl); } } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, 
__riscv_vsetvlmax_e64m1()), vlmax)); @@ -972,8 +969,8 @@ CV_HAL_RVV_DEF_NORM_ALL(64f, double, double, double, double) } using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int); -inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, - int height, int type, int norm_type, double* result) { +int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); if (result == nullptr || depth == CV_16F || norm_type > NORM_L2SQR) { @@ -1090,6 +1087,6 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::norm +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp b/hal/riscv-rvv/src/core/norm_diff.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp rename to hal/riscv-rvv/src/core/norm_diff.cpp index 1ffa42f15d..f136be108a 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp +++ b/hal/riscv-rvv/src/core/norm_diff.cpp @@ -6,15 +6,12 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED - +#include "rvv_hal.hpp" #include "common.hpp" -namespace cv { namespace cv_hal_rvv { namespace norm_diff { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_normDiff -#define cv_hal_normDiff cv::cv_hal_rvv::norm_diff::normDiff +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -62,7 +59,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -79,7 +76,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -96,7 +93,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -113,7 +110,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), 
vlmax)); @@ -130,8 +127,8 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e32m8(n - i); auto v1 = __riscv_vle32_v_i32m8(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m8(src2 + i, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); @@ -182,7 +179,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -199,7 +196,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -215,7 +212,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -231,7 +228,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(v, s, vl); } return 
__riscv_vmv_x(s); @@ -248,8 +245,8 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e32m4(n - i); auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -299,7 +296,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(n - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); } return __riscv_vmv_x(s); @@ -315,7 +312,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(n - i); auto v1 = __riscv_vle8_v_i8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); } return __riscv_vmv_x(s); @@ -332,7 +329,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(n - i); auto v1 = __riscv_vle16_v_u16m2(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m2(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); } @@ -350,7 +347,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(n - i); auto v1 = __riscv_vle16_v_i16m2(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m2(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = 
__riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); } @@ -368,7 +365,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e32m4(n - i); auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfcvt_f(v_mul, vl), vl); } @@ -471,7 +468,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -482,7 +479,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m2(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vmaxu_tumu(b, s, s, v, vl * 4); @@ -494,7 +491,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -516,7 +513,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = 
__riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -537,7 +534,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(len - i); auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -558,7 +555,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(len - i); auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -579,8 +576,8 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e32m8(len - i); auto v1 = __riscv_vlse32_v_i32m8(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m8(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -656,7 +653,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 
= __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -667,7 +664,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m2(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl * 4), s, __riscv_vsetvlmax_e16m1()); @@ -679,7 +676,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -701,7 +698,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -721,7 +718,7 @@ struct MaskedNormDiffL1_RVV { vl = 
__riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -741,7 +738,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -762,8 +759,8 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e32m4(len - i); auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - // auto v = custom_intrin::__riscv_vabd(v1, v2, vl); // 5.x - auto v = custom_intrin::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x + // auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl); @@ -838,7 +835,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -849,7 +846,7 @@ struct 
MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m1(len - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m4(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m4(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl * 4), s, vl * 4); @@ -861,7 +858,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse8_v_u8m4(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m4(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -882,7 +879,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse8_v_i8m4(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m4(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -903,7 +900,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse16_v_u16m2(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m2(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -925,7 +922,7 @@ struct 
MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse16_v_i16m2(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m2(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -947,7 +944,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -1081,9 +1078,8 @@ CV_HAL_RVV_DEF_NORM_DIFF_ALL(64f, double, double, double, double) } using NormDiffFunc = int (*)(const uchar*, const uchar*, const uchar*, uchar*, int, int); -inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, - size_t mask_step, int width, int height, int type, int norm_type, double* result) -{ +int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); bool relative = norm_type & NORM_RELATIVE; @@ -1207,7 +1203,7 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size if(relative) { double result_; - int ret = cv::cv_hal_rvv::norm::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_); + int ret = cv::rvv_hal::core::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_); if(ret == CV_HAL_ERROR_OK) { *result /= result_ + 
DBL_EPSILON; @@ -1217,6 +1213,6 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size return CV_HAL_ERROR_OK; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp b/hal/riscv-rvv/src/core/norm_hamming.cpp similarity index 89% rename from hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp rename to hal/riscv-rvv/src/core/norm_hamming.cpp index 9c19f62b7e..7a0951f3bc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp +++ b/hal/riscv-rvv/src/core/norm_hamming.cpp @@ -4,18 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_normHamming8u -#define cv_hal_normHamming8u cv::cv_hal_rvv::normHamming8u -#undef cv_hal_normHammingDiff8u -#define cv_hal_normHammingDiff8u cv::cv_hal_rvv::normHammingDiff8u +#if CV_HAL_RVV_1P0_ENABLED template inline void normHammingCnt_m8(vuint8m8_t v, vbool1_t mask, size_t len_bool, size_t& result) @@ -153,7 +146,7 @@ inline void normHammingDiff8uLoop(const uchar* a, const uchar* b, size_t n, size } } -inline int normHamming8u(const uchar* a, int n, int cellSize, int* result) +int normHamming8u(const uchar* a, int n, int cellSize, int* result) { size_t _result = 0; @@ -168,7 +161,7 @@ inline int normHamming8u(const uchar* a, int n, int cellSize, int* result) return CV_HAL_ERROR_OK; } -inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) +int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) { size_t _result = 0; @@ -183,6 +176,6 @@ inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // 
CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/sincos.hpp b/hal/riscv-rvv/src/core/polar_to_cart.cpp similarity index 61% rename from hal/riscv-rvv/hal_rvv_1p0/sincos.hpp rename to hal/riscv-rvv/src/core/polar_to_cart.cpp index 776d58f42c..bb5824ca49 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/sincos.hpp +++ b/hal/riscv-rvv/src/core/polar_to_cart.cpp @@ -1,16 +1,16 @@ // This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED -#define OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace detail { +#if CV_HAL_RVV_1P0_ENABLED + +namespace { static constexpr size_t sincos_mask = 0x3; @@ -67,6 +67,44 @@ static inline void cosval = __riscv_vfneg_mu(__riscv_vmor(idx1, idx2, vl), cosval, cosval, vl); } -}}} // namespace cv::cv_hal_rvv::detail +template +inline int polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) +{ + using T = RVV_F32M4; + const auto sincos_scale = angleInDegrees ? 
sincos_deg_scale : sincos_rad_scale; -#endif // OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED + size_t vl; + auto cos_p2 = T::vmv(sincos_cos_p2, T::setvlmax()); + auto cos_p0 = T::vmv(sincos_cos_p0, T::setvlmax()); + for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) + { + vl = RVV_T::setvl(len); + auto vangle = T::cast(RVV_T::vload(angle, vl), vl); + T::VecType vsin, vcos; + SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); + if (mag) + { + auto vmag = T::cast(RVV_T::vload(mag, vl), vl); + vsin = __riscv_vfmul(vsin, vmag, vl); + vcos = __riscv_vfmul(vcos, vmag, vl); + mag += vl; + } + RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); + RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees) { + return polarToCart(mag, angle, x, y, len, angleInDegrees); +} +int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees) { + return polarToCart(mag, angle, x, y, len, angleInDegrees); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/qr.hpp b/hal/riscv-rvv/src/core/qr.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/qr.hpp rename to hal/riscv-rvv/src/core/qr.cpp index a7085e062b..1bb471a5aa 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/qr.hpp +++ b/hal/riscv-rvv/src/core/qr.cpp @@ -4,22 +4,17 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED -#define OPENCV_HAL_RVV_QR_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace qr { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_QR32f -#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR -#undef cv_hal_QR64f -#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::QRImpl @@ -171,6 +166,15 @@ inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t sr return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info) { + return QR(src1, src1_step, m, n, k, src2, src2_step, dst, info); +} +int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info) { + return QR(src1, src1_step, m, n, k, src2, src2_step, dst, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/split.hpp b/hal/riscv-rvv/src/core/split.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/split.hpp rename to hal/riscv-rvv/src/core/split.cpp index 9646fd9f67..1a843c939e 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/split.hpp +++ b/hal/riscv-rvv/src/core/split.cpp @@ -1,17 +1,14 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
-#ifndef OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED -#define OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED -#include +#include "rvv_hal.hpp" -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_split8u -#define cv_hal_split8u cv::cv_hal_rvv::split8u +#if CV_HAL_RVV_1P0_ENABLED -inline int split8u(const uchar* src, uchar** dst, int len, int cn) +int split8u(const uchar* src, uchar** dst, int len, int cn) { int vl = 0; if (cn == 1) @@ -89,5 +86,6 @@ inline int split8u(const uchar* src, uchar** dst, int len, int cn) return CV_HAL_ERROR_OK; } -}} -#endif +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/sqrt.cpp b/hal/riscv-rvv/src/core/sqrt.cpp new file mode 100644 index 0000000000..7186f1bcca --- /dev/null +++ b/hal/riscv-rvv/src/core/sqrt.cpp @@ -0,0 +1,74 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +template +inline int sqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, common::sqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +template +inline int invSqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, common::invSqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int sqrt32f(const float* src, float* dst, int len) { + return sqrt>(src, dst, len); +} +int sqrt64f(const double* src, double* dst, int len) { + return sqrt>(src, dst, len); +} + +int invSqrt32f(const float* src, float* dst, int len) { +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. + return invSqrt>(src, dst, len); +#else + return invSqrt>(src, dst, len); +#endif +} +int invSqrt64f(const double* src, double* dst, int len) { +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. 
+ return invSqrt>(src, dst, len); +#else + return invSqrt>(src, dst, len); +#endif +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/svd.hpp b/hal/riscv-rvv/src/core/svd.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/svd.hpp rename to hal/riscv-rvv/src/core/svd.cpp index 2ecad0671e..8454b60a85 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/svd.hpp +++ b/hal/riscv-rvv/src/core/svd.cpp @@ -4,22 +4,17 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED -#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace svd { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_SVD32f -#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD -#undef cv_hal_SVD64f -#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/lapack.cpp, // in the function template static void cv::JacobiSVDImpl_ @@ -268,6 +263,15 @@ inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step, return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags) { + return SVD(src, src_step, w, u, u_step, vt, vt_step, m, n, flags); +} +int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { + return SVD(src, src_step, w, u, u_step, vt, vt_step, m, n, flags); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/transpose.hpp b/hal/riscv-rvv/src/core/transpose.cpp similarity index 71% rename from hal/riscv-rvv/hal_rvv_1p0/transpose.hpp rename to hal/riscv-rvv/src/core/transpose.cpp index 
10bf9b4d3e..9881c3db90 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/transpose.hpp +++ b/hal/riscv-rvv/src/core/transpose.cpp @@ -5,12 +5,7 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED -#define OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { namespace transpose { +#include "rvv_hal.hpp" #if defined (__clang__) && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \ @@ -35,18 +30,22 @@ namespace cv { namespace cv_hal_rvv { namespace transpose { #define __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(i64, 1, v0, v1, v2, v3, v4, v5, v6, v7) #endif +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_8u_8xVl = [](const uchar *src, size_t src_step, uchar *dst, size_t dst_step, const int vl) { + auto transpose_8u_8xVl = [](const uchar *src, size_t sstep, uchar *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle8_v_u8m1(src, vl); - auto v1 = __riscv_vle8_v_u8m1(src + src_step, vl); - auto v2 = __riscv_vle8_v_u8m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle8_v_u8m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle8_v_u8m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle8_v_u8m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle8_v_u8m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle8_v_u8m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle8_v_u8m1(src + sstep, vl); + auto v2 = __riscv_vle8_v_u8m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle8_v_u8m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle8_v_u8m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle8_v_u8m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle8_v_u8m1(src + 6 * sstep, vl); + auto v7 = 
__riscv_vle8_v_u8m1(src + 7 * sstep, vl); vuint8m1x8_t v = __riscv_vcreate_v_u8m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e8(dst, dst_step, v, vl); + __riscv_vssseg8e8(dst, dstep, v, vl); }; int h = 0, w = 0; @@ -72,17 +71,17 @@ static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_da } static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_16u_8xVl = [](const ushort *src, size_t src_step, ushort *dst, size_t dst_step, const int vl) { + auto transpose_16u_8xVl = [](const ushort *src, size_t sstep, ushort *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle16_v_u16m1(src, vl); - auto v1 = __riscv_vle16_v_u16m1(src + src_step, vl); - auto v2 = __riscv_vle16_v_u16m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle16_v_u16m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle16_v_u16m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle16_v_u16m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle16_v_u16m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle16_v_u16m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle16_v_u16m1(src + sstep, vl); + auto v2 = __riscv_vle16_v_u16m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle16_v_u16m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle16_v_u16m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle16_v_u16m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle16_v_u16m1(src + 6 * sstep, vl); + auto v7 = __riscv_vle16_v_u16m1(src + 7 * sstep, vl); vuint16m1x8_t v = __riscv_vcreate_v_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e16(dst, dst_step, v, vl); + __riscv_vssseg8e16(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(ushort); @@ -111,13 +110,13 @@ static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_d } static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_32s_4xVl 
= [](const int *src, size_t src_step, int *dst, size_t dst_step, const int vl) { + auto transpose_32s_4xVl = [](const int *src, size_t sstep, int *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle32_v_i32m1(src, vl); - auto v1 = __riscv_vle32_v_i32m1(src + src_step, vl); - auto v2 = __riscv_vle32_v_i32m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle32_v_i32m1(src + 3 * src_step, vl); + auto v1 = __riscv_vle32_v_i32m1(src + sstep, vl); + auto v2 = __riscv_vle32_v_i32m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle32_v_i32m1(src + 3 * sstep, vl); vint32m1x4_t v = __riscv_vcreate_v_i32m1x4(v0, v1, v2, v3); - __riscv_vssseg4e32(dst, dst_step, v, vl); + __riscv_vssseg4e32(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(int); @@ -146,17 +145,17 @@ static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_d } static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_64s_8xVl = [](const int64_t *src, size_t src_step, int64_t *dst, size_t dst_step, const int vl) { + auto transpose_64s_8xVl = [](const int64_t *src, size_t sstep, int64_t *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle64_v_i64m1(src, vl); - auto v1 = __riscv_vle64_v_i64m1(src + src_step, vl); - auto v2 = __riscv_vle64_v_i64m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle64_v_i64m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle64_v_i64m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle64_v_i64m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle64_v_i64m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle64_v_i64m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle64_v_i64m1(src + sstep, vl); + auto v2 = __riscv_vle64_v_i64m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle64_v_i64m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle64_v_i64m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle64_v_i64m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle64_v_i64m1(src + 6 * sstep, 
vl); + auto v7 = __riscv_vle64_v_i64m1(src + 7 * sstep, vl); vint64m1x8_t v = __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e64(dst, dst_step, v, vl); + __riscv_vssseg8e64(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(int64_t); @@ -184,12 +183,9 @@ static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst } } -#undef cv_hal_transpose2d -#define cv_hal_transpose2d cv::cv_hal_rvv::transpose::transpose2d - using Transpose2dFunc = void (*)(const uchar*, size_t, uchar*, size_t, int, int); -inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int src_width, int src_height, int element_size) { +int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int src_width, int src_height, int element_size) { if (src_data == dst_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -215,6 +211,6 @@ inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::transpose +#endif // CV_HAL_RVV_1P0_ENABLED -#endif // OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp b/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp new file mode 100644 index 0000000000..0756f2e6c0 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp @@ -0,0 +1,361 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp +// in the functor BilateralFilter_8u_Invoker +static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) +{ + constexpr int align = 31; + std::vector _sum(width + align), _wsum(width + align); + float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const uchar* sptr = src_data + (i+radius) * src_step + radius; + memset(sum, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const uchar* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto src = __riscv_vle8_v_u8m2(sptr + j, vl); + auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl); + auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl); + auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto dst = __riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl); + 
} + } + + return CV_HAL_ERROR_OK; +} + +static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) +{ + constexpr int align = 31; + std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); + float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); + float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); + float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const uchar* sptr = src_data + (i+radius) * src_step + radius*3; + memset(sum_b, 0, sizeof(float) * width); + memset(sum_g, 0, sizeof(float) * width); + memset(sum_r, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const uchar* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); + auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); + auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); + src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl); + auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); + auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); + auto ksrc2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); + + auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl); + auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl); + auto diff2 = __riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl); + auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, 
vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl); + __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl); + __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl); + vuint8m2x3_t dst{}; + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl)); + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl)); + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp +// in the functor BilateralFilter_32f_Invoker +static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) +{ + constexpr int align = 31; + std::vector _sum(width + align), _wsum(width + align); + float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); + float* wsum = 
reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius; + memset(sum, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto src = __riscv_vle32_v_f32m4(sptr + j, vl); + auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl); + auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl); + auto idx = __riscv_vfcvt_rtz_x(diff, vl); + auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); + + auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl); + auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m4(wsum + j, vl), vl), vl); + __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto src = __riscv_vle32_v_f32m4(sptr + j, vl); + auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl); + __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) +{ + constexpr int align = 31; + std::vector 
_sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); + float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); + float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); + float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius*3; + memset(sum_b, 0, sizeof(float) * width); + memset(sum_g, 0, sizeof(float) * width); + memset(sum_r, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl); + auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl); + auto idx = __riscv_vfcvt_rtz_x(diff, vl); + auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); + + auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl); + auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + 
__riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl); + __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl); + __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + j, vl), vl), vl); + __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl); + auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl)); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl)); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl)); + __riscv_vsseg3e32(reinterpret_cast(dst_data + i * dst_step) + j * 3, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp +// in the function static void bilateralFilter_8u and bilateralFilter_32f +int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type) +{ + const int type = CV_MAKETYPE(depth, cn); + if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && type != CV_32FC3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (type == CV_32FC1 && width * height > 1 << 20) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_data == dst_data || 
border_type & BORDER_ISOLATED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + sigma_color = sigma_color <= 0 ? 1 : sigma_color; + sigma_space = sigma_space <= 0 ? 1 : sigma_space; + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2; + radius = std::max(radius, 1); + d = radius*2 + 1; + + const int size = depth == CV_32F ? cn * sizeof(float) : cn; + const int temp_step = (width + radius * 2) * size; + std::vector _temp((width + radius * 2) * (height + radius * 2) * size, 0); + uchar* temp = _temp.data(); + std::vector width_interpolate(radius * 2); + for (int j = 0; j < radius; j++) + { + width_interpolate[j] = common::borderInterpolate(j - radius, width, border_type); + width_interpolate[j + radius] = common::borderInterpolate(width + j, width, border_type); + } + for (int i = 0; i < height + radius * 2; i++) + { + int x = common::borderInterpolate(i - radius, height, border_type); + if (x != -1) + { + for (int j = 0; j < radius; j++) + { + int y = width_interpolate[j]; + if (y != -1) + memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size); + y = width_interpolate[j + radius]; + if (y != -1) + memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size); + } + memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size); + } + } + + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* space_weight = _space_weight.data(); + int* space_ofs = _space_ofs.data(); + int maxk = 0; + for (int i = -radius; i <= radius; i++) + { + for (int j = -radius; j <= radius; j++) + { + double r = std::sqrt((double)i*i + (double)j*j); + if (r <= radius && (depth == CV_8U || i != 0 || j != 0)) + { + space_weight[maxk] = static_cast(r*r*gauss_space_coeff); + space_ofs[maxk++] = (i * (temp_step / size) + j) * cn; + } + } + } + 
cv::rvv_hal::core::exp32f(space_weight, space_weight, maxk); + + if (depth == CV_8U) + { + std::vector _color_weight(cn*256); + float* color_weight = _color_weight.data(); + for (int i = 0; i < 256*cn; i++) + color_weight[i] = static_cast(i*i*gauss_color_coeff); + cv::rvv_hal::core::exp32f(color_weight, color_weight, 256*cn); + + switch (cn) + { + case 1: + return common::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); + case 3: + return common::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); + } + } + else + { + double minValSrc = -1, maxValSrc = 1; + cv::rvv_hal::core::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr); + if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) + { + for (int i = 0; i < width; i++) + memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size); + return CV_HAL_ERROR_OK; + } + + const int kExpNumBinsPerChannel = 1 << 12; + const int kExpNumBins = kExpNumBinsPerChannel * cn; + const float scale_index = kExpNumBins / static_cast((maxValSrc - minValSrc) * cn); + std::vector _expLUT(kExpNumBins+2, 0); + float* expLUT = _expLUT.data(); + for (int i = 0; i < kExpNumBins+2; i++) + { + double val = i / scale_index; + expLUT[i] = static_cast(val * val * gauss_color_coeff); + } + cv::rvv_hal::core::exp32f(expLUT, expLUT, kExpNumBins+2); + + switch (cn) + { + case 1: + return common::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); + case 3: + return common::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); + } + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc 
diff --git a/hal/riscv-rvv/src/imgproc/box_filter.cpp b/hal/riscv-rvv/src/imgproc/box_filter.cpp new file mode 100644 index 0000000000..8a91ef57bb --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/box_filter.cpp @@ -0,0 +1,392 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +template struct rvv; +template<> struct rvv +{ + static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); } + static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); } + static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); } + static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); } + static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; } + static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { return a; } + static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); } +}; + +// the algorithm is same as 
cv_hal_sepFilter +template +static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + using T = typename helperT::ElemType; + using WT = typename helperWT::ElemType; + + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + std::vector res(width * ksize); + auto process = [&](int x, int y) { + WT sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += reinterpret_cast(src_data + x * src_step)[p]; + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = anchor_x, right = width - (ksize - 1 - anchor_x); + for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = helperT::setvl(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - anchor_x; + auto src = rvv::vcvt0(helperT::vload(extra, vl), vl); + + extra += vl; + auto sum = src; + src = helperWT::vslide1down(src, extra[0], vl); + sum = helperWT::vadd(sum, src, vl); + src = helperWT::vslide1down(src, extra[1], vl); + sum = helperWT::vadd(sum, src, vl); + if (ksize == 5) + { + src = helperWT::vslide1down(src, 
extra[2], vl); + sum = helperWT::vadd(sum, src, vl); + src = helperWT::vslide1down(src, extra[3], vl); + sum = helperWT::vadd(sum, src, vl); + } + helperWT::vstore(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - (ksize - 1 - anchor_y); + if (cur >= start) + { + const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const WT* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = helperWT::setvl(width - j); + auto sum = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); + if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl); + if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl); + if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl); + if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl); + if (normalize) sum = rvv::vdiv(sum, ksize * ksize, vl); + + if (cast) + { + helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, rvv::vcvt1(sum, vl), vl); + } + else + { + helperWT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - anchor_y, 
full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; }; + + std::vector res(width * ksize * 3); + auto process = [&](int x, int y) { + float sum0, sum1, sum2; + sum0 = sum1 = sum2 = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum0 += reinterpret_cast(src_data + x * src_step)[p * 3 ]; + sum1 += reinterpret_cast(src_data + x * src_step)[p * 3 + 1]; + sum2 += reinterpret_cast(src_data + x * src_step)[p * 3 + 2]; + } + } + res[p2idx(x, y) ] = sum0; + res[p2idx(x, y) + 1] = sum1; + res[p2idx(x, y) + 2] = sum2; + }; + + const int left = anchor_x, right = width - (ksize - 1 - anchor_x); + for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e32m2(right - j); + const float* extra = reinterpret_cast(src_data + i * src_step) + (j - anchor_x) * 3; + auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + extra += vl * 3; + auto sum0 = src0, sum1 = src1, sum2 = src2; + src0 = __riscv_vfslide1down(src0, extra[0], vl); + src1 = __riscv_vfslide1down(src1, extra[1], vl); + src2 = __riscv_vfslide1down(src2, extra[2], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + src0 = __riscv_vfslide1down(src0, 
extra[3], vl); + src1 = __riscv_vfslide1down(src1, extra[4], vl); + src2 = __riscv_vfslide1down(src2, extra[5], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + if (ksize == 5) + { + src0 = __riscv_vfslide1down(src0, extra[6], vl); + src1 = __riscv_vfslide1down(src1, extra[7], vl); + src2 = __riscv_vfslide1down(src2, extra[8], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + src0 = __riscv_vfslide1down(src0, extra[ 9], vl); + src1 = __riscv_vfslide1down(src1, extra[10], vl); + src2 = __riscv_vfslide1down(src2, extra[11], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + } + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); + __riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl); + } + } + } + + int cur = i - (ksize - 1 - anchor_y); + if (cur >= start) + { + const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const float* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + vfloat32m2_t sum0, sum1, sum2; + sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl); + auto loadres = [&](const float* row) { + if (!row) return; + auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl); + sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl); + sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl); + sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl); + }; + loadres(row0); + loadres(row1); + loadres(row2); + loadres(row3); + loadres(row4); + if (normalize) + { + sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl); + sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl); + sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl); + } + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); + __riscv_vsseg3e32(reinterpret_cast(dst_data + cur * dst_step) + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn); + if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + uchar* _dst_data = dst_data; + size_t _dst_step = dst_step; + const size_t size = CV_ELEM_SIZE(dst_type); + std::vector dst; + if (src_data == _dst_data) + { + dst = std::vector(width * height * 
size); + dst_data = dst.data(); + dst_step = width * size; + } + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? ksize_height / 2 : anchor_y; + if (src_type != dst_type) + { + if (src_type == CV_8UC1 && dst_type == CV_16UC1) + { + if (ksize_width == 3) + { + res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + } + if (ksize_width == 5) + { + res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + } + } + } + else + { + switch (ksize_width*100 + src_type) + { + case 300 + CV_8UC1: + res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_8UC1: + res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_16SC1: + res = common::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_16SC1: + res = common::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, 
width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32SC1: + res = common::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32SC1: + res = common::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32FC1: + res = common::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32FC1: + res = common::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32FC3: + res = common::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32FC3: + res = common::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + } + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if 
(src_data == _dst_data) + { + for (int i = 0; i < height; i++) + memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); + } + + return res; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/color.hpp b/hal/riscv-rvv/src/imgproc/color.cpp similarity index 90% rename from hal/riscv-rvv/hal_rvv_1p0/color.hpp rename to hal/riscv-rvv/src/imgproc/color.cpp index c715c6ad38..1b7ee0a4d3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/color.hpp +++ b/hal/riscv-rvv/src/imgproc/color.cpp @@ -4,12 +4,12 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_COLOR_HPP_INCLUDED -#define OPENCV_HAL_RVV_COLOR_HPP_INCLUDED +#include "rvv_hal.hpp" +#include -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED namespace color { class ColorInvoker : public ParallelLoopBody @@ -41,11 +41,9 @@ namespace color { { return val - std::remainder(val, 1.0); } -} // cv::cv_hal_rvv::color +} // cv::rvv_hal::color namespace BGRtoBGR { -#undef cv_hal_cvtBGRtoBGR -#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::BGRtoBGR::cvtBGRtoBGR template struct rvv; template<> struct rvv @@ -206,27 +204,26 @@ static inline int cvtBGRtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) +} // BGRtoBGR + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) { if ((scn != 3 && scn != 4) || (dcn != 3 && dcn != 4)) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return 
BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); case CV_16U: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); case CV_32F: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoBGR namespace GraytoBGR { -#undef cv_hal_cvtGraytoBGR -#define cv_hal_cvtGraytoBGR cv::cv_hal_rvv::GraytoBGR::cvtGraytoBGR template struct rvv; template<> struct rvv @@ -337,27 +334,26 @@ static inline int cvtGraytoBGR(int start, int end, const T * src, size_t src_ste return CV_HAL_ERROR_OK; } -inline int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn) +} // GraytoBGR + +int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); case CV_16U: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, 
dcn); case CV_32F: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::GraytoBGR namespace BGRtoGray { -#undef cv_hal_cvtBGRtoGray -#define cv_hal_cvtBGRtoGray cv::cv_hal_rvv::BGRtoGray::cvtBGRtoGray template struct rvv; template<> struct rvv @@ -462,27 +458,26 @@ static inline int cvtBGRtoGray(int start, int end, const T * src, size_t src_ste return CV_HAL_ERROR_OK; } -inline int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) +} // BGRtoGray + +int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, 
reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoGray namespace BGR5x5toBGR { -#undef cv_hal_cvtBGR5x5toBGR -#define cv_hal_cvtBGR5x5toBGR cv::cv_hal_rvv::BGR5x5toBGR::cvtBGR5x5toBGR // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB5x52RGB @@ -540,18 +535,17 @@ static inline int cvtBGR5x5toBGR_u(int start, int end, const ushort * src, size_ return CV_HAL_ERROR_OK; } -inline int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits) +} // BGR5x5toBGR + +int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits) { if ((dcn != 3 && dcn != 4) || (greenBits != 5 && greenBits != 6)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGR5x5toBGR_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, greenBits); + return color::invoke(width, height, {BGR5x5toBGR::cvtBGR5x5toBGR_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, greenBits); } -} // cv::cv_hal_rvv::BGR5x5toBGR namespace BGRtoBGR5x5 { -#undef cv_hal_cvtBGRtoBGR5x5 -#define cv_hal_cvtBGRtoBGR5x5 cv::cv_hal_rvv::BGRtoBGR5x5::cvtBGRtoBGR5x5 // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB2RGB5x5 @@ -604,18 +598,17 @@ static inline int cvtBGRtoBGR5x5_u(int start, int end, const uchar * src, size_t return CV_HAL_ERROR_OK; } -inline int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits) +} // BGRtoBGR5x5 + +int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, 
bool swapBlue, int greenBits) { if ((scn != 3 && scn != 4) || (greenBits != 5 && greenBits != 6)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGRtoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, greenBits); + return color::invoke(width, height, {BGRtoBGR5x5::cvtBGRtoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, greenBits); } -} // cv::cv_hal_rvv::BGRtoBGR5x5 namespace BGR5x5toGray { -#undef cv_hal_cvtBGR5x5toGray -#define cv_hal_cvtBGR5x5toGray cv::cv_hal_rvv::BGR5x5toGray::cvtBGR5x5toGray // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB5x52Gray @@ -654,18 +647,17 @@ static inline int cvtBGR5x5toGray_u(int start, int end, const ushort * src, size return CV_HAL_ERROR_OK; } -inline int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) +} // BGR5x5toGray + +int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) { if (greenBits != 5 && greenBits != 6) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGR5x5toGray_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); + return color::invoke(width, height, {BGR5x5toGray::cvtBGR5x5toGray_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); } -} // cv::cv_hal_rvv::BGR5x5toGray namespace GraytoBGR5x5 { -#undef cv_hal_cvtGraytoBGR5x5 -#define cv_hal_cvtGraytoBGR5x5 cv::cv_hal_rvv::GraytoBGR5x5::cvtGraytoBGR5x5 // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct Gray2RGB5x5 @@ -697,18 +689,17 @@ static inline int cvtGraytoBGR5x5_u(int start, int end, const uchar * src, size_ return CV_HAL_ERROR_OK; } -inline int 
cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) +} // GraytoBGR5x5 + +int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) { if (greenBits != 5 && greenBits != 6) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtGraytoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); + return color::invoke(width, height, {GraytoBGR5x5::cvtGraytoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); } -} // cv::cv_hal_rvv::GraytoBGR5x5 namespace YUVtoBGR { -#undef cv_hal_cvtYUVtoBGR -#define cv_hal_cvtYUVtoBGR cv::cv_hal_rvv::YUVtoBGR::cvtYUVtoBGR template struct rvv; template<> struct rvv @@ -857,27 +848,26 @@ static inline int cvtYUVtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) +} // YUVtoBGR + +int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); case CV_16U: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, 
{YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); case CV_32F: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::YUVtoBGR namespace BGRtoYUV { -#undef cv_hal_cvtBGRtoYUV -#define cv_hal_cvtBGRtoYUV cv::cv_hal_rvv::BGRtoYUV::cvtBGRtoYUV template struct rvv; template<> struct rvv @@ -1027,31 +1017,26 @@ static inline int cvtBGRtoYUV(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) +} // BGRtoYUV + +int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); case CV_16U: - return color::invoke(width, height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); case CV_32F: - return color::invoke(width, 
height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoYUV namespace PlaneYUVtoBGR { -#undef cv_hal_cvtOnePlaneYUVtoBGR -#define cv_hal_cvtOnePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtOnePlaneYUVtoBGR -#undef cv_hal_cvtTwoPlaneYUVtoBGR -#define cv_hal_cvtTwoPlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtTwoPlaneYUVtoBGR -#undef cv_hal_cvtThreePlaneYUVtoBGR -#define cv_hal_cvtThreePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtThreePlaneYUVtoBGR static const int ITUR_BT_601_SHIFT = 20; static const int ITUR_BT_601_CY = 1220542; @@ -1241,22 +1226,24 @@ static inline int cvtMultiPlaneYUVtoBGR(int start, int end, uchar * dst_data, si return CV_HAL_ERROR_OK; } -inline int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx) +} // PlaneYUVtoBGR + +int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(dst_width, dst_height, {cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx); + return color::invoke(dst_width, dst_height, {PlaneYUVtoBGR::cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx); } -inline int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) +int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * 
dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; const uchar* uv = src_data + src_step * static_cast(dst_height); - return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx); + return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx); } -inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) +int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1267,17 +1254,10 @@ inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar int vstepIdx = dst_height % 4 == 2 ? 
1 : 0; if (uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } - return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1); + return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1); } -} // cv::cv_hal_rvv::PlaneYUVtoBGR namespace PlaneBGRtoYUV { -#undef cv_hal_cvtOnePlaneBGRtoYUV -#define cv_hal_cvtOnePlaneBGRtoYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtOnePlaneBGRtoYUV -#undef cv_hal_cvtBGRtoTwoPlaneYUV -#define cv_hal_cvtBGRtoTwoPlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoTwoPlaneYUV -#undef cv_hal_cvtBGRtoThreePlaneYUV -#define cv_hal_cvtBGRtoThreePlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoThreePlaneYUV static const int ITUR_BT_601_SHIFT = 20; static const int ITUR_BT_601_CBY = 102760; // 0.114035 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT) @@ -1512,35 +1492,34 @@ static inline int cvtBGRtoMultiPlaneYUV(int start, int end, uchar * yData, uchar return CV_HAL_ERROR_OK; } -inline int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx) +} // PlaneBGRtoYUV + +int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx); + return color::invoke(width, height, {PlaneBGRtoYUV::cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx); } -inline int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, +int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t 
src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx) { if (y_step != uv_step || (scn != 3 && scn != 4)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2); + return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2); } -inline int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) +int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; uchar* uv_data = dst_data + dst_step * static_cast(height); - return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 3 : 2); + return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 
3 : 2); } -} // cv::cv_hal_rvv::PlaneBGRtoYUV namespace HSVtoBGR { -#undef cv_hal_cvtHSVtoBGR -#define cv_hal_cvtHSVtoBGR cv::cv_hal_rvv::HSVtoBGR::cvtHSVtoBGR template static inline int cvtHSVtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isFullRange, bool isHSV); @@ -1710,25 +1689,24 @@ inline int cvtHSVtoBGR(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) +} // HSVtoBGR + +int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); case CV_32F: - return color::invoke(width, height, {cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::HSVtoBGR namespace BGRtoHSV { -#undef cv_hal_cvtBGRtoHSV -#define cv_hal_cvtBGRtoHSV cv::cv_hal_rvv::BGRtoHSV::cvtBGRtoHSV template static inline int cvtBGRtoHSV(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int scn, bool 
swapBlue, bool isFullRange, bool isHSV); @@ -1870,25 +1848,24 @@ inline int cvtBGRtoHSV(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) +} // BGRtoHSV + +int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); case CV_32F: - return color::invoke(width, height, {cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoHSV namespace XYZtoBGR { -#undef cv_hal_cvtXYZtoBGR -#define cv_hal_cvtXYZtoBGR cv::cv_hal_rvv::XYZtoBGR::cvtXYZtoBGR template struct rvv; template<> struct rvv @@ -2042,27 +2019,26 @@ static inline int cvtXYZtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue) +} // XYZtoBGR + +int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int 
width, int height, int depth, int dcn, bool swapBlue) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::XYZtoBGR namespace BGRtoXYZ { -#undef cv_hal_cvtBGRtoXYZ -#define cv_hal_cvtBGRtoXYZ cv::cv_hal_rvv::BGRtoXYZ::cvtBGRtoXYZ template struct rvv; template<> struct rvv @@ -2209,23 +2185,24 @@ static inline int cvtBGRtoXYZ(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) +} // BGRtoXYZ + +int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, 
reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoXYZ namespace LabTable { @@ -2495,11 +2472,9 @@ namespace LabTable return __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(__riscv_vget_v_f32m2x4_f32m2(val, 3), x, __riscv_vget_v_f32m2x4_f32m2(val, 2), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 1), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 0), vl); } }; -} // cv::cv_hal_rvv::LabTable +} // cv::rvv_hal::imgproc::LabTable namespace LabtoBGR { -#undef cv_hal_cvtLabtoBGR -#define cv_hal_cvtLabtoBGR cv::cv_hal_rvv::LabtoBGR::cvtLabtoBGR template static inline int cvtLabtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isLab, bool srgb); @@ -2713,25 +2688,24 @@ inline int cvtLabtoBGR(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb) +} // LabtoBGR + +int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * 
dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); + return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); case CV_32F: - return color::invoke(width, height, {cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); + return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::LabtoBGR namespace BGRtoLab { -#undef cv_hal_cvtBGRtoLab -#define cv_hal_cvtBGRtoLab cv::cv_hal_rvv::BGRtoLab::cvtBGRtoLab struct rvv_base { @@ -3060,31 +3034,126 @@ static inline int cvtBGRtoLab_f(int start, int end, const float * src, size_t sr return CV_HAL_ERROR_OK; } -inline int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb) +} // BGRtoLab + +int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - auto cvtBGRtoLab_b = cvtBGRtoLab_u; + auto cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; if (!isLab && !srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; else if (!isLab && srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; else if (isLab && !srgb) - cvtBGRtoLab_b = 
cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; switch (depth) { case CV_8U: return color::invoke(width, height, {cvtBGRtoLab_b}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoLab_f}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isLab, srgb); + return color::invoke(width, height, {BGRtoLab::cvtBGRtoLab_f}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isLab, srgb); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoLab -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +#if CV_HAL_RVV_071_ENABLED + +static const unsigned char index_array_32 [32] + { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 }; + +static const unsigned char index_array_24 [24] + { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 }; + +static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize) +{ + vuint8m2_t vec_index = vle8_v_u8m2(index, vsize); + + int i = 0; + + for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize) + { + vuint8m2_t vec_src = vle8_v_u8m2(src, vsize); + vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize); + vse8_v_u8m2(dst, vec_dst, vsize); + } + + for ( ; i < n; i++, src += scn, dst += dcn ) + { + unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; + dst[2] = t0; + dst[1] = t1; + dst[0] = t2; + if(dcn == 4) + { + unsigned char d = src[3]; + dst[3] = d; + } + } +} + +static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi) +{ + for (int i = 0; i < n; i++, src += scn, dst += dcn) + { + unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; + dst[bi ] = t0; + dst[1] = t1; 
+ dst[bi^2] = t2; + if(dcn == 4) + { + unsigned char d = scn == 4 ? src[3] : std::numeric_limits::max(); + dst[3] = d; + } + } +} + +int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) +{ + if (depth != CV_8U) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int blueIdx = swapBlue ? 2 : 0; + if (scn == dcn) + { + if (!swapBlue) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int vsize_pixels = 8; + + if (scn == 4) + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + { + vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32); + } + } + else + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + { + vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24); + } + } + } + else + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx); + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_071_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/common.hpp b/hal/riscv-rvv/src/imgproc/common.hpp new file mode 100644 index 0000000000..819b43421c --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/common.hpp @@ -0,0 +1,76 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Third party copyrights are property of their respective owners. 
+ +#ifndef OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED +#define OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED + +#include "opencv2/core/hal/interface.h" +#include "opencv2/imgproc/hal/interface.h" + +namespace cv { namespace rvv_hal { namespace imgproc { namespace common { + +inline int borderInterpolate( int p, int len, int borderType ) +{ + if ((unsigned)p < (unsigned)len) + ; + else if (borderType == CV_HAL_BORDER_REPLICATE) + p = p < 0 ? 0 : len - 1; + else if (borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101) + { + int delta = borderType == CV_HAL_BORDER_REFLECT_101; + if (len == 1) + return 0; + do + { + if (p < 0) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } + while( (unsigned)p >= (unsigned)len ); + } + else if (borderType == CV_HAL_BORDER_WRAP) + { + if (p < 0) + p -= ((p-len+1)/len)*len; + if (p >= len) + p %= len; + } + else if (borderType == CV_HAL_BORDER_CONSTANT) + p = -1; + return p; +} + +class FilterInvoker : public ParallelLoopBody +{ +public: + template + FilterInvoker(std::function _func, Args&&... args) + { + func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); + } + + virtual void operator()(const Range& range) const override + { + func(range.start, range.end); + } + +private: + std::function func; +}; + +template +inline int invoke(int height, std::function func, Args&&... args) +{ + cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); + return func(0, 1, std::forward(args)...); +} + +}}}} // cv::rvv_hal::imgproc::common + +#endif // OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED diff --git a/hal/riscv-rvv/src/imgproc/filter.cpp b/hal/riscv-rvv/src/imgproc/filter.cpp new file mode 100644 index 0000000000..f23b56e01d --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/filter.cpp @@ -0,0 +1,264 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct Filter2D +{ + const uchar* kernel_data; + size_t kernel_step; + int kernel_type; + int kernel_width; + int kernel_height; + int src_type; + int dst_type; + int borderType; + double delta; + int anchor_x; + int anchor_y; +}; + +static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst) +{ + int vl; + for (int i = left; i < right; i += vl) + { + vl = __riscv_vsetvl_e8m1(right - i); + auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); + + auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { + a = __riscv_vfmacc(a, k0, b, vl); + b = __riscv_vfslide1down(b, r1, vl); + a = __riscv_vfmacc(a, k1, b, vl); + b = __riscv_vfslide1down(b, r2, vl); + return __riscv_vfmacc(a, k2, b, vl); + }; + auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { + if (!row) return; + + const uchar* extra = row + (i - anchor) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); + auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); + auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); + auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); + + extra += vl * 4; + s0 = addshift(s0, v0, k0, k1, k2, extra[0], 
extra[4]); + s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]); + s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]); + s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + vuint8m1x4_t val{}; + val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst + i * 4, val, vl); + } +} + +static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst) +{ + int vl; + for (int i = left; i < right; i += vl) + { + vl = __riscv_vsetvl_e8m1(right - i); + auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); + + auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { + a = __riscv_vfmacc(a, k0, b, vl); + b = __riscv_vfslide1down(b, r1, vl); + a = __riscv_vfmacc(a, k1, b, vl); + b = __riscv_vfslide1down(b, r2, vl); + a = __riscv_vfmacc(a, k2, b, vl); + b = __riscv_vfslide1down(b, r3, vl); + a = __riscv_vfmacc(a, k3, b, vl); + b = __riscv_vfslide1down(b, r4, vl); + return __riscv_vfmacc(a, k4, b, vl); + }; + auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { + if (!row) return; + + const uchar* extra = row + (i - anchor) * 4; + 
auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); + auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); + auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); + auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); + + extra += vl * 4; + s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]); + s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]); + s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]); + s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]); + }; + + loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]); + loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]); + loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]); + loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]); + loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]); + vuint8m1x4_t val{}; + val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst + i * 4, val, vl); + } +} + +// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp, +// in the function void CAROTENE_NS::convolution +template +static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int 
full_height, int offset_x, int offset_y) +{ + float kernel[ksize * ksize]; + for (int i = 0; i < ksize * ksize; i++) + { + kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; + } + + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x, int y) { + int pi, pj; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + pj = pj < 0 ? noval : pj; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? noval : pj - offset_x; + } + return std::make_pair(pi, pj); + }; + + auto process = [&](int x, int y) { + float sum0, sum1, sum2, sum3; + sum0 = sum1 = sum2 = sum3 = data->delta; + for (int i = 0; i < ksize * ksize; i++) + { + auto p = access(x + i / ksize, y + i % ksize); + if (p.first != noval && p.second != noval) + { + sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; + sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; + sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2]; + sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3]; + } + } + dst_data[(x * width + y) * 4 ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits::max())); + }; + + const int left = 
data->anchor_x, right = width - (ksize - 1 - data->anchor_x); + for (int i = start; i < end; i++) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; + if (ksize == 3) + { + process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); + } + else + { + const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; + const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step; + process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/) +{ + if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != 3 && kernel_width != 5) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; + *context = reinterpret_cast(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); + return CV_HAL_ERROR_OK; +} + +int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + Filter2D* data = reinterpret_cast(context); + std::vector dst(width * height * 4); + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernel_width) + { + case 3: + res = common::invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + break; + case 5: + res = common::invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + break; + } + + for (int i = 0; i < height; i++) + memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4); + return res; +} + +int filterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp b/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp new file mode 100644 index 0000000000..495efa4ee7 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp @@ -0,0 +1,389 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is same as cv_hal_sepFilter +template +static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) +{ + using T = typename helperT::ElemType; + using WT = typename helperWT::ElemType; + + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); // [TODO] fix dependencies + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; + std::vector res(width * ksize); + auto process = [&](int x, int y) { + WT sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += kernel[ksize == 5][i] * static_cast(reinterpret_cast(src_data + x * src_step)[p]); + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = ksize / 2, right = width - ksize / 2; + for (int i = start - ksize / 2; i < end + ksize / 2; i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = helperT::setvl(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - ksize / 2; + auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl); + + extra += vl; + 
auto sum = src; + if (ksize == 3) + { + src = __riscv_vslide1down(src, extra[0], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl); + src = __riscv_vslide1down(src, extra[1], vl); + sum = __riscv_vadd(sum, src, vl); + } + else + { + src = __riscv_vslide1down(src, extra[0], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); + src = __riscv_vslide1down(src, extra[1], vl); + sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl); + src = __riscv_vslide1down(src, extra[2], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); + src = __riscv_vslide1down(src, extra[3], vl); + sum = __riscv_vadd(sum, src, vl); + } + helperWT::vstore(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - ksize / 2; + if (cur >= start) + { + const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const WT* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = helperWT::setvl(width - j); + auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); + auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl); + auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl); + typename helperWT::VecType sum; + if (ksize == 3) + { + sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl); + } + else + { + sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl); + auto v3 = row3 ? 
helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl); + sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl); + auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl); + sum = __riscv_vadd(sum, v4, vl); + } + helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); + return pj < 0 ? 
noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; }; + + constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; + std::vector res(width * ksize * 4); + auto process = [&](int x, int y) { + ushort sum0, sum1, sum2, sum3; + sum0 = sum1 = sum2 = sum3 = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum0 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 ]); + sum1 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 1]); + sum2 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 2]); + sum3 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 3]); + } + } + res[p2idx(x, y) ] = sum0; + res[p2idx(x, y) + 1] = sum1; + res[p2idx(x, y) + 2] = sum2; + res[p2idx(x, y) + 3] = sum3; + }; + + const int left = ksize / 2, right = width - ksize / 2; + for (int i = start - ksize / 2; i < end + ksize / 2; i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m1(right - j); + const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl); + auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl); + auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl); + auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl); + + extra += vl * 4; + auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3; + if (ksize == 3) + { + src0 = __riscv_vslide1down(src0, extra[0], vl); + src1 = __riscv_vslide1down(src1, extra[1], vl); + src2 = __riscv_vslide1down(src2, 
extra[2], vl); + src3 = __riscv_vslide1down(src3, extra[3], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl); + src0 = __riscv_vslide1down(src0, extra[4], vl); + src1 = __riscv_vslide1down(src1, extra[5], vl); + src2 = __riscv_vslide1down(src2, extra[6], vl); + src3 = __riscv_vslide1down(src3, extra[7], vl); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + else + { + src0 = __riscv_vslide1down(src0, extra[0], vl); + src1 = __riscv_vslide1down(src1, extra[1], vl); + src2 = __riscv_vslide1down(src2, extra[2], vl); + src3 = __riscv_vslide1down(src3, extra[3], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + src0 = __riscv_vslide1down(src0, extra[4], vl); + src1 = __riscv_vslide1down(src1, extra[5], vl); + src2 = __riscv_vslide1down(src2, extra[6], vl); + src3 = __riscv_vslide1down(src3, extra[7], vl); + sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl); + src0 = __riscv_vslide1down(src0, extra[ 8], vl); + src1 = __riscv_vslide1down(src1, extra[ 9], vl); + src2 = __riscv_vslide1down(src2, extra[10], vl); + src3 = __riscv_vslide1down(src3, extra[11], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); 
+ sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + src0 = __riscv_vslide1down(src0, extra[12], vl); + src1 = __riscv_vslide1down(src1, extra[13], vl); + src2 = __riscv_vslide1down(src2, extra[14], vl); + src3 = __riscv_vslide1down(src3, extra[15], vl); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + + vuint16m2x4_t dst{}; + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3); + __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl); + } + } + } + + int cur = i - ksize / 2; + if (cur >= start) + { + const ushort* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const ushort* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e16m2(width - j); + vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{}; + sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl); + + auto loadres = [&](const ushort* row) { + auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl); + src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0); + src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1); + src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2); + src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3); + }; + if (row0) + { + loadres(row0); + sum0 = src0; + sum1 = src1; + sum2 = src2; + sum3 = src3; + } + if (row1) + { + loadres(row1); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 2 : 1, vl), vl); + } + if (row2) + { + loadres(row2); + if (ksize == 5) + { + src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl); + src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl); + src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl); + src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl); + } + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + if (row3) + { + loadres(row3); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + } + if (row4) + { + loadres(row4); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = 
__riscv_vadd(sum3, src3, vl); + } + + vuint8m1x4_t dst{}; + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) +{ + const int type = CV_MAKETYPE(depth, cn); + if ((type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1) || src_data == dst_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((ksize != 3 && ksize != 5) || border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (ksize*100 + type) + { + case 300 + CV_8UC1: + return common::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_8UC1: + return common::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 300 + CV_16UC1: + return common::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + 
CV_16UC1: + return common::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 300 + CV_8UC4: + return common::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_8UC4: + return common::invoke(height, {gaussianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp b/hal/riscv-rvv/src/imgproc/histogram.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/histogram.hpp rename to hal/riscv-rvv/src/imgproc/histogram.cpp index 48f6123b0d..fd6adc3be3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp +++ b/hal/riscv-rvv/src/imgproc/histogram.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED -#define OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace equalize_hist { -#undef cv_hal_equalize_hist -#define cv_hal_equalize_hist cv::cv_hal_rvv::equalize_hist::equalize_hist +namespace { class HistogramInvoker : public ParallelLoopBody { @@ -77,9 +74,11 @@ static inline void lut_invoke(int start, int end, const uchar* src_data, size_t } } +} // anonymous + // the algorithm is copied from imgproc/src/histogram.cpp, // in the function void cv::equalizeHist -inline int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) { int hist[HIST_SZ] = {0}; uchar lut[HIST_SZ]; @@ -101,8 +100,7 @@ inline int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data return CV_HAL_ERROR_OK; } -} // cv::cv_hal_rvv::equalize_hist -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/integral.hpp b/hal/riscv-rvv/src/imgproc/integral.cpp similarity index 92% rename from hal/riscv-rvv/hal_rvv_1p0/integral.hpp rename to hal/riscv-rvv/src/imgproc/integral.cpp index a3ea0b5557..e0c7f44995 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/integral.hpp +++ b/hal/riscv-rvv/src/imgproc/integral.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED -#define OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "types.hpp" +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -#undef cv_hal_integral -#define cv_hal_integral cv::cv_hal_rvv::integral +namespace { template inline typename vec_t::VecType repeat_last_n(typename vec_t::VecType vs, int n, size_t vl) { @@ -87,6 +84,8 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz return result; } +} // anonymous + /** @brief Calculate integral image @param depth Depth of source image @@ -119,12 +118,12 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz CV_32F | CV_64F | CV_64F CV_64F | CV_64F | CV_64F */ -inline int integral(int depth, int sdepth, int sqdepth, - const uchar* src_data, size_t src_step, - uchar* sum_data, size_t sum_step, - uchar* sqsum_data, size_t sqsum_step, - uchar* tilted_data, [[maybe_unused]] size_t tilted_step, - int width, int height, int cn) { +int integral(int depth, int sdepth, int sqdepth, + const uchar* src_data, size_t src_step, + uchar* sum_data, size_t sum_step, + uchar* sqsum_data, size_t sqsum_step, + uchar* tilted_data, [[maybe_unused]] size_t tilted_step, + int width, int height, int cn) { // tilted sum and cn == 3 cases are not supported if (tilted_data || cn == 3) { return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -168,6 +167,6 @@ inline int integral(int depth, int sdepth, int sqdepth, return result; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/median_blur.cpp b/hal/riscv-rvv/src/imgproc/median_blur.cpp new file mode 100644 index 0000000000..d86b2d92e3 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/median_blur.cpp @@ -0,0 +1,575 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is copied from imgproc/src/median_blur.simd.cpp +// in the function template static void medianBlur_SortNet +template +static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + using T = typename helper::ElemType; + using VT = typename helper::VecType; + + for (int i = start; i < end; i++) + { + const T* row0 = reinterpret_cast(src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step); + const T* row1 = reinterpret_cast(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step); + const T* row2 = reinterpret_cast(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step); + const T* row3 = reinterpret_cast(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step); + const T* row4 = reinterpret_cast(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step); + int vl; + auto vop = [&vl](VT& a, VT& b) { + auto t = a; + a = helper::vmin(a, b, vl); + b = helper::vmax(t, b, vl); + }; + + for (int j = 0; j < width; j += vl) + { + vl = helper::setvl(width - j); + if (ksize == 3) + { + VT p0, p1, p2; + VT p3, p4, p5; + VT p6, p7, p8; + if (j != 0) + { + p0 = helper::vload(row0 + j - 1, vl); + p3 = helper::vload(row1 + j - 1, vl); + p6 = helper::vload(row2 + j - 1, vl); + } + else + { + p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); + p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); + p6 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); + } + p1 = 
helper::vslide1down(p0, row0[j + vl - 1], vl); + p4 = helper::vslide1down(p3, row1[j + vl - 1], vl); + p7 = helper::vslide1down(p6, row2[j + vl - 1], vl); + p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl); + p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl); + p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl); + + vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); + vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); + vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); + vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); + vop(p4, p2); vop(p6, p4); vop(p4, p2); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p4, vl); + } + else + { + VT p0, p1, p2, p3, p4; + VT p5, p6, p7, p8, p9; + VT p10, p11, p12, p13, p14; + VT p15, p16, p17, p18, p19; + VT p20, p21, p22, p23, p24; + if (j >= 2) + { + p0 = helper::vload(row0 + j - 2, vl); + p5 = helper::vload(row1 + j - 2, vl); + p10 = helper::vload(row2 + j - 2, vl); + p15 = helper::vload(row3 + j - 2, vl); + p20 = helper::vload(row4 + j - 2, vl); + } + else + { + p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); + p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); + p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); + p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl); + p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl); + if (j == 0) + { + p0 = helper::vslide1up(p0, row0[0], vl); + p5 = helper::vslide1up(p5, row1[0], vl); + p10 = helper::vslide1up(p10, row2[0], vl); + p15 = helper::vslide1up(p15, row3[0], vl); + p20 = helper::vslide1up(p20, row4[0], vl); + } + } + p1 = helper::vslide1down(p0, row0[j + vl - 2], vl); + p6 = helper::vslide1down(p5, row1[j + vl - 2], vl); + p11 = helper::vslide1down(p10, row2[j + vl - 2], vl); + p16 = helper::vslide1down(p15, row3[j + vl - 2], vl); + p21 = helper::vslide1down(p20, row4[j + vl - 2], vl); + p2 = helper::vslide1down(p1, row0[j + vl - 1], vl); + p7 = 
helper::vslide1down(p6, row1[j + vl - 1], vl); + p12 = helper::vslide1down(p11, row2[j + vl - 1], vl); + p17 = helper::vslide1down(p16, row3[j + vl - 1], vl); + p22 = helper::vslide1down(p21, row4[j + vl - 1], vl); + p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl); + p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl); + p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl); + p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl); + p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl); + p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl); + p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl); + p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl); + p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl); + p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl); + + vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); + vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); + vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); + vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); + vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); + vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); + vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); + vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); + vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); + vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); + vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); + vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); + vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); + vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18); + vop(p16, p22); vop(p16, p18); 
vop(p14, p20); vop(p20, p24); vop(p14, p16); + vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); + vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); + vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); + vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); + vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); + vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); + vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); + vop(p7, p11); vop(p11, p13); vop(p11, p12); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p12, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + for (int i = start; i < end; i++) + { + const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step; + const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step; + const uchar* row2 = src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step; + const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step; + const uchar* row4 = src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step; + int vl; + for (int j = 0; j < width; j += vl) + { + if (ksize == 3) + { + vl = __riscv_vsetvl_e8m1(width - j); + vuint8m1_t p00, p01, p02; + vuint8m1_t p03, p04, p05; + vuint8m1_t p06, p07, p08; + vuint8m1_t p10, p11, p12; + vuint8m1_t p13, p14, p15; + vuint8m1_t p16, p17, p18; + vuint8m1_t p20, p21, p22; + vuint8m1_t p23, p24, p25; + vuint8m1_t p26, p27, p28; + vuint8m1_t p30, p31, p32; + vuint8m1_t p33, p34, p35; + vuint8m1_t p36, p37, p38; + auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) { + auto src = 
__riscv_vlseg4e8_v_u8m1x4(row, vl); + p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0); + p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1); + p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2); + p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3); + }; + if (j != 0) + { + loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30); + loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33); + loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36); + } + else + { + loadsrc(row0, p00, p10, p20, p30); + loadsrc(row1, p03, p13, p23, p33); + loadsrc(row2, p06, p16, p26, p36); + p00 = __riscv_vslide1up(p00, row0[0], vl); + p10 = __riscv_vslide1up(p10, row0[1], vl); + p20 = __riscv_vslide1up(p20, row0[2], vl); + p30 = __riscv_vslide1up(p30, row0[3], vl); + p03 = __riscv_vslide1up(p03, row1[0], vl); + p13 = __riscv_vslide1up(p13, row1[1], vl); + p23 = __riscv_vslide1up(p23, row1[2], vl); + p33 = __riscv_vslide1up(p33, row1[3], vl); + p06 = __riscv_vslide1up(p06, row2[0], vl); + p16 = __riscv_vslide1up(p16, row2[1], vl); + p26 = __riscv_vslide1up(p26, row2[2], vl); + p36 = __riscv_vslide1up(p36, row2[3], vl); + } + p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl); + p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl); + p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl); + p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl); + p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl); + p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl); + p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl); + p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl); + p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl); + p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl); + p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl); + p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl); + p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl); + p12 = __riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl); + 
p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl); + p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl); + p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl); + p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl); + p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl); + p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl); + p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl); + p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl); + p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], vl); + p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl); + + auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) { + auto t = a; + a = __riscv_vminu(a, b, vl); + b = __riscv_vmaxu(t, b, vl); + }; + vuint8m1x4_t dst{}; + vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01); + vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05); + vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07); + vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07); + vop(p04, p02); vop(p06, p04); vop(p04, p02); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04); + vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11); + vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15); + vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17); + vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17); + vop(p14, p12); vop(p16, p14); vop(p14, p12); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14); + vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21); + vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25); + vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27); + vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27); + vop(p24, p22); vop(p26, p24); vop(p24, p22); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24); + 
vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31); + vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35); + vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37); + vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37); + vop(p34, p32); vop(p36, p34); vop(p34, p32); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + vl = __riscv_vsetvl_e8m2(width - j); + vuint8m2_t p00, p01, p02, p03, p04; + vuint8m2_t p05, p06, p07, p08, p09; + vuint8m2_t p010, p011, p012, p013, p014; + vuint8m2_t p015, p016, p017, p018, p019; + vuint8m2_t p020, p021, p022, p023, p024; + vuint8m2_t p10, p11, p12, p13, p14; + vuint8m2_t p15, p16, p17, p18, p19; + vuint8m2_t p110, p111, p112, p113, p114; + vuint8m2_t p115, p116, p117, p118, p119; + vuint8m2_t p120, p121, p122, p123, p124; + vuint8m2_t p20, p21, p22, p23, p24; + vuint8m2_t p25, p26, p27, p28, p29; + vuint8m2_t p210, p211, p212, p213, p214; + vuint8m2_t p215, p216, p217, p218, p219; + vuint8m2_t p220, p221, p222, p223, p224; + vuint8m2_t p30, p31, p32, p33, p34; + vuint8m2_t p35, p36, p37, p38, p39; + vuint8m2_t p310, p311, p312, p313, p314; + vuint8m2_t p315, p316, p317, p318, p319; + vuint8m2_t p320, p321, p322, p323, p324; + auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) { + auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl); + p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0); + p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1); + p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2); + p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3); + }; + if (j >= 2) + { + loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30); + loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35); + loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310); + loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315); + loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320); + } + else + { + loadsrc(row0, p00, p10, p20, p30); + loadsrc(row1, p05, p15, p25, p35); + 
loadsrc(row2, p010, p110, p210, p310); + loadsrc(row3, p015, p115, p215, p315); + loadsrc(row4, p020, p120, p220, p320); + auto slideup = [&] { + p00 = __riscv_vslide1up(p00, row0[0], vl); + p10 = __riscv_vslide1up(p10, row0[1], vl); + p20 = __riscv_vslide1up(p20, row0[2], vl); + p30 = __riscv_vslide1up(p30, row0[3], vl); + p05 = __riscv_vslide1up(p05, row1[0], vl); + p15 = __riscv_vslide1up(p15, row1[1], vl); + p25 = __riscv_vslide1up(p25, row1[2], vl); + p35 = __riscv_vslide1up(p35, row1[3], vl); + p010 = __riscv_vslide1up(p010, row2[0], vl); + p110 = __riscv_vslide1up(p110, row2[1], vl); + p210 = __riscv_vslide1up(p210, row2[2], vl); + p310 = __riscv_vslide1up(p310, row2[3], vl); + p015 = __riscv_vslide1up(p015, row3[0], vl); + p115 = __riscv_vslide1up(p115, row3[1], vl); + p215 = __riscv_vslide1up(p215, row3[2], vl); + p315 = __riscv_vslide1up(p315, row3[3], vl); + p020 = __riscv_vslide1up(p020, row4[0], vl); + p120 = __riscv_vslide1up(p120, row4[1], vl); + p220 = __riscv_vslide1up(p220, row4[2], vl); + p320 = __riscv_vslide1up(p320, row4[3], vl); + }; + slideup(); + if (j == 0) + { + slideup(); + } + } + p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl); + p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl); + p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl); + p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl); + p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl); + p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl); + p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl); + p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl); + p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl); + p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl); + p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl); + p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl); + p016 = __riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl); + p116 = 
__riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl); + p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl); + p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl); + p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl); + p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl); + p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl); + p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl); + p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl); + p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl); + p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl); + p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl); + p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl); + p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl); + p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl); + p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl); + p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl); + p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl); + p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl); + p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl); + p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl); + p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl); + p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl); + p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl); + p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl); + p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl); + p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl); + p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl); + p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, j + vl) * 4 ], vl); + p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + vl) * 4 + 1], vl); + p23 = __riscv_vslide1down(p22, 
row0[std::min(width - 1, j + vl) * 4 + 2], vl); + p33 = __riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl); + p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl); + p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl); + p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl); + p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl); + p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl); + p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl); + p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl); + p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl); + p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl); + p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl); + p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl); + p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl); + p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl); + p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl); + p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl); + p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl); + p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl); + p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl); + p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p29 = __riscv_vslide1down(p28, row1[std::min(width - 1, 
j + vl + 1) * 4 + 2], vl); + p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl); + p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl); + p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl); + p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + + auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) { + auto t = a; + a = __riscv_vminu(a, b, vl); + b = __riscv_vmaxu(t, b, vl); + }; + vuint8m2x4_t dst{}; + vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04); + vop(p04, p05); vop(p00, p03); vop(p02, p05); vop(p02, p03); vop(p01, p04); + vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08); + vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, p011); + vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06); + vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08); + vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05); + vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08); + vop(p09, p010); vop(p013, p014); vop(p012, p013); vop(p013, p014); 
vop(p016, p017); + vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015); + vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019); + vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024); + vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022); + vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018); + vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016); + vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019); + vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016); + vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012); + vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016); + vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010); + vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017); + vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019); + vop(p07, p011); vop(p011, p013); vop(p011, p012); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012); + vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14); + vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14); + vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18); + vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111); + vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16); + vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18); + vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15); + vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18); + vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117); + vop(p115, p116); vop(p116, p117); vop(p112, 
p115); vop(p114, p117); vop(p114, p115); + vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119); + vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124); + vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122); + vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118); + vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116); + vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119); + vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116); + vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112); + vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116); + vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110); + vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, p117); + vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119); + vop(p17, p111); vop(p111, p113); vop(p111, p112); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, p112); + vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24); + vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24); + vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28); + vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211); + vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26); + vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28); + vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25); + vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28); + vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217); + vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, p217); vop(p214, p215); + vop(p213, p216); 
vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219); + vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224); + vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222); + vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218); + vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216); + vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219); + vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216); + vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212); + vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216); + vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210); + vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217); + vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219); + vop(p27, p211); vop(p211, p213); vop(p211, p212); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212); + vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34); + vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34); + vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38); + vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311); + vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36); + vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38); + vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35); + vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38); + vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317); + vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315); + vop(p313, p316); vop(p313, p314); vop(p315, p316); vop(p319, p320); vop(p318, p319); 
+ vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324); + vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322); + vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318); + vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316); + vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319); + vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316); + vop(p317, p318); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312); + vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316); + vop(p32, p314); vop(p310, p322); vop(p310, p314); vop(p36, p318); vop(p36, p310); + vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317); + vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319); + vop(p37, p311); vop(p311, p313); vop(p311, p312); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize) +{ + const int type = CV_MAKETYPE(depth, cn); + if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((ksize != 3 && ksize != 5) || src_data == dst_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (ksize*100 + type) + { + case 300 + CV_8UC1: + return common::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_16UC1: + return common::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_16SC1: + return common::invoke(height, {medianBlurC1<3, RVV_I16M4>}, 
src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_32FC1: + return common::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_8UC1: + return common::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_16UC1: + return common::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_16SC1: + return common::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_32FC1: + return common::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height); + + case 300 + CV_8UC4: + return common::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_8UC4: + return common::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/moments.hpp b/hal/riscv-rvv/src/imgproc/moments.cpp similarity index 94% rename from hal/riscv-rvv/hal_rvv_1p0/moments.hpp rename to hal/riscv-rvv/src/imgproc/moments.cpp index f0db8b3a17..c29f1edfd0 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/moments.hpp +++ b/hal/riscv-rvv/src/imgproc/moments.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED -#define OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace imageMoments { -#undef cv_hal_imageMoments -#define cv_hal_imageMoments cv::cv_hal_rvv::imageMoments::imageMoments +namespace { class MomentsInvoker : public ParallelLoopBody { @@ -152,9 +149,11 @@ static inline int imageMoments(int start, int end, const uchar* src_data, size_t return CV_HAL_ERROR_OK; } +} // anonymous + // the algorithm is copied from imgproc/src/moments.cpp, // in the function cv::Moments cv::moments -inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10]) +int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10]) { if (src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1 && src_type != CV_64FC1) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -184,8 +183,7 @@ inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, in return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::imageMoments -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/morph.cpp b/hal/riscv-rvv/src/imgproc/morph.cpp new file mode 100644 index 0000000000..e5d79b598b --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/morph.cpp @@ -0,0 +1,331 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct Morph2D +{ + int operation; + int src_type; + int dst_type; + int kernel_type; + uchar* kernel_data; + size_t kernel_step; + int kernel_width; + int kernel_height; + int anchor_x; + int anchor_y; + int borderType; + const uchar* borderValue; +}; + +template struct rvv; +template<> struct rvv +{ + static inline uchar init() { return std::numeric_limits::max(); } + static inline uchar mop(uchar a, uchar b) { return a < b ? a : b; } + static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vminu(a, b, c); } + static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vminu(a, b, c); } +}; +template<> struct rvv +{ + static inline uchar init() { return std::numeric_limits::min(); } + static inline uchar mop(uchar a, uchar b) { return a > b ? a : b; } + static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vmaxu(a, b, c); } + static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vmaxu(a, b, c); } +}; + +// the algorithm is copied from 3rdparty/carotene/src/morph.cpp, +// in the function template void morph3x3 +template +static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + bool kernel[9]; + for (int i = 0; i < 9; i++) + { + kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0; + } + + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x, int y) { + int pi, pj; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? 
noval : pi; + pj = pj < 0 ? noval : pj; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? noval : pj - offset_x; + } + return std::make_pair(pi, pj); + }; + + auto process = [&](int x, int y) { + if (data->src_type == CV_8UC1) + { + uchar val = rvv::init(); + for (int i = 0; i < 9; i++) + { + if (kernel[i]) + { + auto p = access(x + i / 3, y + i % 3); + if (p.first != noval && p.second != noval) + { + val = rvv::mop(val, src_data[p.first * src_step + p.second]); + } + else + { + val = rvv::mop(val, data->borderValue[0]); + } + } + } + dst_data[x * width + y] = val; + } + else + { + uchar val0, val1, val2, val3; + val0 = val1 = val2 = val3 = rvv::init(); + for (int i = 0; i < 9; i++) + { + if (kernel[i]) + { + auto p = access(x + i / 3, y + i % 3); + if (p.first != noval && p.second != noval) + { + val0 = rvv::mop(val0, src_data[p.first * src_step + p.second * 4 ]); + val1 = rvv::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]); + val2 = rvv::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]); + val3 = rvv::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]); + } + else + { + val0 = rvv::mop(val0, data->borderValue[0]); + val1 = rvv::mop(val1, data->borderValue[1]); + val2 = rvv::mop(val2, data->borderValue[2]); + val3 = rvv::mop(val3, data->borderValue[3]); + } + } + } + dst_data[(x * width + y) * 4 ] = val0; + dst_data[(x * width + y) * 4 + 1] = val1; + dst_data[(x * width + y) * 4 + 2] = val2; + dst_data[(x * width + y) * 4 + 3] = val3; + } + }; + + const int left = data->anchor_x, right = width - (2 - data->anchor_x); + for (int i = start; i < end; i++) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; 
j++) + process(i, j); + + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; + if (data->src_type == CV_8UC1) + { + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m4(right - j); + auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { + if (!row) + { + m0 = rvv::vop(m0, data->borderValue[0], vl); + return; + } + + const uchar* extra = row + j - data->anchor_x; + auto v0 = __riscv_vle8_v_u8m4(extra, vl); + + if (k0) m0 = rvv::vop(m0, v0, vl); + v0 = __riscv_vslide1down(v0, extra[vl], vl); + if (k1) m0 = rvv::vop(m0, v0, vl); + if (!k2) return; + v0 = __riscv_vslide1down(v0, extra[vl + 1], vl); + m0 = rvv::vop(m0, v0, vl); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + __riscv_vse8(dst_data + i * width + j, m0, vl); + } + } + else + { + int vl, vl0, vl1; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m4(right - j); + vl0 = std::min(vl, (int)__riscv_vlenb() * 2); + vl1 = vl - vl0; + auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m1 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m2 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m3 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + + auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { + if (k0) a = rvv::vop(a, b, vl); + b = __riscv_vslide1down(b, r1, vl); + if (k1) a = rvv::vop(a, b, vl); + if (!k2) return a; + b = __riscv_vslide1down(b, r2, vl); + return rvv::vop(a, b, vl); + }; + auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { + if (!row) + { + m0 = 
rvv::vop(m0, data->borderValue[0], vl); + m1 = rvv::vop(m1, data->borderValue[1], vl); + m2 = rvv::vop(m2, data->borderValue[2], vl); + m3 = rvv::vop(m3, data->borderValue[3], vl); + return; + } + + vuint8m4_t v0{}, v1{}, v2{}, v3{}; + const uchar* extra = row + (j - data->anchor_x) * 4; + auto src = __riscv_vlseg4e8_v_u8m2x4(extra, vl0); + v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0)); + v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1)); + v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2)); + v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3)); + src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); + v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0)); + v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1)); + v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2)); + v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3)); + + extra += vl * 4; + m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]); + m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]); + m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]); + m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + vuint8m2x4_t val{}; + val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0)); + __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0); + val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 2, 
__riscv_vget_v_u8m4_u8m2(m2, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1)); + __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/) +{ + if (kernel_type != CV_8UC1 || src_type != dst_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_type != CV_8UC1 && src_type != CV_8UC4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != kernel_height || kernel_width != 3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (iterations != 1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + uchar* borderV; + if (src_type == CV_8UC1) + { + borderV = new uchar{static_cast(borderValue[0])}; + if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) + borderV[0] = 0; + } + else + { + borderV = new uchar[4]{static_cast(borderValue[0]), static_cast(borderValue[1]), static_cast(borderValue[2]), static_cast(borderValue[3])}; + if (operation == CV_HAL_MORPH_DILATE) + { + if (borderValue[0] == DBL_MAX) + borderV[0] = 0; + if (borderValue[1] == DBL_MAX) + borderV[1] = 0; + if (borderValue[2] == DBL_MAX) + borderV[2] = 0; + if (borderValue[3] == DBL_MAX) + borderV[3] = 0; + } + } + + anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; + *context = reinterpret_cast(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV}); + return CV_HAL_ERROR_OK; +} + +int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/) +{ + Morph2D* data = reinterpret_cast(context); + int cn = data->src_type == CV_8UC1 ? 1 : 4; + std::vector dst(width * height * cn); + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->operation) + { + case CV_HAL_MORPH_ERODE: + res = common::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); + break; + case CV_HAL_MORPH_DILATE: + res = common::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); + break; + } + + for (int i = 0; i < height; i++) + memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn); + return res; +} + +int morphFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context)->borderValue; + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp b/hal/riscv-rvv/src/imgproc/pyramids.cpp similarity index 97% rename from hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp rename to hal/riscv-rvv/src/imgproc/pyramids.cpp index a349d341c5..66bf4c1b4d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp +++ b/hal/riscv-rvv/src/imgproc/pyramids.cpp @@ -4,18 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED -#define OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { namespace pyramids { +#if CV_HAL_RVV_1P0_ENABLED -#undef cv_hal_pyrdown -#define cv_hal_pyrdown cv::cv_hal_rvv::pyramids::pyrDown -#undef cv_hal_pyrup -#define cv_hal_pyrup cv::cv_hal_rvv::pyramids::pyrUp +namespace { template struct rvv; @@ -562,7 +557,9 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_ return CV_HAL_ERROR_OK; } -inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) +} // anonymous + +int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { if (border_type == BORDER_CONSTANT || (depth == CV_32F && cn == 1)) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -580,7 +577,7 @@ inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int sr return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) +int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { if (border_type != BORDER_DEFAULT) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -598,6 +595,6 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_ return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/resize.hpp 
b/hal/riscv-rvv/src/imgproc/resize.cpp similarity index 99% rename from hal/riscv-rvv/hal_rvv_1p0/resize.hpp rename to hal/riscv-rvv/src/imgproc/resize.cpp index d18db5f058..1ce5e16bb3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/resize.hpp +++ b/hal/riscv-rvv/src/imgproc/resize.cpp @@ -4,17 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED -#define OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" +#include "common.hpp" #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace imgproc { -namespace resize { -#undef cv_hal_resize -#define cv_hal_resize cv::cv_hal_rvv::resize::resize +#if CV_HAL_RVV_1P0_ENABLED + +namespace { class ResizeInvoker : public ParallelLoopBody { @@ -986,7 +984,9 @@ static inline int resizeArea(int src_type, const uchar *src_data, size_t src_ste return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) +} // anonymous + +int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) { inv_scale_x = 1 / inv_scale_x; inv_scale_y = 1 / inv_scale_y; @@ -999,8 +999,7 @@ inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_ return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::resize -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/sep_filter.cpp b/hal/riscv-rvv/src/imgproc/sep_filter.cpp new file mode 100644 index 0000000000..54267683e5 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/sep_filter.cpp @@ -0,0 +1,259 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct sepFilter2D +{ + int src_type; + int dst_type; + int kernel_type; + const uchar* kernelx_data; + int kernelx_length; + const uchar* kernely_data; + int kernely_length; + int anchor_x; + int anchor_y; + double delta; + int borderType; +}; + +// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp, +// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic +template +static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + } + return pi; + }; + auto accessY = [&](int y) { + int pj; + if (data->borderType & BORDER_ISOLATED) + { + pj = common::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); + pj = pj < 0 ? noval : pj; + } + else + { + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); + pj = pj < 0 ? 
noval : pj - offset_x; + } + return pj; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + const float* kx = reinterpret_cast(data->kernelx_data); + const float* ky = reinterpret_cast(data->kernely_data); + std::vector res(width * ksize); + auto process = [&](int x, int y) { + float sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += kx[i] * reinterpret_cast(src_data + x * src_step)[p]; + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); + for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m2(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - data->anchor_x; + vfloat32m8_t src; + if (std::is_same::value) + { + src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast(extra), vl), vl), vl); + } + else if (std::is_same::value) + { + src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast(extra), vl), vl); + } + else + { + src = __riscv_vle32_v_f32m8(reinterpret_cast(extra), vl); + } + + extra += vl; + auto sum = __riscv_vfmul(src, kx[0], vl); + src = __riscv_vfslide1down(src, extra[0], vl); + sum = __riscv_vfmacc(sum, kx[1], src, vl); + src = __riscv_vfslide1down(src, extra[1], vl); + sum = __riscv_vfmacc(sum, kx[2], src, vl); + if (ksize == 5) + { + src = __riscv_vfslide1down(src, extra[2], vl); + sum = __riscv_vfmacc(sum, kx[3], src, vl); + src = __riscv_vfslide1down(src, extra[3], vl); + sum = __riscv_vfmacc(sum, kx[4], src, vl); + } + __riscv_vse32(res.data() + p2idx(i, j), sum, vl); + } + } + 
} + + int cur = i - (ksize - 1 - data->anchor_y); + if (cur >= start) + { + const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const float* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); + + if (ksize == 5) + { + auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v4 = row4 ? 
__riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); + } + + if (data->dst_type == CV_16SC1) + { + __riscv_vse16(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); + } + else + { + __riscv_vse32(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) +{ + if (kernel_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_type != CV_8UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (dst_type != CV_16SC1 && dst_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((kernelx_length != 3 && kernelx_length != 5) || kernelx_length != kernely_length) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernely_length / 2 : anchor_y; + *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); + return CV_HAL_ERROR_OK; +} + +int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + sepFilter2D* data = reinterpret_cast(context); + + uchar* _dst_data = dst_data; + size_t _dst_step = dst_step; + const size_t size = CV_ELEM_SIZE(data->dst_type); + std::vector dst; + if (src_data == _dst_data) + { + dst = std::vector(width * height * size); + dst_data = dst.data(); + dst_step = width * size; + } + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernelx_length*100 + data->src_type) + { + case 300 + CV_8UC1: + res = common::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_8UC1: + res = common::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 300 + CV_16SC1: + res = common::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_16SC1: + res = common::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 300 + CV_32FC1: + res = common::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_32FC1: + res = common::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, 
offset_x, offset_y); + break; + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (src_data == _dst_data) + { + for (int i = 0; i < height; i++) + memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); + } + + return res; +} + +int sepFilterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/thresh.hpp b/hal/riscv-rvv/src/imgproc/threshold.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/thresh.hpp rename to hal/riscv-rvv/src/imgproc/threshold.cpp index 738e3d5012..8d76b5626d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/thresh.hpp +++ b/hal/riscv-rvv/src/imgproc/threshold.cpp @@ -4,18 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_THRESH_HPP_INCLUDED -#define OPENCV_HAL_RVV_THRESH_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" +#include "common.hpp" #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace imgproc { -namespace threshold { -// disabled since UI is fast enough, only called in threshold_otsu -// #undef cv_hal_threshold -// #define cv_hal_threshold cv::cv_hal_rvv::threshold::threshold +#if CV_HAL_RVV_1P0_ENABLED + +namespace { class ThresholdInvoker : public ParallelLoopBody { @@ -182,16 +179,6 @@ static inline int threshold_range(int start, int end, const uchar* src_data, siz return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) -{ - return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType); -} -} // cv::cv_hal_rvv::threshold - -namespace threshold_otsu { -#undef cv_hal_threshold_otsu -#define 
cv_hal_threshold_otsu cv::cv_hal_rvv::threshold_otsu::threshold_otsu - static inline int otsu(int start, int end, const uchar* src_data, size_t src_step, int width, std::atomic* cnt, int N, int* h) { const int c = cnt->fetch_add(1) % cv::getNumThreads(); @@ -205,69 +192,6 @@ static inline int otsu(int start, int end, const uchar* src_data, size_t src_ste return CV_HAL_ERROR_OK; } -// the algorithm is copied from imgproc/src/thresh.cpp, -// in the function template static double getThreshVal_Otsu -inline int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) -{ - if (depth != CV_8UC1 || width * height < (1 << 15)) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - const int N = std::numeric_limits::max() + 1; - const int nums = cv::getNumThreads(); - std::vector _h(N * nums, 0); - int* h = _h.data(); - - std::atomic cnt(0); - cv::parallel_for_(Range(0, height), threshold::ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums); - for (int i = N; i < nums * N; i++) - { - h[i % N] += h[i]; - } - - double mu = 0, scale = 1. / (width*height); - for (int i = 0; i < N; i++) - { - mu += i*(double)h[i]; - } - - mu *= scale; - double mu1 = 0, q1 = 0; - double max_sigma = 0, max_val = 0; - - for (int i = 0; i < N; i++) - { - double p_i, q2, mu2, sigma; - - p_i = h[i]*scale; - mu1 *= q1; - q1 += p_i; - q2 = 1. - q1; - - if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. 
- FLT_EPSILON) - continue; - - mu1 = (mu1 + i*p_i)/q1; - mu2 = (mu - q1*mu1)/q2; - sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2); - if (sigma > max_sigma) - { - max_sigma = sigma; - max_val = i; - } - } - - *thresh = max_val; - if (dst_data == nullptr) - return CV_HAL_ERROR_OK; - - return threshold::invoke(width, height, {threshold::threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType); -} -} // cv::cv_hal_rvv::threshold_otsu - -namespace adaptiveThreshold { -#undef cv_hal_adaptiveThreshold -#define cv_hal_adaptiveThreshold cv::cv_hal_rvv::adaptiveThreshold::adaptiveThreshold - // the algorithm is copied from imgproc/src/thresh.cpp, // in the function void cv::adaptiveThreshold template @@ -444,7 +368,72 @@ static inline int adaptiveThreshold(int start, int end, const uchar* src_data, s return CV_HAL_ERROR_OK; } -inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C) +} // anonymous + +int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) +{ + return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType); +} + +// the algorithm is copied from imgproc/src/thresh.cpp, +// in the function template static double getThreshVal_Otsu +int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) +{ + if (depth != CV_8UC1 || width * height < (1 << 15)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + const int N = std::numeric_limits::max() + 1; + const int nums = cv::getNumThreads(); + std::vector _h(N * nums, 0); + int* h = _h.data(); + + std::atomic cnt(0); + 
cv::parallel_for_(Range(0, height), ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums); + for (int i = N; i < nums * N; i++) + { + h[i % N] += h[i]; + } + + double mu = 0, scale = 1. / (width*height); + for (int i = 0; i < N; i++) + { + mu += i*(double)h[i]; + } + + mu *= scale; + double mu1 = 0, q1 = 0; + double max_sigma = 0, max_val = 0; + + for (int i = 0; i < N; i++) + { + double p_i, q2, mu2, sigma; + + p_i = h[i]*scale; + mu1 *= q1; + q1 += p_i; + q2 = 1. - q1; + + if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON) + continue; + + mu1 = (mu1 + i*p_i)/q1; + mu2 = (mu - q1*mu1)/q2; + sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2); + if (sigma > max_sigma) + { + max_sigma = sigma; + max_val = i; + } + } + + *thresh = max_val; + if (dst_data == nullptr) + return CV_HAL_ERROR_OK; + + return invoke(width, height, {threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType); +} + +int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C) { if (thresholdType != CV_HAL_THRESH_BINARY && thresholdType != CV_HAL_THRESH_BINARY_INV) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -456,27 +445,26 @@ inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_ switch (blockSize*100 + adaptiveMethod*10 + thresholdType) { case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return 
threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, 
{adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::adaptiveThreshold -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} /// cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/warp.hpp b/hal/riscv-rvv/src/imgproc/warp.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/warp.hpp rename to hal/riscv-rvv/src/imgproc/warp.cpp index f207c7cb95..745f27c9ca 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/warp.hpp +++ b/hal/riscv-rvv/src/imgproc/warp.cpp @@ -4,20 +4,14 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED -#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace remap { -#undef cv_hal_remap32f -#define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f -#undef cv_hal_remap32fc2 -#define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2 -#undef cv_hal_remap16s -#define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s +namespace { class RemapInvoker : public ParallelLoopBody { @@ -862,30 +856,6 @@ inline int remap32f(int src_type, const uchar *src_data, size_t src_step, int sr return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, - uchar *dst_data, size_t dst_step, int dst_width, int dst_height, - float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]) -{ - return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value); -} - -inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, - uchar *dst_data, size_t dst_step, int dst_width, int dst_height, - short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, - int interpolation, int border_type, const double border_value[4]) -{ - if (CV_MAKETYPE(src_type, 1) != src_type) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, interpolation, border_type, border_value); -} -} // cv::cv_hal_rvv::remap - -namespace warp { -#undef cv_hal_warpAffine -#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine -#undef cv_hal_warpPerspective -#define 
cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective - template static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) { @@ -1154,9 +1124,36 @@ static inline int warpC4(int start, int end, const uchar *src_data, size_t src_s return CV_HAL_ERROR_OK; } +} // anonymous + +int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value); +} + +int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]) +{ + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value); +} + +int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + if (CV_MAKETYPE(src_type, 1) != src_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, 
interpolation, border_type, border_value); +} + // the algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp, // in the function void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear -inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) +int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1168,11 +1165,11 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int switch (src_type) { case CV_8UC1: - return remap::invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC3: - return remap::invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC4: - return remap::invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, 
borderType, borderValue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1180,7 +1177,7 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int // the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp, // in the function void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear -inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) +int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1192,17 +1189,16 @@ inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, switch (src_type) { case CV_8UC1: - return remap::invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC3: - return remap::invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC4: - return remap::invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, 
borderValue); + return invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::warp -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/version/hal_rvv_071.hpp b/hal/riscv-rvv/version/hal_rvv_071.hpp deleted file mode 100644 index db235d6139..0000000000 --- a/hal/riscv-rvv/version/hal_rvv_071.hpp +++ /dev/null @@ -1,109 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#ifndef OPENCV_HAL_RVV_071_HPP_INCLUDED -#define OPENCV_HAL_RVV_071_HPP_INCLUDED - -#include - -#include - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_cvtBGRtoBGR -#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::cvtBGRtoBGR - -static const unsigned char index_array_32 [32] - { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 }; - -static const unsigned char index_array_24 [24] - { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 }; - -static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize) -{ - vuint8m2_t vec_index = vle8_v_u8m2(index, vsize); - - int i = 0; - - for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize) - { - vuint8m2_t vec_src = vle8_v_u8m2(src, vsize); - vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize); - vse8_v_u8m2(dst, vec_dst, vsize); - } - - for ( ; i < n; i++, src += scn, dst += dcn ) - { - unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; - dst[2] = t0; - dst[1] = t1; - dst[0] = t2; - if(dcn == 4) - { - unsigned char d = src[3]; - dst[3] = d; - } - } -} - -static void 
sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi) -{ - for (int i = 0; i < n; i++, src += scn, dst += dcn) - { - unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; - dst[bi ] = t0; - dst[1] = t1; - dst[bi^2] = t2; - if(dcn == 4) - { - unsigned char d = scn == 4 ? src[3] : std::numeric_limits::max(); - dst[3] = d; - } - } -} - -static int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) -{ - if (depth != CV_8U) - { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } - - const int blueIdx = swapBlue ? 2 : 0; - if (scn == dcn) - { - if (!swapBlue) - { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } - - const int vsize_pixels = 8; - - if (scn == 4) - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - { - vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32); - } - } - else - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - { - vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24); - } - } - } - else - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx); - } - - return CV_HAL_ERROR_OK; -} - -}} - -#endif