Merge pull request #27096 from amane-ame:moments_hal_rvv

Add RISC-V HAL implementation for cv::moments #27096 This patch implements `cv_hal_imageMoments` using native intrinsics, optimizing the performance of `cv::moments` for data types `CV_16U/CV_16S/CV_32F/CV_64F`. Tested on MUSE-PI (Spacemit X60) for both gcc 14.2 and clang 20.0. ``` $ ./opencv_test_imgproc --gtest_filter="*Moments*" $ ./opencv_perf_imgproc --gtest_filter="*Moments*" --perf_min_samples=1000 --perf_force_samples=1000 ``` ![image](https://github.com/user-attachments/assets/0efbae10-c022-4f15-a81c-682514cdb372) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-12-06 12:19:50 +01:00 · 2025-03-20 15:50:06 +08:00 · 2025-03-20 15:50:06 +08:00 · 46fbe1895a
commit 46fbe1895a
parent 67ffb230f1
3 changed files with 224 additions and 7 deletions
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@ -44,6 +44,7 @@
 #include "hal_rvv_1p0/svd.hpp" // core
 #include "hal_rvv_1p0/sqrt.hpp" // core

+#include "hal_rvv_1p0/moments.hpp" // imgproc
 #include "hal_rvv_1p0/filter.hpp" // imgproc
 #include "hal_rvv_1p0/pyramids.hpp" // imgproc
 #include "hal_rvv_1p0/color.hpp" // imgproc
--- a/3rdparty/hal_rvv/hal_rvv_1p0/moments.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/moments.hpp
@ -0,0 +1,191 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
+
+#ifndef OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED
+#define OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+namespace imageMoments {
+#undef cv_hal_imageMoments
+#define cv_hal_imageMoments cv::cv_hal_rvv::imageMoments::imageMoments
+
+class MomentsInvoker : public ParallelLoopBody
+{
+public:
+    template<typename... Args>
+    MomentsInvoker(std::function<int(int, int, Args...)> _func, Args&&... args)
+    {
+        func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...);
+    }
+
+    virtual void operator()(const Range& range) const override
+    {
+        func(range.start, range.end);
+    }
+
+private:
+    std::function<int(int, int)> func;
+};
+
+template<typename... Args>
+static inline int invoke(int width, int height, std::function<int(int, int, Args...)> func, Args&&... args)
+{
+    cv::parallel_for_(Range(1, height), MomentsInvoker(func, std::forward<Args>(args)...), static_cast<double>((width - 1) * height) / (1 << 10));
+    return func(0, 1, std::forward<Args>(args)...);
+}
+
+template<typename helper> struct rvv;
+template<> struct rvv<RVV_U32M2>
+{
+    static inline vuint8mf2_t vid(size_t a) { return __riscv_vid_v_u8mf2(a); }
+    static inline RVV_U32M2::VecType vcvt(vuint8mf2_t a, size_t b) { return __riscv_vzext_vf4(a, b); }
+};
+template<> struct rvv<RVV_U32M4>
+{
+    static inline vuint8m1_t vid(size_t a) { return __riscv_vid_v_u8m1(a); }
+    static inline RVV_U32M4::VecType vcvt(vuint8m1_t a, size_t b) { return __riscv_vzext_vf4(a, b); }
+};
+template<> struct rvv<RVV_I32M2>
+{
+    static inline vuint8mf2_t vid(size_t a) { return __riscv_vid_v_u8mf2(a); }
+    static inline RVV_I32M2::VecType vcvt(vuint8mf2_t a, size_t b) { return RVV_I32M2::reinterpret(__riscv_vzext_vf4(a, b)); }
+};
+template<> struct rvv<RVV_F64M4>
+{
+    static inline vuint8mf2_t vid(size_t a) { return __riscv_vid_v_u8mf2(a); }
+    static inline RVV_F64M4::VecType vcvt(vuint8mf2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf8(a, b), b); }
+};
+
+constexpr int TILE_SIZE = 32;
+
+template<bool binary, typename T, typename helperT, typename helperWT, typename helperMT>
+static inline int imageMoments(int start, int end, const uchar* src_data, size_t src_step, int full_width, int full_height, double* m, std::mutex* mt)
+{
+    double mm[10] = {0};
+    for (int yy = start; yy < end; yy++)
+    {
+        const int y = yy * TILE_SIZE;
+        const int height = std::min(TILE_SIZE, full_height - y);
+        for (int x = 0; x < full_width; x += TILE_SIZE)
+        {
+            const int width = std::min(TILE_SIZE, full_width - x);
+            double mom[10] = {0};
+
+            for (int i = 0; i < height; i++)
+            {
+                auto id = rvv<helperWT>::vid(helperT::setvlmax());
+                auto v0 = helperWT::vmv(0, helperWT::setvlmax());
+                auto v1 = helperWT::vmv(0, helperWT::setvlmax());
+                auto v2 = helperWT::vmv(0, helperWT::setvlmax());
+                auto v3 = helperMT::vmv(0, helperMT::setvlmax());
+
+                int vl;
+                for (int j = 0; j < width; j += vl)
+                {
+                    vl = helperT::setvl(width - j);
+                    typename helperWT::VecType p;
+                    if (binary)
+                    {
+                        auto src = RVV_SameLen<T, helperT>::vload(reinterpret_cast<const T*>(src_data + (i + y) * src_step) + j + x, vl);
+                        p = __riscv_vmerge(helperWT::vmv(0, vl), helperWT::vmv(255, vl), RVV_SameLen<T, helperT>::vmne(src, 0, vl), vl);
+                    }
+                    else
+                    {
+                        p = helperWT::cast(helperT::vload(reinterpret_cast<const typename helperT::ElemType*>(src_data + (i + y) * src_step) + j + x, vl), vl);
+                    }
+                    auto xx = rvv<helperWT>::vcvt(id, vl);
+                    auto xp = helperWT::vmul(xx, p, vl);
+                    v0 = helperWT::vadd_tu(v0, v0, p, vl);
+                    v1 = helperWT::vadd_tu(v1, v1, xp, vl);
+                    auto xxp = helperWT::vmul(xx, xp, vl);
+                    v2 = helperWT::vadd_tu(v2, v2, xxp, vl);
+                    v3 = helperMT::vadd_tu(v3, v3, helperMT::vmul(helperMT::cast(xx, vl), helperMT::cast(xxp, vl), vl), vl);
+                    id = __riscv_vadd(id, vl, vl);
+                }
+
+                auto x0 = RVV_BaseType<helperWT>::vmv_x(helperWT::vredsum(v0, RVV_BaseType<helperWT>::vmv_s(0, RVV_BaseType<helperWT>::setvlmax()), helperWT::setvlmax()));
+                auto x1 = RVV_BaseType<helperWT>::vmv_x(helperWT::vredsum(v1, RVV_BaseType<helperWT>::vmv_s(0, RVV_BaseType<helperWT>::setvlmax()), helperWT::setvlmax()));
+                auto x2 = RVV_BaseType<helperWT>::vmv_x(helperWT::vredsum(v2, RVV_BaseType<helperWT>::vmv_s(0, RVV_BaseType<helperWT>::setvlmax()), helperWT::setvlmax()));
+                auto x3 = RVV_BaseType<helperMT>::vmv_x(helperMT::vredsum(v3, RVV_BaseType<helperMT>::vmv_s(0, RVV_BaseType<helperMT>::setvlmax()), helperMT::setvlmax()));
+                typename helperWT::ElemType py = i * x0, sy = i*i;
+
+                mom[9] += static_cast<typename helperMT::ElemType>(py) * sy;
+                mom[8] += static_cast<typename helperMT::ElemType>(x1) * sy;
+                mom[7] += static_cast<typename helperMT::ElemType>(x2) * i;
+                mom[6] += x3;
+                mom[5] += x0 * sy;
+                mom[4] += x1 * i;
+                mom[3] += x2;
+                mom[2] += py;
+                mom[1] += x1;
+                mom[0] += x0;
+            }
+
+            if (binary)
+            {
+                mom[0] /= 255, mom[1] /= 255, mom[2] /= 255, mom[3] /= 255, mom[4] /= 255;
+                mom[5] /= 255, mom[6] /= 255, mom[7] /= 255, mom[8] /= 255, mom[9] /= 255;
+            }
+            double xm = x * mom[0], ym = y * mom[0];
+            mm[0] += mom[0];
+            mm[1] += mom[1] + xm;
+            mm[2] += mom[2] + ym;
+            mm[3] += mom[3] + x * (mom[1] * 2 + xm);
+            mm[4] += mom[4] + x * (mom[2] + ym) + y * mom[1];
+            mm[5] += mom[5] + y * (mom[2] * 2 + ym);
+            mm[6] += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+            mm[7] += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+            mm[8] += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+            mm[9] += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        }
+    }
+
+    std::lock_guard<std::mutex> lk(*mt);
+    for (int i = 0; i < 10; i++)
+        m[i] += mm[i];
+    return CV_HAL_ERROR_OK;
+}
+
+// the algorithm is copied from imgproc/src/moments.cpp,
+// in the function cv::Moments cv::moments
+inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10])
+{
+    if (src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1 && src_type != CV_64FC1)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    std::fill(m, m + 10, 0);
+    const int cnt = (height + TILE_SIZE - 1) / TILE_SIZE;
+    std::mutex mt;
+    switch (static_cast<int>(binary)*100 + src_type)
+    {
+    case CV_16UC1:
+        return invoke(width, cnt, {imageMoments<false, ushort, RVV_U16M1, RVV_U32M2, RVV_U64M4>}, src_data, src_step, width, height, m, &mt);
+    case CV_16SC1:
+        return invoke(width, cnt, {imageMoments<false, short, RVV_I16M1, RVV_I32M2, RVV_I64M4>}, src_data, src_step, width, height, m, &mt);
+    case CV_32FC1:
+        return invoke(width, cnt, {imageMoments<false, float, RVV_F32M2, RVV_F64M4, RVV_F64M4>}, src_data, src_step, width, height, m, &mt);
+    case CV_64FC1:
+        return invoke(width, cnt, {imageMoments<false, double, RVV_F64M4, RVV_F64M4, RVV_F64M4>}, src_data, src_step, width, height, m, &mt);
+    case 100 + CV_16UC1:
+        return invoke(width, cnt, {imageMoments<true, ushort, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
+    case 100 + CV_16SC1:
+        return invoke(width, cnt, {imageMoments<true, short, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
+    case 100 + CV_32FC1:
+        return invoke(width, cnt, {imageMoments<true, float, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
+    case 100 + CV_64FC1:
+        return invoke(width, cnt, {imageMoments<true, double, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+} // cv::cv_hal_rvv::imageMoments
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
@ -94,7 +94,7 @@ using RVV_F64M8 = struct RVV<double, LMUL_8>;
 // Only for dst type lmul >= 1
 template <typename Dst_T, typename RVV_T>
 using RVV_SameLen =
-    RVV<Dst_T, RVV_LMUL(RVV_T::lmul / sizeof(typename RVV_T::ElemType) * sizeof(Dst_T))>;
+    RVV<Dst_T, RVV_LMUL(RVV_T::lmul * sizeof(Dst_T) / sizeof(typename RVV_T::ElemType))>;

 template <size_t DstSize> struct RVV_ToIntHelper;
 template <size_t DstSize> struct RVV_ToUintHelper;
@ -117,7 +117,7 @@ using RVV_BaseType = RVV<typename RVV_T::ElemType, LMUL_1>;

 // -------------------------------Supported operations--------------------------------

-#define HAL_RVV_SIZE_RELATED(EEW, TYPE, LMUL, S_OR_F, X_OR_F, IS_U, IS_F)                            \
+#define HAL_RVV_SIZE_RELATED(EEW, TYPE, LMUL, S_OR_F, X_OR_F, IS_U, IS_F, IS_O)                      \
 static inline size_t setvlmax() { return __riscv_vsetvlmax_e##EEW##LMUL(); }                         \
 static inline size_t setvl(size_t vl) { return __riscv_vsetvl_e##EEW##LMUL(vl); }                    \
 static inline VecType vload(const ElemType* ptr, size_t vl) {                                        \
@ -153,7 +153,7 @@ static inline VecType vmv_s(ElemType a, size_t vl) {
 }                                                                                                    \
 HAL_RVV_SIZE_RELATED_CUSTOM(EEW, TYPE, LMUL)

-#define HAL_RVV_SIZE_UNRELATED(S_OR_F, X_OR_F, IS_U, IS_F)                                      \
+#define HAL_RVV_SIZE_UNRELATED(S_OR_F, X_OR_F, IS_U, IS_F, IS_O)                                \
 static inline ElemType vmv_x(VecType vs2) { return __riscv_v##IS_F##mv_##X_OR_F(vs2); }         \
                                                                                                \
 static inline BoolType vmlt(VecType vs2, VecType vs1, size_t vl) {                              \
@ -174,6 +174,12 @@ static inline BoolType vmgt(VecType vs2, ElemType vs1, size_t vl) {
 static inline BoolType vmge(VecType vs2, VecType vs1, size_t vl) {                              \
    return __riscv_vm##S_OR_F##ge##IS_U(vs2, vs1, vl);                                          \
 }                                                                                               \
+static inline BoolType vmeq(VecType vs2, ElemType vs1, size_t vl) {                             \
+    return __riscv_vm##S_OR_F##eq(vs2, vs1, vl);                                                \
+}                                                                                               \
+static inline BoolType vmne(VecType vs2, ElemType vs1, size_t vl) {                             \
+    return __riscv_vm##S_OR_F##ne(vs2, vs1, vl);                                                \
+}                                                                                               \
 static inline BoolType vmlt_mu(BoolType vm, BoolType vd, VecType vs2, VecType vs1, size_t vl) { \
    return __riscv_vm##S_OR_F##lt##IS_U##_mu(vm, vd, vs2, vs1, vl);                             \
 }                                                                                               \
@ -187,6 +193,22 @@ static inline BoolType vmge_mu(BoolType vm, BoolType vd, VecType vs2, VecType vs
    return __riscv_vm##S_OR_F##ge##IS_U##_mu(vm, vd, vs2, vs1, vl);                             \
 }                                                                                               \
                                                                                                \
+static inline VecType vadd(VecType vs2, VecType vs1, size_t vl) {                               \
+    return __riscv_v##IS_F##add(vs2, vs1, vl);                                                  \
+}                                                                                               \
+static inline VecType vsub(VecType vs2, VecType vs1, size_t vl) {                               \
+    return __riscv_v##IS_F##sub(vs2, vs1, vl);                                                  \
+}                                                                                               \
+static inline VecType vadd_tu(VecType vd, VecType vs2, VecType vs1, size_t vl) {                \
+    return __riscv_v##IS_F##add_tu(vd, vs2, vs1, vl);                                           \
+}                                                                                               \
+static inline VecType vsub_tu(VecType vd, VecType vs2, VecType vs1, size_t vl) {                \
+    return __riscv_v##IS_F##sub_tu(vd, vs2, vs1, vl);                                           \
+}                                                                                               \
+static inline VecType vmul(VecType vs2, VecType vs1, size_t vl) {                               \
+    return __riscv_v##IS_F##mul(vs2, vs1, vl);                                                  \
+}                                                                                               \
+                                                                                                \
 static inline VecType vmin(VecType vs2, VecType vs1, size_t vl) {                               \
    return __riscv_v##IS_F##min##IS_U(vs2, vs1, vl);                                            \
 }                                                                                               \
@ -211,9 +233,12 @@ static inline BaseType vredmin(VecType vs2, BaseType vs1, size_t vl) {
 }                                                                                               \
 static inline BaseType vredmax(VecType vs2, BaseType vs1, size_t vl) {                          \
    return __riscv_v##IS_F##redmax##IS_U(vs2, vs1, vl);                                         \
+}                                                                                               \
+static inline BaseType vredsum(VecType vs2, BaseType vs1, size_t vl) {                          \
+    return __riscv_v##IS_F##red##IS_O##sum(vs2, vs1, vl);                                       \
 }

-#define HAL_RVV_BOOL_TYPE(S_OR_F, X_OR_F, IS_U, IS_F) \
+#define HAL_RVV_BOOL_TYPE(S_OR_F, X_OR_F, IS_U, IS_F, IS_O) \
    decltype(__riscv_vm##S_OR_F##eq(std::declval<VecType>(), std::declval<VecType>(), 0))

 #define HAL_RVV_DEFINE_ONE(ELEM_TYPE, VEC_TYPE, LMUL_TYPE, \
@ -259,9 +284,9 @@ static inline BaseType vredmax(VecType vs2, BaseType vs1, size_t vl) {
    HAL_RVV_DEFINE_ONE(ELEM_TYPE, VEC_TYPE, LMUL_8, \
                       EEW, TYPE, m8, __VA_ARGS__)

-#define HAL_RVV_SIGNED_PARAM   s,x, ,
-#define HAL_RVV_UNSIGNED_PARAM s,x,u,
-#define HAL_RVV_FLOAT_PARAM    f,f, ,f
+#define HAL_RVV_SIGNED_PARAM   s,x, , ,
+#define HAL_RVV_UNSIGNED_PARAM s,x,u, ,
+#define HAL_RVV_FLOAT_PARAM    f,f, ,f,o

 // -------------------------------Define Unsigned Integer--------------------------------