Merge pull request #27096 from amane-ame:moments_hal_rvv

Add RISC-V HAL implementation for cv::moments #27096

This patch implements `cv_hal_imageMoments` using native intrinsics, optimizing the performance of `cv::moments` for data types `CV_16U/CV_16S/CV_32F/CV_64F`.

Tested on MUSE-PI (Spacemit X60) for both gcc 14.2 and clang 20.0.

```
$ ./opencv_test_imgproc --gtest_filter="*Moments*"
$ ./opencv_perf_imgproc --gtest_filter="*Moments*" --perf_min_samples=1000 --perf_force_samples=1000
```

![image](https://github.com/user-attachments/assets/0efbae10-c022-4f15-a81c-682514cdb372)

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
天音あめ 2025-03-20 15:50:06 +08:00 committed by GitHub
parent 67ffb230f1
commit 46fbe1895a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 224 additions and 7 deletions

View File

@ -44,6 +44,7 @@
#include "hal_rvv_1p0/svd.hpp" // core
#include "hal_rvv_1p0/sqrt.hpp" // core
#include "hal_rvv_1p0/moments.hpp" // imgproc
#include "hal_rvv_1p0/filter.hpp" // imgproc
#include "hal_rvv_1p0/pyramids.hpp" // imgproc
#include "hal_rvv_1p0/color.hpp" // imgproc

191
3rdparty/hal_rvv/hal_rvv_1p0/moments.hpp vendored Normal file
View File

@ -0,0 +1,191 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED
#define OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv {
namespace imageMoments {
#undef cv_hal_imageMoments
#define cv_hal_imageMoments cv::cv_hal_rvv::imageMoments::imageMoments
class MomentsInvoker : public ParallelLoopBody
{
public:
template<typename... Args>
MomentsInvoker(std::function<int(int, int, Args...)> _func, Args&&... args)
{
func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...);
}
virtual void operator()(const Range& range) const override
{
func(range.start, range.end);
}
private:
std::function<int(int, int)> func;
};
template<typename... Args>
static inline int invoke(int width, int height, std::function<int(int, int, Args...)> func, Args&&... args)
{
cv::parallel_for_(Range(1, height), MomentsInvoker(func, std::forward<Args>(args)...), static_cast<double>((width - 1) * height) / (1 << 10));
return func(0, 1, std::forward<Args>(args)...);
}
template<typename helper> struct rvv;
template<> struct rvv<RVV_U32M2>
{
static inline vuint8mf2_t vid(size_t a) { return __riscv_vid_v_u8mf2(a); }
static inline RVV_U32M2::VecType vcvt(vuint8mf2_t a, size_t b) { return __riscv_vzext_vf4(a, b); }
};
template<> struct rvv<RVV_U32M4>
{
static inline vuint8m1_t vid(size_t a) { return __riscv_vid_v_u8m1(a); }
static inline RVV_U32M4::VecType vcvt(vuint8m1_t a, size_t b) { return __riscv_vzext_vf4(a, b); }
};
template<> struct rvv<RVV_I32M2>
{
static inline vuint8mf2_t vid(size_t a) { return __riscv_vid_v_u8mf2(a); }
static inline RVV_I32M2::VecType vcvt(vuint8mf2_t a, size_t b) { return RVV_I32M2::reinterpret(__riscv_vzext_vf4(a, b)); }
};
template<> struct rvv<RVV_F64M4>
{
static inline vuint8mf2_t vid(size_t a) { return __riscv_vid_v_u8mf2(a); }
static inline RVV_F64M4::VecType vcvt(vuint8mf2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf8(a, b), b); }
};
constexpr int TILE_SIZE = 32;
template<bool binary, typename T, typename helperT, typename helperWT, typename helperMT>
static inline int imageMoments(int start, int end, const uchar* src_data, size_t src_step, int full_width, int full_height, double* m, std::mutex* mt)
{
double mm[10] = {0};
for (int yy = start; yy < end; yy++)
{
const int y = yy * TILE_SIZE;
const int height = std::min(TILE_SIZE, full_height - y);
for (int x = 0; x < full_width; x += TILE_SIZE)
{
const int width = std::min(TILE_SIZE, full_width - x);
double mom[10] = {0};
for (int i = 0; i < height; i++)
{
auto id = rvv<helperWT>::vid(helperT::setvlmax());
auto v0 = helperWT::vmv(0, helperWT::setvlmax());
auto v1 = helperWT::vmv(0, helperWT::setvlmax());
auto v2 = helperWT::vmv(0, helperWT::setvlmax());
auto v3 = helperMT::vmv(0, helperMT::setvlmax());
int vl;
for (int j = 0; j < width; j += vl)
{
vl = helperT::setvl(width - j);
typename helperWT::VecType p;
if (binary)
{
auto src = RVV_SameLen<T, helperT>::vload(reinterpret_cast<const T*>(src_data + (i + y) * src_step) + j + x, vl);
p = __riscv_vmerge(helperWT::vmv(0, vl), helperWT::vmv(255, vl), RVV_SameLen<T, helperT>::vmne(src, 0, vl), vl);
}
else
{
p = helperWT::cast(helperT::vload(reinterpret_cast<const typename helperT::ElemType*>(src_data + (i + y) * src_step) + j + x, vl), vl);
}
auto xx = rvv<helperWT>::vcvt(id, vl);
auto xp = helperWT::vmul(xx, p, vl);
v0 = helperWT::vadd_tu(v0, v0, p, vl);
v1 = helperWT::vadd_tu(v1, v1, xp, vl);
auto xxp = helperWT::vmul(xx, xp, vl);
v2 = helperWT::vadd_tu(v2, v2, xxp, vl);
v3 = helperMT::vadd_tu(v3, v3, helperMT::vmul(helperMT::cast(xx, vl), helperMT::cast(xxp, vl), vl), vl);
id = __riscv_vadd(id, vl, vl);
}
auto x0 = RVV_BaseType<helperWT>::vmv_x(helperWT::vredsum(v0, RVV_BaseType<helperWT>::vmv_s(0, RVV_BaseType<helperWT>::setvlmax()), helperWT::setvlmax()));
auto x1 = RVV_BaseType<helperWT>::vmv_x(helperWT::vredsum(v1, RVV_BaseType<helperWT>::vmv_s(0, RVV_BaseType<helperWT>::setvlmax()), helperWT::setvlmax()));
auto x2 = RVV_BaseType<helperWT>::vmv_x(helperWT::vredsum(v2, RVV_BaseType<helperWT>::vmv_s(0, RVV_BaseType<helperWT>::setvlmax()), helperWT::setvlmax()));
auto x3 = RVV_BaseType<helperMT>::vmv_x(helperMT::vredsum(v3, RVV_BaseType<helperMT>::vmv_s(0, RVV_BaseType<helperMT>::setvlmax()), helperMT::setvlmax()));
typename helperWT::ElemType py = i * x0, sy = i*i;
mom[9] += static_cast<typename helperMT::ElemType>(py) * sy;
mom[8] += static_cast<typename helperMT::ElemType>(x1) * sy;
mom[7] += static_cast<typename helperMT::ElemType>(x2) * i;
mom[6] += x3;
mom[5] += x0 * sy;
mom[4] += x1 * i;
mom[3] += x2;
mom[2] += py;
mom[1] += x1;
mom[0] += x0;
}
if (binary)
{
mom[0] /= 255, mom[1] /= 255, mom[2] /= 255, mom[3] /= 255, mom[4] /= 255;
mom[5] /= 255, mom[6] /= 255, mom[7] /= 255, mom[8] /= 255, mom[9] /= 255;
}
double xm = x * mom[0], ym = y * mom[0];
mm[0] += mom[0];
mm[1] += mom[1] + xm;
mm[2] += mom[2] + ym;
mm[3] += mom[3] + x * (mom[1] * 2 + xm);
mm[4] += mom[4] + x * (mom[2] + ym) + y * mom[1];
mm[5] += mom[5] + y * (mom[2] * 2 + ym);
mm[6] += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
mm[7] += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
mm[8] += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
mm[9] += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
std::lock_guard<std::mutex> lk(*mt);
for (int i = 0; i < 10; i++)
m[i] += mm[i];
return CV_HAL_ERROR_OK;
}
// the algorithm is copied from imgproc/src/moments.cpp,
// in the function cv::Moments cv::moments
inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10])
{
if (src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1 && src_type != CV_64FC1)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
std::fill(m, m + 10, 0);
const int cnt = (height + TILE_SIZE - 1) / TILE_SIZE;
std::mutex mt;
switch (static_cast<int>(binary)*100 + src_type)
{
case CV_16UC1:
return invoke(width, cnt, {imageMoments<false, ushort, RVV_U16M1, RVV_U32M2, RVV_U64M4>}, src_data, src_step, width, height, m, &mt);
case CV_16SC1:
return invoke(width, cnt, {imageMoments<false, short, RVV_I16M1, RVV_I32M2, RVV_I64M4>}, src_data, src_step, width, height, m, &mt);
case CV_32FC1:
return invoke(width, cnt, {imageMoments<false, float, RVV_F32M2, RVV_F64M4, RVV_F64M4>}, src_data, src_step, width, height, m, &mt);
case CV_64FC1:
return invoke(width, cnt, {imageMoments<false, double, RVV_F64M4, RVV_F64M4, RVV_F64M4>}, src_data, src_step, width, height, m, &mt);
case 100 + CV_16UC1:
return invoke(width, cnt, {imageMoments<true, ushort, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
case 100 + CV_16SC1:
return invoke(width, cnt, {imageMoments<true, short, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
case 100 + CV_32FC1:
return invoke(width, cnt, {imageMoments<true, float, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
case 100 + CV_64FC1:
return invoke(width, cnt, {imageMoments<true, double, RVV_U8M1, RVV_U32M4, RVV_U32M4>}, src_data, src_step, width, height, m, &mt);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::imageMoments
}}
#endif

View File

@ -94,7 +94,7 @@ using RVV_F64M8 = struct RVV<double, LMUL_8>;
// Only for dst type lmul >= 1
template <typename Dst_T, typename RVV_T>
using RVV_SameLen =
RVV<Dst_T, RVV_LMUL(RVV_T::lmul / sizeof(typename RVV_T::ElemType) * sizeof(Dst_T))>;
RVV<Dst_T, RVV_LMUL(RVV_T::lmul * sizeof(Dst_T) / sizeof(typename RVV_T::ElemType))>;
template <size_t DstSize> struct RVV_ToIntHelper;
template <size_t DstSize> struct RVV_ToUintHelper;
@ -117,7 +117,7 @@ using RVV_BaseType = RVV<typename RVV_T::ElemType, LMUL_1>;
// -------------------------------Supported operations--------------------------------
#define HAL_RVV_SIZE_RELATED(EEW, TYPE, LMUL, S_OR_F, X_OR_F, IS_U, IS_F) \
#define HAL_RVV_SIZE_RELATED(EEW, TYPE, LMUL, S_OR_F, X_OR_F, IS_U, IS_F, IS_O) \
static inline size_t setvlmax() { return __riscv_vsetvlmax_e##EEW##LMUL(); } \
static inline size_t setvl(size_t vl) { return __riscv_vsetvl_e##EEW##LMUL(vl); } \
static inline VecType vload(const ElemType* ptr, size_t vl) { \
@ -153,7 +153,7 @@ static inline VecType vmv_s(ElemType a, size_t vl) {
} \
HAL_RVV_SIZE_RELATED_CUSTOM(EEW, TYPE, LMUL)
#define HAL_RVV_SIZE_UNRELATED(S_OR_F, X_OR_F, IS_U, IS_F) \
#define HAL_RVV_SIZE_UNRELATED(S_OR_F, X_OR_F, IS_U, IS_F, IS_O) \
static inline ElemType vmv_x(VecType vs2) { return __riscv_v##IS_F##mv_##X_OR_F(vs2); } \
\
static inline BoolType vmlt(VecType vs2, VecType vs1, size_t vl) { \
@ -174,6 +174,12 @@ static inline BoolType vmgt(VecType vs2, ElemType vs1, size_t vl) {
static inline BoolType vmge(VecType vs2, VecType vs1, size_t vl) { \
return __riscv_vm##S_OR_F##ge##IS_U(vs2, vs1, vl); \
} \
static inline BoolType vmeq(VecType vs2, ElemType vs1, size_t vl) { \
return __riscv_vm##S_OR_F##eq(vs2, vs1, vl); \
} \
static inline BoolType vmne(VecType vs2, ElemType vs1, size_t vl) { \
return __riscv_vm##S_OR_F##ne(vs2, vs1, vl); \
} \
static inline BoolType vmlt_mu(BoolType vm, BoolType vd, VecType vs2, VecType vs1, size_t vl) { \
return __riscv_vm##S_OR_F##lt##IS_U##_mu(vm, vd, vs2, vs1, vl); \
} \
@ -187,6 +193,22 @@ static inline BoolType vmge_mu(BoolType vm, BoolType vd, VecType vs2, VecType vs
return __riscv_vm##S_OR_F##ge##IS_U##_mu(vm, vd, vs2, vs1, vl); \
} \
\
static inline VecType vadd(VecType vs2, VecType vs1, size_t vl) { \
return __riscv_v##IS_F##add(vs2, vs1, vl); \
} \
static inline VecType vsub(VecType vs2, VecType vs1, size_t vl) { \
return __riscv_v##IS_F##sub(vs2, vs1, vl); \
} \
static inline VecType vadd_tu(VecType vd, VecType vs2, VecType vs1, size_t vl) { \
return __riscv_v##IS_F##add_tu(vd, vs2, vs1, vl); \
} \
static inline VecType vsub_tu(VecType vd, VecType vs2, VecType vs1, size_t vl) { \
return __riscv_v##IS_F##sub_tu(vd, vs2, vs1, vl); \
} \
static inline VecType vmul(VecType vs2, VecType vs1, size_t vl) { \
return __riscv_v##IS_F##mul(vs2, vs1, vl); \
} \
\
static inline VecType vmin(VecType vs2, VecType vs1, size_t vl) { \
return __riscv_v##IS_F##min##IS_U(vs2, vs1, vl); \
} \
@ -211,9 +233,12 @@ static inline BaseType vredmin(VecType vs2, BaseType vs1, size_t vl) {
} \
static inline BaseType vredmax(VecType vs2, BaseType vs1, size_t vl) { \
return __riscv_v##IS_F##redmax##IS_U(vs2, vs1, vl); \
} \
static inline BaseType vredsum(VecType vs2, BaseType vs1, size_t vl) { \
return __riscv_v##IS_F##red##IS_O##sum(vs2, vs1, vl); \
}
#define HAL_RVV_BOOL_TYPE(S_OR_F, X_OR_F, IS_U, IS_F) \
#define HAL_RVV_BOOL_TYPE(S_OR_F, X_OR_F, IS_U, IS_F, IS_O) \
decltype(__riscv_vm##S_OR_F##eq(std::declval<VecType>(), std::declval<VecType>(), 0))
#define HAL_RVV_DEFINE_ONE(ELEM_TYPE, VEC_TYPE, LMUL_TYPE, \
@ -259,9 +284,9 @@ static inline BaseType vredmax(VecType vs2, BaseType vs1, size_t vl) {
HAL_RVV_DEFINE_ONE(ELEM_TYPE, VEC_TYPE, LMUL_8, \
EEW, TYPE, m8, __VA_ARGS__)
#define HAL_RVV_SIGNED_PARAM s,x, ,
#define HAL_RVV_UNSIGNED_PARAM s,x,u,
#define HAL_RVV_FLOAT_PARAM f,f, ,f
#define HAL_RVV_SIGNED_PARAM s,x, , ,
#define HAL_RVV_UNSIGNED_PARAM s,x,u, ,
#define HAL_RVV_FLOAT_PARAM f,f, ,f,o
// -------------------------------Define Unsigned Integer--------------------------------