opencv/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp
天音あめ fa58c1205b
Merge pull request #27119 from amane-ame:warp_hal_rvv
Add RISC-V HAL implementation for cv::warp series #27119

This patch implements `cv_hal_remap`, `cv_hal_warpAffine` and `cv_hal_warpPerspective` using native RVV intrinsics, improving the performance of `cv::remap`, `cv::warpAffine` and `cv::warpPerspective` in the `CV_HAL_INTER_NEAREST`, `CV_HAL_INTER_LINEAR`, `CV_HAL_INTER_CUBIC` and `CV_HAL_INTER_LANCZOS4` modes.
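
For context, here is a minimal usage sketch (not part of the patch) of calls that should route through these HAL entry points when OpenCV is built with the RISC-V HAL enabled; the `cv::` functions below are the existing public API:

```cpp
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src(480, 640, CV_8UC1, cv::Scalar(128)), dst;

    // Affine warp: should dispatch to the cv_hal_warpAffine hook for 8-bit images with NN/linear.
    cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(320.f, 240.f), 30.0, 1.0);
    cv::warpAffine(src, dst, M, src.size(), cv::INTER_LINEAR);

    // Remap with float maps: should dispatch to the cv_hal_remap32f hook.
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for (int y = 0; y < src.rows; y++)
        for (int x = 0; x < src.cols; x++)
        {
            mapx.at<float>(y, x) = static_cast<float>(src.cols - 1 - x); // horizontal flip
            mapy.at<float>(y, x) = static_cast<float>(y);
        }
    cv::remap(src, dst, mapx, mapy, cv::INTER_LANCZOS4);
    return 0;
}
```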

Tested on MUSE-PI (Spacemit X60) with both GCC 14.2 and Clang 20.0.

```
$ ./opencv_test_imgproc --gtest_filter="*Remap*:*Warp*"
$ ./opencv_perf_imgproc --gtest_filter="*Remap*:*remap*:*Warp*" --perf_min_samples=200 --perf_force_samples=200
```

View the full perf table here: [hal_rvv_warp.pdf](https://github.com/user-attachments/files/19403718/hal_rvv_warp.pdf)

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED
#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv {
namespace remap {
#undef cv_hal_remap32f
#define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f
#undef cv_hal_remap32fc2
#define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2
#undef cv_hal_remap16s
#define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s
class RemapInvoker : public ParallelLoopBody
{
public:
template<typename... Args>
RemapInvoker(std::function<int(int, int, Args...)> _func, Args&&... args)
{
func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...);
}
virtual void operator()(const Range& range) const override
{
func(range.start, range.end);
}
private:
std::function<int(int, int)> func;
};
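// Runs func over all destination rows: rows [1, height) are dispatched through
// cv::parallel_for_, while row 0 is processed on the calling thread so that its
// status code can be returned to the caller.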
template<typename... Args>
static inline int invoke(int width, int height, std::function<int(int, int, Args...)> func, Args&&... args)
{
cv::parallel_for_(Range(1, height), RemapInvoker(func, std::forward<Args>(args)...), static_cast<double>((width - 1) * height) / (1 << 15));
return func(0, 1, std::forward<Args>(args)...);
}
template<typename T> struct rvv;
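// Per-type vector helpers: vcvt0 widens the element vector to f32 for arithmetic,
// vcvt1 narrows (with saturation/rounding where needed) back to the element type,
// and vloxei is an indexed gather load taking byte offsets.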
// NN & LINEAR
template<> struct rvv<RVV_U8M2>
{
static inline vfloat32m8_t vcvt0(vuint8m2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); }
static inline vuint8m2_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); }
static inline vuint8m2_t vloxei(const uchar* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_u8m2(a, b, c); }
};
template<> struct rvv<RVV_U16M4>
{
static inline vfloat32m8_t vcvt0(vuint16m4_t a, size_t b) { return __riscv_vfwcvt_f(a, b); }
static inline vuint16m4_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vfncvt_xu(a, b); }
static inline vuint16m4_t vloxei(const ushort* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_u16m4(a, b, c); }
};
template<> struct rvv<RVV_I16M4>
{
static inline vfloat32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vfwcvt_f(a, b); }
static inline vint16m4_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vfncvt_x(a, b); }
static inline vint16m4_t vloxei(const short* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_i16m4(a, b, c); }
};
template<> struct rvv<RVV_F32M8>
{
static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; }
static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; }
static inline vfloat32m8_t vloxei(const float* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_f32m8(a, b, c); }
};
// CUBIC
template<> struct rvv<RVV_U8MF4>
{
static inline vfloat32m1_t vcvt0(vuint8mf4_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); }
static inline vuint8mf4_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); }
static inline vuint8mf4_t vloxei(const uchar* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_u8mf4(a, b, c); }
};
template<> struct rvv<RVV_U16MF2>
{
static inline vfloat32m1_t vcvt0(vuint16mf2_t a, size_t b) { return __riscv_vfwcvt_f(a, b); }
static inline vuint16mf2_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vfncvt_xu(a, b); }
static inline vuint16mf2_t vloxei(const ushort* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_u16mf2(a, b, c); }
};
template<> struct rvv<RVV_I16MF2>
{
static inline vfloat32m1_t vcvt0(vint16mf2_t a, size_t b) { return __riscv_vfwcvt_f(a, b); }
static inline vint16mf2_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vfncvt_x(a, b); }
static inline vint16mf2_t vloxei(const short* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_i16mf2(a, b, c); }
};
template<> struct rvv<RVV_F32M1>
{
static inline vfloat32m1_t vcvt0(vfloat32m1_t a, size_t) { return a; }
static inline vfloat32m1_t vcvt1(vfloat32m1_t a, size_t) { return a; }
static inline vfloat32m1_t vloxei(const float* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_f32m1(a, b, c); }
};
// LANCZOS4
template<> struct rvv<RVV_U8MF2>
{
static inline vfloat32m2_t vcvt0(vuint8mf2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); }
static inline vuint8mf2_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); }
static inline vuint8mf2_t vloxei(const uchar* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_u8mf2(a, b, c); }
};
template<> struct rvv<RVV_U16M1>
{
static inline vfloat32m2_t vcvt0(vuint16m1_t a, size_t b) { return __riscv_vfwcvt_f(a, b); }
static inline vuint16m1_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vfncvt_xu(a, b); }
static inline vuint16m1_t vloxei(const ushort* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_u16m1(a, b, c); }
};
template<> struct rvv<RVV_I16M1>
{
static inline vfloat32m2_t vcvt0(vint16m1_t a, size_t b) { return __riscv_vfwcvt_f(a, b); }
static inline vint16m1_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vfncvt_x(a, b); }
static inline vint16m1_t vloxei(const short* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_i16m1(a, b, c); }
};
template<> struct rvv<RVV_F32M2>
{
static inline vfloat32m2_t vcvt0(vfloat32m2_t a, size_t) { return a; }
static inline vfloat32m2_t vcvt1(vfloat32m2_t a, size_t) { return a; }
static inline vfloat32m2_t vloxei(const float* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_f32m2(a, b, c); }
};
template<typename helper>
static inline int remap32fC1(int start, int end, bool s16, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width,
const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
int interpolation, int border_type, const double* border_value)
{
using T = typename helper::ElemType;
const int mode = interpolation & ~CV_HAL_WARP_RELATIVE_MAP;
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = helper::setvl(dst_width - j);
typename RVV_SameLen<float, helper>::VecType mx, my;
if (s16)
{
auto map = __riscv_vlseg2e16_v_i16m4x2(reinterpret_cast<const short*>(mapx) + i * mapx_step + j * 2, vl);
mx = __riscv_vfwcvt_f(__riscv_vget_v_i16m4x2_i16m4(map, 0), vl);
my = __riscv_vfwcvt_f(__riscv_vget_v_i16m4x2_i16m4(map, 1), vl);
}
else
{
if (mapy == nullptr)
{
mx = RVV_SameLen<float, helper>::vload_stride(mapx + i * mapx_step + j * 2 , sizeof(float) * 2, vl);
my = RVV_SameLen<float, helper>::vload_stride(mapx + i * mapx_step + j * 2 + 1, sizeof(float) * 2, vl);
}
else
{
mx = RVV_SameLen<float, helper>::vload(mapx + i * mapx_step + j, vl);
my = RVV_SameLen<float, helper>::vload(mapy + i * mapy_step + j, vl);
}
}
if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
{
mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen<uint, helper>::vid(vl), j, vl), vl), vl);
my = __riscv_vfadd(my, i, vl);
}
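// Gather the source pixels at (ix, iy): coordinates are clamped to the image for
// CV_HAL_BORDER_REPLICATE; for CV_HAL_BORDER_CONSTANT, lanes whose coordinates
// were clamped are replaced with the border value instead.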
auto access = [&](typename RVV_SameLen<int, helper>::VecType ix, typename RVV_SameLen<int, helper>::VecType iy) {
auto ux = RVV_SameLen<uint, helper>::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = RVV_SameLen<uint, helper>::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = rvv<helper>::vloxei(reinterpret_cast<const T*>(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl);
if (border_type == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen<int, helper>::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen<int, helper>::reinterpret(uy), vl), vl);
src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl);
}
return src;
};
if (mode == CV_HAL_INTER_NEAREST)
{
auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl);
helper::vstore(reinterpret_cast<T*>(dst_data + i * dst_step) + j, access(ix, iy), vl);
}
else if (mode == CV_HAL_INTER_LINEAR)
{
typename RVV_SameLen<int, helper>::VecType ix0, iy0;
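// With s16 (CV_16SC2 + CV_16UC1) maps, mapx carries the integer coordinates and
// mapy the packed fraction index: bits 0-4 are the x fraction and bits 5-9 the y
// fraction, in 1/32 units. With float maps the fractions are recovered from the
// coordinates themselves using the same 5-bit quantization.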
if (s16)
{
ix0 = __riscv_vfcvt_x(mx, vl);
iy0 = __riscv_vfcvt_x(my, vl);
auto md = __riscv_vle16_v_u16m4(reinterpret_cast<const ushort*>(mapy) + i * mapy_step + j, vl);
mx = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(md, 31, vl), vl), 32, vl);
my = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl), vl), 32, vl);
}
else
{
auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
ix0 = __riscv_vsra(imx, 5, vl);
iy0 = __riscv_vsra(imy, 5, vl);
mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl);
my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl);
}
auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
auto v0 = rvv<helper>::vcvt0(access(ix0, iy0), vl);
auto v1 = rvv<helper>::vcvt0(access(ix1, iy0), vl);
auto v2 = rvv<helper>::vcvt0(access(ix0, iy1), vl);
auto v3 = rvv<helper>::vcvt0(access(ix1, iy1), vl);
v0 = __riscv_vfmacc(v0, mx, __riscv_vfsub(v1, v0, vl), vl);
v2 = __riscv_vfmacc(v2, mx, __riscv_vfsub(v3, v2, vl), vl);
v0 = __riscv_vfmacc(v0, my, __riscv_vfsub(v2, v0, vl), vl);
helper::vstore(reinterpret_cast<T*>(dst_data + i * dst_step) + j, rvv<helper>::vcvt1(v0, vl), vl);
}
else
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}
}
return CV_HAL_ERROR_OK;
}
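// Precomputed Lanczos4 weights: 32 quantized fractional positions x 8 taps each.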
class RemapTable
{
private:
RemapTable()
{
// the algorithm is copied from imgproc/src/imgwarp.cpp,
// in the function static void interpolateLanczos4
constexpr double s45 = 0.70710678118654752440084436210485;
constexpr double cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
for (int t = 0; t < 32; t++)
{
float x = t / 32.0f;
if (x < FLT_EPSILON)
{
for (int i = 0; i < 8; i++)
coeffs[t*8+i] = 0;
coeffs[t*8+3] = 1;
continue;
}
float sum = 0;
double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0= std::cos(y0);
for (int i = 0; i < 8; i++)
{
double y = -(x+3-i)*CV_PI*0.25;
coeffs[t*8+i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
sum += coeffs[t*8+i];
}
sum = 1.f/sum;
for (int i = 0; i < 8; i++)
coeffs[t*8+i] *= sum;
}
}
public:
float coeffs[32 * 8];
static RemapTable& instance()
{
static RemapTable tab;
return tab;
}
};
template<typename helper>
static inline int remap32fCubic(int start, int end, bool s16, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width,
const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
int interpolation, int border_type, const double* border_value)
{
using T = typename helper::ElemType;
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = helper::setvl(dst_width - j);
typename RVV_SameLen<float, helper>::VecType mx, my;
if (s16)
{
auto map = __riscv_vlseg2e16_v_i16mf2x2(reinterpret_cast<const short*>(mapx) + i * mapx_step + j * 2, vl);
mx = __riscv_vfwcvt_f(__riscv_vget_v_i16mf2x2_i16mf2(map, 0), vl);
my = __riscv_vfwcvt_f(__riscv_vget_v_i16mf2x2_i16mf2(map, 1), vl);
}
else
{
if (mapy == nullptr)
{
auto map = __riscv_vlseg2e32_v_f32m1x2(mapx + i * mapx_step + j * 2, vl);
mx = __riscv_vget_v_f32m1x2_f32m1(map, 0);
my = __riscv_vget_v_f32m1x2_f32m1(map, 1);
}
else
{
mx = RVV_SameLen<float, helper>::vload(mapx + i * mapx_step + j, vl);
my = RVV_SameLen<float, helper>::vload(mapy + i * mapy_step + j, vl);
}
}
if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
{
mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen<uint, helper>::vid(vl), j, vl), vl), vl);
my = __riscv_vfadd(my, i, vl);
}
auto access = [&](typename RVV_SameLen<int, helper>::VecType ix, typename RVV_SameLen<int, helper>::VecType iy) {
auto ux = RVV_SameLen<uint, helper>::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = RVV_SameLen<uint, helper>::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = rvv<helper>::vloxei(reinterpret_cast<const T*>(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl);
if (border_type == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen<int, helper>::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen<int, helper>::reinterpret(uy), vl), vl);
src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl);
}
return src;
};
typename RVV_SameLen<int, helper>::VecType ix1, iy1;
if (s16)
{
ix1 = __riscv_vfcvt_x(mx, vl);
iy1 = __riscv_vfcvt_x(my, vl);
auto md = __riscv_vle16_v_u16mf2(reinterpret_cast<const ushort*>(mapy) + i * mapy_step + j, vl);
mx = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(md, 31, vl), vl), 32, vl);
my = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl), vl), 32, vl);
}
else
{
auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
ix1 = __riscv_vsra(imx, 5, vl);
iy1 = __riscv_vsra(imy, 5, vl);
mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl);
my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl);
}
auto ix0 = __riscv_vsub(ix1, 1, vl), iy0 = __riscv_vsub(iy1, 1, vl);
auto ix2 = __riscv_vadd(ix1, 1, vl), iy2 = __riscv_vadd(iy1, 1, vl);
auto ix3 = __riscv_vadd(ix1, 2, vl), iy3 = __riscv_vadd(iy1, 2, vl);
// the algorithm is copied from imgproc/src/imgwarp.cpp,
// in the function static void interpolateCubic
typename RVV_SameLen<float, helper>::VecType c0, c1, c2, c3;
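// Compute the four bicubic taps c0..c3 for the fractional offset x (A = -0.75),
// matching interpolateCubic; c3 is chosen so that the four weights sum to 1.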
auto intertab = [&](typename RVV_SameLen<float, helper>::VecType x) {
constexpr float A = -0.75f;
x = __riscv_vfadd(x, 1, vl);
c0 = __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(x, A, RVV_SameLen<float, helper>::vmv(-5 * A, vl), vl), x, RVV_SameLen<float, helper>::vmv(8 * A, vl), vl), x, RVV_SameLen<float, helper>::vmv(-4 * A, vl), vl);
x = __riscv_vfsub(x, 1, vl);
c1 = __riscv_vfmadd(__riscv_vfmul(__riscv_vfmadd(x, A + 2, RVV_SameLen<float, helper>::vmv(-(A + 3), vl), vl), x, vl), x, RVV_SameLen<float, helper>::vmv(1, vl), vl);
x = __riscv_vfrsub(x, 1, vl);
c2 = __riscv_vfmadd(__riscv_vfmul(__riscv_vfmadd(x, A + 2, RVV_SameLen<float, helper>::vmv(-(A + 3), vl), vl), x, vl), x, RVV_SameLen<float, helper>::vmv(1, vl), vl);
c3 = __riscv_vfsub(__riscv_vfsub(__riscv_vfrsub(c0, 1, vl), c1, vl), c2, vl);
};
intertab(mx);
auto v0 = rvv<helper>::vcvt0(access(ix0, iy0), vl);
auto v1 = rvv<helper>::vcvt0(access(ix1, iy0), vl);
auto v2 = rvv<helper>::vcvt0(access(ix2, iy0), vl);
auto v3 = rvv<helper>::vcvt0(access(ix3, iy0), vl);
auto k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy1), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy1), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy1), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy1), vl);
auto k1 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy2), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy2), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy2), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy2), vl);
auto k2 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy3), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy3), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy3), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy3), vl);
auto k3 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl);
intertab(my);
k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(k0, c0, vl), k1, c1, vl), k2, c2, vl), k3, c3, vl);
helper::vstore(reinterpret_cast<T*>(dst_data + i * dst_step) + j, rvv<helper>::vcvt1(k0, vl), vl);
}
}
return CV_HAL_ERROR_OK;
}
template<typename helper, bool s16>
static inline int remap32fLanczos4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width,
const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
int interpolation, int border_type, const double* border_value)
{
using T = typename helper::ElemType;
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = helper::setvl(dst_width - j);
typename RVV_SameLen<float, helper>::VecType mx, my;
if (s16)
{
auto map = __riscv_vlseg2e16_v_i16m1x2(reinterpret_cast<const short*>(mapx) + i * mapx_step + j * 2, vl);
mx = __riscv_vfwcvt_f(__riscv_vget_v_i16m1x2_i16m1(map, 0), vl);
my = __riscv_vfwcvt_f(__riscv_vget_v_i16m1x2_i16m1(map, 1), vl);
}
else
{
if (mapy == nullptr)
{
auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl);
mx = __riscv_vget_v_f32m2x2_f32m2(map, 0);
my = __riscv_vget_v_f32m2x2_f32m2(map, 1);
}
else
{
mx = RVV_SameLen<float, helper>::vload(mapx + i * mapx_step + j, vl);
my = RVV_SameLen<float, helper>::vload(mapy + i * mapy_step + j, vl);
}
}
if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
{
mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen<uint, helper>::vid(vl), j, vl), vl), vl);
my = __riscv_vfadd(my, i, vl);
}
auto access = [&](typename RVV_SameLen<int, helper>::VecType ix, typename RVV_SameLen<int, helper>::VecType iy) {
auto ux = RVV_SameLen<uint, helper>::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = RVV_SameLen<uint, helper>::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = rvv<helper>::vloxei(reinterpret_cast<const T*>(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl);
if (border_type == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen<int, helper>::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen<int, helper>::reinterpret(uy), vl), vl);
src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl);
}
return src;
};
typename RVV_SameLen<int, helper>::VecType ix3, iy3;
typename RVV_SameLen<ushort, helper>::VecType imx, imy;
if (s16)
{
ix3 = __riscv_vfcvt_x(mx, vl);
iy3 = __riscv_vfcvt_x(my, vl);
auto md = __riscv_vle16_v_u16m1(reinterpret_cast<const ushort*>(mapy) + i * mapy_step + j, vl);
imx = __riscv_vand(md, 31, vl);
imy = __riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl);
}
else
{
auto dmx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
auto dmy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
ix3 = __riscv_vsra(dmx, 5, vl);
iy3 = __riscv_vsra(dmy, 5, vl);
imx = __riscv_vncvt_x(__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(dmx, 31, vl)), vl);
imy = __riscv_vncvt_x(__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(dmy, 31, vl)), vl);
}
auto ix0 = __riscv_vsub(ix3, 3, vl), iy0 = __riscv_vsub(iy3, 3, vl);
auto ix1 = __riscv_vsub(ix3, 2, vl), iy1 = __riscv_vsub(iy3, 2, vl);
auto ix2 = __riscv_vsub(ix3, 1, vl), iy2 = __riscv_vsub(iy3, 1, vl);
auto ix4 = __riscv_vadd(ix3, 1, vl), iy4 = __riscv_vadd(iy3, 1, vl);
auto ix5 = __riscv_vadd(ix3, 2, vl), iy5 = __riscv_vadd(iy3, 2, vl);
auto ix6 = __riscv_vadd(ix3, 3, vl), iy6 = __riscv_vadd(iy3, 3, vl);
auto ix7 = __riscv_vadd(ix3, 4, vl), iy7 = __riscv_vadd(iy3, 4, vl);
typename RVV_SameLen<float, helper>::VecType c0, c1, c2, c3, c4, c5, c6, c7;
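// Fetch the eight Lanczos4 taps for the 5-bit fraction from RemapTable: the
// fraction is turned into the byte offset of an 8-float table row, and two
// segmented gathers (vloxseg4ei16) load four taps each.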
auto intertab = [&](typename RVV_SameLen<ushort, helper>::VecType x) {
x = __riscv_vmul(x, sizeof(float) * 8, vl);
auto val = __riscv_vloxseg4ei16_v_f32m2x4(RemapTable::instance().coeffs, x, vl);
c0 = __riscv_vget_v_f32m2x4_f32m2(val, 0);
c1 = __riscv_vget_v_f32m2x4_f32m2(val, 1);
c2 = __riscv_vget_v_f32m2x4_f32m2(val, 2);
c3 = __riscv_vget_v_f32m2x4_f32m2(val, 3);
val = __riscv_vloxseg4ei16_v_f32m2x4(RemapTable::instance().coeffs, __riscv_vadd(x, sizeof(float) * 4, vl), vl);
c4 = __riscv_vget_v_f32m2x4_f32m2(val, 0);
c5 = __riscv_vget_v_f32m2x4_f32m2(val, 1);
c6 = __riscv_vget_v_f32m2x4_f32m2(val, 2);
c7 = __riscv_vget_v_f32m2x4_f32m2(val, 3);
};
intertab(imx);
auto v0 = rvv<helper>::vcvt0(access(ix0, iy0), vl);
auto v1 = rvv<helper>::vcvt0(access(ix1, iy0), vl);
auto v2 = rvv<helper>::vcvt0(access(ix2, iy0), vl);
auto v3 = rvv<helper>::vcvt0(access(ix3, iy0), vl);
auto v4 = rvv<helper>::vcvt0(access(ix4, iy0), vl);
auto v5 = rvv<helper>::vcvt0(access(ix5, iy0), vl);
auto v6 = rvv<helper>::vcvt0(access(ix6, iy0), vl);
auto v7 = rvv<helper>::vcvt0(access(ix7, iy0), vl);
auto k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy1), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy1), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy1), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy1), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy1), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy1), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy1), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy1), vl);
auto k1 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy2), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy2), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy2), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy2), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy2), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy2), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy2), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy2), vl);
auto k2 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy3), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy3), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy3), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy3), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy3), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy3), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy3), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy3), vl);
auto k3 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy4), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy4), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy4), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy4), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy4), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy4), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy4), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy4), vl);
auto k4 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy5), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy5), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy5), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy5), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy5), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy5), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy5), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy5), vl);
auto k5 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy6), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy6), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy6), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy6), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy6), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy6), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy6), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy6), vl);
auto k6 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
v0 = rvv<helper>::vcvt0(access(ix0, iy7), vl);
v1 = rvv<helper>::vcvt0(access(ix1, iy7), vl);
v2 = rvv<helper>::vcvt0(access(ix2, iy7), vl);
v3 = rvv<helper>::vcvt0(access(ix3, iy7), vl);
v4 = rvv<helper>::vcvt0(access(ix4, iy7), vl);
v5 = rvv<helper>::vcvt0(access(ix5, iy7), vl);
v6 = rvv<helper>::vcvt0(access(ix6, iy7), vl);
v7 = rvv<helper>::vcvt0(access(ix7, iy7), vl);
auto k7 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl);
intertab(imy);
k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(k0, c0, vl), k1, c1, vl), k2, c2, vl), k3, c3, vl), k4, c4, vl), k5, c5, vl), k6, c6, vl), k7, c7, vl);
helper::vstore(reinterpret_cast<T*>(dst_data + i * dst_step) + j, rvv<helper>::vcvt1(k0, vl), vl);
}
}
return CV_HAL_ERROR_OK;
}
static inline int remap32fC3(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width,
const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
int interpolation, int border_type, const double* border_value)
{
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = __riscv_vsetvl_e8mf2(dst_width - j);
vfloat32m2_t mx, my;
if (mapy == nullptr)
{
auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl);
mx = __riscv_vget_v_f32m2x2_f32m2(map, 0);
my = __riscv_vget_v_f32m2x2_f32m2(map, 1);
}
else
{
mx = __riscv_vle32_v_f32m2(mapx + i * mapx_step + j, vl);
my = __riscv_vle32_v_f32m2(mapy + i * mapy_step + j, vl);
}
if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
{
mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl), vl);
my = __riscv_vfadd(my, i, vl);
}
auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2) {
auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = __riscv_vloxseg3ei32_v_u8mf2x3(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 3, vl), vl), vl);
src0 = __riscv_vget_v_u8mf2x3_u8mf2(src, 0);
src1 = __riscv_vget_v_u8mf2x3_u8mf2(src, 1);
src2 = __riscv_vget_v_u8mf2x3_u8mf2(src, 2);
if (border_type == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl);
src0 = __riscv_vmerge(src0, border_value[0], mask, vl);
src1 = __riscv_vmerge(src1, border_value[1], mask, vl);
src2 = __riscv_vmerge(src2, border_value[2], mask, vl);
}
};
if ((interpolation & ~CV_HAL_WARP_RELATIVE_MAP) == CV_HAL_INTER_NEAREST)
{
auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl);
vuint8mf2_t src0, src1, src2;
access(ix, iy, src0, src1, src2);
vuint8mf2x3_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, src0);
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, src1);
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, src2);
__riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl);
}
else
{
auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
auto ix0 = __riscv_vsra(imx, 5, vl);
auto iy0 = __riscv_vsra(imy, 5, vl);
auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl);
my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl);
vfloat32m2_t v00, v10, v20;
vfloat32m2_t v01, v11, v21;
vfloat32m2_t v02, v12, v22;
vfloat32m2_t v03, v13, v23;
vuint8mf2_t src0, src1, src2;
access(ix0, iy0, src0, src1, src2);
v00 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v10 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v20 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
access(ix1, iy0, src0, src1, src2);
v01 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v11 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v21 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
access(ix0, iy1, src0, src1, src2);
v02 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v12 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v22 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
access(ix1, iy1, src0, src1, src2);
v03 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v13 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v23 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
v00 = __riscv_vfmacc(v00, mx, __riscv_vfsub(v01, v00, vl), vl);
v02 = __riscv_vfmacc(v02, mx, __riscv_vfsub(v03, v02, vl), vl);
v00 = __riscv_vfmacc(v00, my, __riscv_vfsub(v02, v00, vl), vl);
v10 = __riscv_vfmacc(v10, mx, __riscv_vfsub(v11, v10, vl), vl);
v12 = __riscv_vfmacc(v12, mx, __riscv_vfsub(v13, v12, vl), vl);
v10 = __riscv_vfmacc(v10, my, __riscv_vfsub(v12, v10, vl), vl);
v20 = __riscv_vfmacc(v20, mx, __riscv_vfsub(v21, v20, vl), vl);
v22 = __riscv_vfmacc(v22, mx, __riscv_vfsub(v23, v22, vl), vl);
v20 = __riscv_vfmacc(v20, my, __riscv_vfsub(v22, v20, vl), vl);
vuint8mf2x3_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, __riscv_vnclipu(__riscv_vfncvt_xu(v00, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, __riscv_vnclipu(__riscv_vfncvt_xu(v10, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, __riscv_vnclipu(__riscv_vfncvt_xu(v20, vl), 0, __RISCV_VXRM_RNU, vl));
__riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
static inline int remap32fC4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width,
const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
int interpolation, int border_type, const double* border_value)
{
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = __riscv_vsetvl_e8mf2(dst_width - j);
vfloat32m2_t mx, my;
if (mapy == nullptr)
{
auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl);
mx = __riscv_vget_v_f32m2x2_f32m2(map, 0);
my = __riscv_vget_v_f32m2x2_f32m2(map, 1);
}
else
{
mx = __riscv_vle32_v_f32m2(mapx + i * mapx_step + j, vl);
my = __riscv_vle32_v_f32m2(mapy + i * mapy_step + j, vl);
}
if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
{
mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl), vl);
my = __riscv_vfadd(my, i, vl);
}
auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2, vuint8mf2_t& src3) {
auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = __riscv_vloxseg4ei32_v_u8mf2x4(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 4, vl), vl), vl);
src0 = __riscv_vget_v_u8mf2x4_u8mf2(src, 0);
src1 = __riscv_vget_v_u8mf2x4_u8mf2(src, 1);
src2 = __riscv_vget_v_u8mf2x4_u8mf2(src, 2);
src3 = __riscv_vget_v_u8mf2x4_u8mf2(src, 3);
if (border_type == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl);
src0 = __riscv_vmerge(src0, border_value[0], mask, vl);
src1 = __riscv_vmerge(src1, border_value[1], mask, vl);
src2 = __riscv_vmerge(src2, border_value[2], mask, vl);
src3 = __riscv_vmerge(src3, border_value[3], mask, vl);
}
};
if ((interpolation & ~CV_HAL_WARP_RELATIVE_MAP) == CV_HAL_INTER_NEAREST)
{
auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl);
vuint8mf2_t src0, src1, src2, src3;
access(ix, iy, src0, src1, src2, src3);
vuint8mf2x4_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, src0);
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, src1);
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, src2);
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, src3);
__riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
}
else
{
auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
auto ix0 = __riscv_vsra(imx, 5, vl);
auto iy0 = __riscv_vsra(imy, 5, vl);
auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl);
my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl);
vfloat32m2_t v00, v10, v20, v30;
vfloat32m2_t v01, v11, v21, v31;
vfloat32m2_t v02, v12, v22, v32;
vfloat32m2_t v03, v13, v23, v33;
vuint8mf2_t src0, src1, src2, src3;
access(ix0, iy0, src0, src1, src2, src3);
v00 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v10 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v20 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
v30 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl);
access(ix1, iy0, src0, src1, src2, src3);
v01 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v11 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v21 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
v31 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl);
access(ix0, iy1, src0, src1, src2, src3);
v02 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v12 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v22 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
v32 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl);
access(ix1, iy1, src0, src1, src2, src3);
v03 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl);
v13 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl);
v23 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl);
v33 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl);
v00 = __riscv_vfmacc(v00, mx, __riscv_vfsub(v01, v00, vl), vl);
v02 = __riscv_vfmacc(v02, mx, __riscv_vfsub(v03, v02, vl), vl);
v00 = __riscv_vfmacc(v00, my, __riscv_vfsub(v02, v00, vl), vl);
v10 = __riscv_vfmacc(v10, mx, __riscv_vfsub(v11, v10, vl), vl);
v12 = __riscv_vfmacc(v12, mx, __riscv_vfsub(v13, v12, vl), vl);
v10 = __riscv_vfmacc(v10, my, __riscv_vfsub(v12, v10, vl), vl);
v20 = __riscv_vfmacc(v20, mx, __riscv_vfsub(v21, v20, vl), vl);
v22 = __riscv_vfmacc(v22, mx, __riscv_vfsub(v23, v22, vl), vl);
v20 = __riscv_vfmacc(v20, my, __riscv_vfsub(v22, v20, vl), vl);
v30 = __riscv_vfmacc(v30, mx, __riscv_vfsub(v31, v30, vl), vl);
v32 = __riscv_vfmacc(v32, mx, __riscv_vfsub(v33, v32, vl), vl);
v30 = __riscv_vfmacc(v30, my, __riscv_vfsub(v32, v30, vl), vl);
vuint8mf2x4_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, __riscv_vnclipu(__riscv_vfncvt_xu(v00, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, __riscv_vnclipu(__riscv_vfncvt_xu(v10, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, __riscv_vnclipu(__riscv_vfncvt_xu(v20, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, __riscv_vnclipu(__riscv_vfncvt_xu(v30, vl), 0, __RISCV_VXRM_RNU, vl));
__riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
// the algorithm is copied from 3rdparty/carotene/src/remap.cpp,
// in the functions void CAROTENE_NS::remapNearestNeighbor and void CAROTENE_NS::remapLinear
template<bool s16 = false>
inline int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
int interpolation, int border_type, const double border_value[4])
{
if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4 && src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (border_type != CV_HAL_BORDER_CONSTANT && border_type != CV_HAL_BORDER_REPLICATE)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
const int mode = interpolation & ~CV_HAL_WARP_RELATIVE_MAP;
if (mode != CV_HAL_INTER_NEAREST && mode != CV_HAL_INTER_LINEAR && mode != CV_HAL_INTER_CUBIC && mode != CV_HAL_INTER_LANCZOS4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if ((mode == CV_HAL_INTER_CUBIC || mode == CV_HAL_INTER_LANCZOS4) && CV_MAKETYPE(src_type, 1) != src_type)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
mapx_step /= s16 ? sizeof(short) : sizeof(float);
mapy_step /= s16 ? sizeof(ushort) : sizeof(float);
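// Dispatch: 3- and 4-channel 8-bit images use the dedicated NN/linear kernels below;
// single-channel images are dispatched on interpolation mode and element type.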
switch (src_type)
{
case CV_8UC3:
return invoke(dst_width, dst_height, {remap32fC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_8UC4:
return invoke(dst_width, dst_height, {remap32fC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
}
switch (mode*100 + src_type)
{
case CV_HAL_INTER_NEAREST*100 + CV_8UC1:
case CV_HAL_INTER_LINEAR*100 + CV_8UC1:
return invoke(dst_width, dst_height, {remap32fC1<RVV_U8M2>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_NEAREST*100 + CV_16UC1:
case CV_HAL_INTER_LINEAR*100 + CV_16UC1:
return invoke(dst_width, dst_height, {remap32fC1<RVV_U16M4>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_NEAREST*100 + CV_16SC1:
case CV_HAL_INTER_LINEAR*100 + CV_16SC1:
return invoke(dst_width, dst_height, {remap32fC1<RVV_I16M4>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_NEAREST*100 + CV_32FC1:
case CV_HAL_INTER_LINEAR*100 + CV_32FC1:
return invoke(dst_width, dst_height, {remap32fC1<RVV_F32M8>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_CUBIC*100 + CV_8UC1:
return invoke(dst_width, dst_height, {remap32fCubic<RVV_U8MF4>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_CUBIC*100 + CV_16UC1:
return invoke(dst_width, dst_height, {remap32fCubic<RVV_U16MF2>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_CUBIC*100 + CV_16SC1:
return invoke(dst_width, dst_height, {remap32fCubic<RVV_I16MF2>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_CUBIC*100 + CV_32FC1:
return invoke(dst_width, dst_height, {remap32fCubic<RVV_F32M1>}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
// Lanczos4 is disabled for clang builds since the register allocation strategy is buggy in clang 20.0;
// remove this #ifndef in the future if possible
#ifndef __clang__
case CV_HAL_INTER_LANCZOS4*100 + CV_8UC1:
return invoke(dst_width, dst_height, {remap32fLanczos4<RVV_U8MF2, s16>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
// disabled since the universal-intrinsics (UI) implementation is already fast enough
// case CV_HAL_INTER_LANCZOS4*100 + CV_16UC1:
// return invoke(dst_width, dst_height, {remap32fLanczos4<RVV_U16M1, s16>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_LANCZOS4*100 + CV_16SC1:
return invoke(dst_width, dst_height, {remap32fLanczos4<RVV_I16M1, s16>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
case CV_HAL_INTER_LANCZOS4*100 + CV_32FC1:
return invoke(dst_width, dst_height, {remap32fLanczos4<RVV_F32M2, s16>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
#endif
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
inline int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
float* map, size_t map_step, int interpolation, int border_type, const double border_value[4])
{
return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value);
}
inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step,
int interpolation, int border_type, const double border_value[4])
{
if (CV_MAKETYPE(src_type, 1) != src_type)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return remap32f<true>(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast<float*>(mapx), mapx_step, reinterpret_cast<float*>(mapy), mapy_step, interpolation, border_type, border_value);
}
} // cv::cv_hal_rvv::remap
namespace warp {
#undef cv_hal_warpAffine
#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine
#undef cv_hal_warpPerspective
#define cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective
template<bool perspective>
static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue)
{
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = __riscv_vsetvl_e8m1(dst_width - j);
auto access = [&](vint32m4_t ix, vint32m4_t iy) {
auto ux = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = __riscv_vloxei32_v_u8m1(src_data, __riscv_vmadd(uy, src_step, ux, vl), vl);
if (borderType == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m4_i32m4(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m4_i32m4(uy), vl), vl);
src = __riscv_vmerge(src, borderValue[0], mask, vl);
}
return src;
};
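// Source coordinates for lanes j..j+vl-1 of row i: mx = M[0]*x + M[1]*y + M[2] and
// my = M[3]*x + M[4]*y + M[5]; for perspective warps both are scaled by
// 1 / (M[6]*x + M[7]*y + M[8]).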
auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m4(vl), j, vl), vl);
auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[1], __riscv_vfmv_v_f_f32m4(M[2], vl), vl), vl);
auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[4], __riscv_vfmv_v_f_f32m4(M[5], vl), vl), vl);
if (perspective)
{
auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[7], __riscv_vfmv_v_f_f32m4(M[8], vl), vl), vl), 1, vl);
mx = __riscv_vfmul(mx, md, vl);
my = __riscv_vfmul(my, md, vl);
}
if (interpolation == CV_HAL_INTER_NEAREST)
{
auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl);
__riscv_vse8(dst_data + i * dst_step + j, access(ix, iy), vl);
}
else
{
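// Fixed-point bilinear: coordinates are scaled by 1 << 10 (plus a 1 << 4 rounding
// bias); bits 10 and up give the pixel index and bits 5-9 a 5-bit fraction (rx, ry).
// Each lerp rescales by 1 << 5, and the final pair of vnclipu shifts the accumulator
// back down by 10 bits and saturates to 8 bits.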
auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m4(1 << 4, vl), vl), vl);
auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m4(1 << 4, vl), vl), vl);
auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl);
auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
auto v0 = __riscv_vzext_vf4(access(ix0, iy0), vl);
auto v1 = __riscv_vzext_vf4(access(ix1, iy0), vl);
auto v2 = __riscv_vzext_vf4(access(ix0, iy1), vl);
auto v3 = __riscv_vzext_vf4(access(ix1, iy1), vl);
auto rx = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl));
auto ry = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl));
v0 = __riscv_vmacc(__riscv_vmul(v0, 1 << 5, vl), rx, __riscv_vsub(v1, v0, vl), vl);
v2 = __riscv_vmacc(__riscv_vmul(v2, 1 << 5, vl), rx, __riscv_vsub(v3, v2, vl), vl);
v0 = __riscv_vmacc(__riscv_vmul(v0, 1 << 5, vl), ry, __riscv_vsub(v2, v0, vl), vl);
__riscv_vse8(dst_data + i * dst_step + j, __riscv_vnclipu(__riscv_vnclipu(v0, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl), vl);
}
}
}
return CV_HAL_ERROR_OK;
}
template<bool perspective>
static inline int warpC3(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue)
{
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = __riscv_vsetvl_e8mf2(dst_width - j);
auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2) {
auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = __riscv_vloxseg3ei32_v_u8mf2x3(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 3, vl), vl), vl);
src0 = __riscv_vget_v_u8mf2x3_u8mf2(src, 0);
src1 = __riscv_vget_v_u8mf2x3_u8mf2(src, 1);
src2 = __riscv_vget_v_u8mf2x3_u8mf2(src, 2);
if (borderType == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl);
src0 = __riscv_vmerge(src0, borderValue[0], mask, vl);
src1 = __riscv_vmerge(src1, borderValue[1], mask, vl);
src2 = __riscv_vmerge(src2, borderValue[2], mask, vl);
}
};
auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl);
auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[1], __riscv_vfmv_v_f_f32m2(M[2], vl), vl), vl);
auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[4], __riscv_vfmv_v_f_f32m2(M[5], vl), vl), vl);
if (perspective)
{
auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[7], __riscv_vfmv_v_f_f32m2(M[8], vl), vl), vl), 1, vl);
mx = __riscv_vfmul(mx, md, vl);
my = __riscv_vfmul(my, md, vl);
}
if (interpolation == CV_HAL_INTER_NEAREST)
{
auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl);
vuint8mf2_t src0, src1, src2;
access(ix, iy, src0, src1, src2);
vuint8mf2x3_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, src0);
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, src1);
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, src2);
__riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl);
}
else
{
auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl);
auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl);
auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl);
auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
vuint32m2_t v00, v10, v20;
vuint32m2_t v01, v11, v21;
vuint32m2_t v02, v12, v22;
vuint32m2_t v03, v13, v23;
vuint8mf2_t src0, src1, src2;
access(ix0, iy0, src0, src1, src2);
v00 = __riscv_vzext_vf4(src0, vl);
v10 = __riscv_vzext_vf4(src1, vl);
v20 = __riscv_vzext_vf4(src2, vl);
access(ix1, iy0, src0, src1, src2);
v01 = __riscv_vzext_vf4(src0, vl);
v11 = __riscv_vzext_vf4(src1, vl);
v21 = __riscv_vzext_vf4(src2, vl);
access(ix0, iy1, src0, src1, src2);
v02 = __riscv_vzext_vf4(src0, vl);
v12 = __riscv_vzext_vf4(src1, vl);
v22 = __riscv_vzext_vf4(src2, vl);
access(ix1, iy1, src0, src1, src2);
v03 = __riscv_vzext_vf4(src0, vl);
v13 = __riscv_vzext_vf4(src1, vl);
v23 = __riscv_vzext_vf4(src2, vl);
auto rx = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl));
auto ry = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl));
v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), rx, __riscv_vsub(v01, v00, vl), vl);
v02 = __riscv_vmacc(__riscv_vmul(v02, 1 << 5, vl), rx, __riscv_vsub(v03, v02, vl), vl);
v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), ry, __riscv_vsub(v02, v00, vl), vl);
v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), rx, __riscv_vsub(v11, v10, vl), vl);
v12 = __riscv_vmacc(__riscv_vmul(v12, 1 << 5, vl), rx, __riscv_vsub(v13, v12, vl), vl);
v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), ry, __riscv_vsub(v12, v10, vl), vl);
v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), rx, __riscv_vsub(v21, v20, vl), vl);
v22 = __riscv_vmacc(__riscv_vmul(v22, 1 << 5, vl), rx, __riscv_vsub(v23, v22, vl), vl);
v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), ry, __riscv_vsub(v22, v20, vl), vl);
vuint8mf2x3_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, __riscv_vnclipu(__riscv_vnclipu(v00, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, __riscv_vnclipu(__riscv_vnclipu(v10, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, __riscv_vnclipu(__riscv_vnclipu(v20, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
__riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
template<bool perspective>
static inline int warpC4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue)
{
for (int i = start; i < end; i++)
{
int vl;
for (int j = 0; j < dst_width; j += vl)
{
vl = __riscv_vsetvl_e8mf2(dst_width - j);
auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2, vuint8mf2_t& src3) {
auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl));
auto src = __riscv_vloxseg4ei32_v_u8mf2x4(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 4, vl), vl), vl);
src0 = __riscv_vget_v_u8mf2x4_u8mf2(src, 0);
src1 = __riscv_vget_v_u8mf2x4_u8mf2(src, 1);
src2 = __riscv_vget_v_u8mf2x4_u8mf2(src, 2);
src3 = __riscv_vget_v_u8mf2x4_u8mf2(src, 3);
if (borderType == CV_HAL_BORDER_CONSTANT)
{
auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl);
src0 = __riscv_vmerge(src0, borderValue[0], mask, vl);
src1 = __riscv_vmerge(src1, borderValue[1], mask, vl);
src2 = __riscv_vmerge(src2, borderValue[2], mask, vl);
src3 = __riscv_vmerge(src3, borderValue[3], mask, vl);
}
};
auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl);
auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[1], __riscv_vfmv_v_f_f32m2(M[2], vl), vl), vl);
auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[4], __riscv_vfmv_v_f_f32m2(M[5], vl), vl), vl);
if (perspective)
{
auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[7], __riscv_vfmv_v_f_f32m2(M[8], vl), vl), vl), 1, vl);
mx = __riscv_vfmul(mx, md, vl);
my = __riscv_vfmul(my, md, vl);
}
if (interpolation == CV_HAL_INTER_NEAREST)
{
auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl);
vuint8mf2_t src0, src1, src2, src3;
access(ix, iy, src0, src1, src2, src3);
vuint8mf2x4_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, src0);
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, src1);
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, src2);
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, src3);
__riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
}
else
{
auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl);
auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl);
auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl);
auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
vuint32m2_t v00, v10, v20, v30;
vuint32m2_t v01, v11, v21, v31;
vuint32m2_t v02, v12, v22, v32;
vuint32m2_t v03, v13, v23, v33;
vuint8mf2_t src0, src1, src2, src3;
access(ix0, iy0, src0, src1, src2, src3);
v00 = __riscv_vzext_vf4(src0, vl);
v10 = __riscv_vzext_vf4(src1, vl);
v20 = __riscv_vzext_vf4(src2, vl);
v30 = __riscv_vzext_vf4(src3, vl);
access(ix1, iy0, src0, src1, src2, src3);
v01 = __riscv_vzext_vf4(src0, vl);
v11 = __riscv_vzext_vf4(src1, vl);
v21 = __riscv_vzext_vf4(src2, vl);
v31 = __riscv_vzext_vf4(src3, vl);
access(ix0, iy1, src0, src1, src2, src3);
v02 = __riscv_vzext_vf4(src0, vl);
v12 = __riscv_vzext_vf4(src1, vl);
v22 = __riscv_vzext_vf4(src2, vl);
v32 = __riscv_vzext_vf4(src3, vl);
access(ix1, iy1, src0, src1, src2, src3);
v03 = __riscv_vzext_vf4(src0, vl);
v13 = __riscv_vzext_vf4(src1, vl);
v23 = __riscv_vzext_vf4(src2, vl);
v33 = __riscv_vzext_vf4(src3, vl);
auto rx = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl));
auto ry = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl));
v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), rx, __riscv_vsub(v01, v00, vl), vl);
v02 = __riscv_vmacc(__riscv_vmul(v02, 1 << 5, vl), rx, __riscv_vsub(v03, v02, vl), vl);
v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), ry, __riscv_vsub(v02, v00, vl), vl);
v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), rx, __riscv_vsub(v11, v10, vl), vl);
v12 = __riscv_vmacc(__riscv_vmul(v12, 1 << 5, vl), rx, __riscv_vsub(v13, v12, vl), vl);
v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), ry, __riscv_vsub(v12, v10, vl), vl);
v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), rx, __riscv_vsub(v21, v20, vl), vl);
v22 = __riscv_vmacc(__riscv_vmul(v22, 1 << 5, vl), rx, __riscv_vsub(v23, v22, vl), vl);
v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), ry, __riscv_vsub(v22, v20, vl), vl);
v30 = __riscv_vmacc(__riscv_vmul(v30, 1 << 5, vl), rx, __riscv_vsub(v31, v30, vl), vl);
v32 = __riscv_vmacc(__riscv_vmul(v32, 1 << 5, vl), rx, __riscv_vsub(v33, v32, vl), vl);
v30 = __riscv_vmacc(__riscv_vmul(v30, 1 << 5, vl), ry, __riscv_vsub(v32, v30, vl), vl);
vuint8mf2x4_t dst{};
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, __riscv_vnclipu(__riscv_vnclipu(v00, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, __riscv_vnclipu(__riscv_vnclipu(v10, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, __riscv_vnclipu(__riscv_vnclipu(v20, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, __riscv_vnclipu(__riscv_vnclipu(v30, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
__riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
// the algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp,
// in the functions void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear
inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4])
{
if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (borderType != CV_HAL_BORDER_CONSTANT && borderType != CV_HAL_BORDER_REPLICATE)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (interpolation != CV_HAL_INTER_NEAREST && interpolation != CV_HAL_INTER_LINEAR)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (src_type)
{
case CV_8UC1:
return remap::invoke(dst_width, dst_height, {warpC1<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC3:
return remap::invoke(dst_width, dst_height, {warpC3<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC4:
return remap::invoke(dst_width, dst_height, {warpC4<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
// the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp,
// in the functions void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear
inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4])
{
if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (borderType != CV_HAL_BORDER_CONSTANT && borderType != CV_HAL_BORDER_REPLICATE)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (interpolation != CV_HAL_INTER_NEAREST && interpolation != CV_HAL_INTER_LINEAR)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (src_type)
{
case CV_8UC1:
return remap::invoke(dst_width, dst_height, {warpC1<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC3:
return remap::invoke(dst_width, dst_height, {warpC3<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC4:
return remap::invoke(dst_width, dst_height, {warpC4<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::warp
}}
#endif