Merge pull request #27378 from spacemit-com:4.x

Add canny, scharr and sobel for riscv-rvv hal. #27378

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
damon-spacemit 2025-08-27 01:05:08 +08:00 committed by GitHub
parent ad560f69f4
commit ad22d482e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 2460 additions and 0 deletions

View File

@ -241,6 +241,29 @@ int integral(int depth, int sdepth, int sqdepth,
//#undef cv_hal_integral
//#define cv_hal_integral cv::rvv_hal::imgproc::integral
/* ############ scharr ############ */
int scharr(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, double scale, double delta, int border_type);
#undef cv_hal_scharr
#define cv_hal_scharr cv::rvv_hal::imgproc::scharr
/* ############ sobel ############ */
int sobel(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type);
#undef cv_hal_sobel
#define cv_hal_sobel cv::rvv_hal::imgproc::sobel
/* ############ canny ############ */
int canny(const uint8_t *src_data, size_t src_step,
uint8_t *dst_data, size_t dst_step,
int width, int height, int cn,
double low_thresh, double high_thresh,
int ksize, bool L2gradient);
#undef cv_hal_canny
#define cv_hal_canny cv::rvv_hal::imgproc::canny
#endif // CV_HAL_RVV_1P0_ENABLED
#if CV_HAL_RVV_071_ENABLED

View File

@ -0,0 +1,426 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv
{
namespace rvv_hal
{
namespace imgproc
{
#if CV_HAL_RVV_1P0_ENABLED
typedef struct
{
int *buffer;
int *prev;
int *curr;
int *next;
size_t size;
} LineBuffer;
LineBuffer *createLineBuffer(size_t width, size_t align);
void rotateLineBuffer(LineBuffer *buf);
void freeLineBuffer(LineBuffer *buf);
LineBuffer *createLineBuffer(size_t width, size_t align)
{
LineBuffer *buf = (LineBuffer *)malloc(sizeof(LineBuffer));
if (!buf)
return NULL;
size_t aligned_width = ((width + align - 1) & ~(align - 1));
buf->buffer = (int *)aligned_alloc(align, 3 * aligned_width * sizeof(int));
if (!buf->buffer)
{
free(buf);
return NULL;
}
buf->prev = buf->buffer;
buf->curr = buf->prev + aligned_width;
buf->next = buf->curr + aligned_width;
buf->size = aligned_width;
return buf;
}
void rotateLineBuffer(LineBuffer *buf)
{
int *temp = buf->prev;
buf->prev = buf->curr;
buf->curr = buf->next;
buf->next = temp;
}
void freeLineBuffer(LineBuffer *buf)
{
if (buf)
{
free(buf->buffer);
free(buf);
}
}
typedef struct
{
uint8_t **data;
int capacity;
int size;
} Stack;
static Stack *canny_createStack(int capacity);
static void canny_push(Stack *stack, uint8_t *item);
static uint8_t *canny_pop(Stack *stack);
static void freeStack(Stack *stack);
static Stack *canny_createStack(int capacity)
{
Stack *stack = (Stack *)malloc(sizeof(Stack));
stack->data = (uint8_t **)malloc(capacity * sizeof(uint8_t *));
stack->capacity = capacity;
stack->size = 0;
return stack;
}
static void canny_push(Stack *stack, uint8_t *item)
{
if (stack->size < stack->capacity)
{
stack->data[stack->size++] = item;
}
}
static uint8_t *canny_pop(Stack *stack)
{
if (stack->size > 0)
{
return stack->data[--stack->size];
}
return NULL;
}
static void freeStack(Stack *stack)
{
free(stack->data);
free(stack);
}
int sobel(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type);
int canny(const uint8_t *src_data, size_t src_step,
uint8_t *dst_data, size_t dst_step,
int width, int height, int cn,
double low_thresh, double high_thresh,
int ksize, bool L2gradient)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (cn != 1 || (ksize != 3 && ksize != 5))
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
if (L2gradient)
{
low_thresh = 32767.0 < low_thresh ? 32767.0 : low_thresh;
high_thresh = 32767.0 < high_thresh ? 32767.0 : high_thresh;
if (low_thresh > 0)
low_thresh *= low_thresh;
if (high_thresh > 0)
high_thresh *= high_thresh;
}
int low = (int)low_thresh;
low_thresh = low - (low > low_thresh);
int high = (int)high_thresh;
high_thresh = high - (high > high_thresh);
int16_t *dx_data = (int16_t *)malloc(height * width * sizeof(int16_t));
int16_t *dy_data = (int16_t *)malloc(height * width * sizeof(int16_t));
if (!dx_data || !dy_data)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
const size_t grad_step = width * sizeof(int16_t);
int ret = sobel(src_data, src_step, (uint8_t *)dx_data, grad_step,
width, height, CV_8U, CV_16S, 1,
0, 0, 0, 0, 1, 0, ksize, 1.0, 0.0, BORDER_REPLICATE);
if (ret != CV_HAL_ERROR_OK)
{
free(dx_data);
free(dy_data);
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
ret = sobel(src_data, src_step, (uint8_t *)dy_data, grad_step,
width, height, CV_8U, CV_16S, 1,
0, 0, 0, 0, 0, 1, ksize, 1.0, 0.0, BORDER_REPLICATE);
if (ret != CV_HAL_ERROR_OK)
{
free(dx_data);
free(dy_data);
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
const int mapstep = width + 2;
const size_t mapsize = (height + 2) * mapstep;
uint8_t *edge_map = (uint8_t *)calloc(mapsize, sizeof(uint8_t));
if (!edge_map)
{
free(dx_data);
free(dy_data);
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
Stack *stack = canny_createStack(width * height);
Stack *borderPeaksLocal = canny_createStack(width * height);
if (!stack || !borderPeaksLocal)
{
free(dx_data);
free(dy_data);
free(edge_map);
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
memset(edge_map, 1, mapstep);
memset(edge_map + (height + 1) * mapstep, 1, mapstep);
for (int i = 1; i <= height; i++)
{
edge_map[i * mapstep] = 1;
edge_map[i * mapstep + width + 1] = 1;
}
uint8_t *map = edge_map + mapstep + 1;
const int simd_align = 16;
LineBuffer *lineBuf = createLineBuffer(width, simd_align);
if (!lineBuf)
{
free(dx_data);
free(dy_data);
free(edge_map);
freeStack(stack);
freeStack(borderPeaksLocal);
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
const int TG22 = 13573;
for (int i = 0; i <= height; i++)
{
if (i < height)
{
int16_t *dx = (int16_t *)(dx_data + i * width);
int16_t *dy = (int16_t *)(dy_data + i * width);
if (L2gradient)
{
size_t vl = 0;
for (int j = 0; j < width; j += vl)
{
vl = __riscv_vsetvl_e32m8(width - j);
vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl);
vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl);
vint32m8_t vres = __riscv_vmul_vv_i32m8(vdx, vdx, vl);
vres = __riscv_vmacc_vv_i32m8(vres, vdy, vdy, vl);
__riscv_vse32_v_i32m8(lineBuf->next + j, vres, vl);
}
}
else
{
size_t vl = 0;
for (int j = 0; j < width; j += vl)
{
vl = __riscv_vsetvl_e32m8(width - j);
vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl);
vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl);
vbool4_t mask = __riscv_vmslt_vx_i32m8_b4(vdx, 0, vl);
vdx = __riscv_vmul_vx_i32m8_m(mask, vdx, -1, vl);
mask = __riscv_vmslt_vx_i32m8_b4(vdy, 0, vl);
vdy = __riscv_vmul_vx_i32m8_m(mask, vdy, -1, vl);
vint32m8_t vres = __riscv_vadd_vv_i32m8(vdx, vdy, vl);
__riscv_vse32_v_i32m8(lineBuf->next + j, vres, vl);
}
}
}
else
{
memset(lineBuf->next, 0, width * sizeof(int));
}
uint8_t *pmap = map + i * mapstep;
int16_t *dx = (int16_t *)(dx_data + i * width);
int16_t *dy = (int16_t *)(dy_data + i * width);
size_t vl = 0;
for (int j = 0; j < width; j += vl)
{
vl = __riscv_vsetvl_e32m8(width - j);
vint32m8_t vm = __riscv_vle32_v_i32m8(lineBuf->curr + j, vl);
vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl);
vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl);
vbool4_t mask_dx = __riscv_vmslt_vx_i32m8_b4(vdx, 0, vl);
vbool4_t mask_dy = __riscv_vmslt_vx_i32m8_b4(vdy, 0, vl);
vint32m8_t vdx_abs = __riscv_vmul_vx_i32m8_m(mask_dx, vdx, -1, vl);
vint32m8_t vdy_abs = __riscv_vmul_vx_i32m8_m(mask_dy, vdy, -1, vl);
vdy_abs = __riscv_vsll_vx_i32m8(vdy_abs, 15, vl);
// Calculate threshold values
vint32m8_t vtg22x = __riscv_vmul_vx_i32m8(vdx_abs, TG22, vl);
vint32m8_t vtg67x = __riscv_vadd_vv_i32m8(vtg22x, __riscv_vsll_vx_i32m8(vdx_abs, 16, vl), vl);
// Create masks for different angle ranges
vbool4_t mask1 = __riscv_vmslt_vv_i32m8_b4(vdy_abs, vtg22x, vl);
vbool4_t mask2 = __riscv_vmsgt_vv_i32m8_b4(vdy_abs, vtg67x, vl);
vbool4_t mask3 = __riscv_vmnot_m_b4(__riscv_vmor_mm_b4(mask1, mask2, vl), vl);
// Load neighbor pixels for all conditions
vint32m8_t prev_curr = __riscv_vle32_v_i32m8(lineBuf->curr + (j > 0 ? j - 1 : j), vl);
vint32m8_t next_curr = __riscv_vle32_v_i32m8(lineBuf->curr + j + 1, vl);
vint32m8_t prev_line = __riscv_vle32_v_i32m8(lineBuf->prev + j, vl);
vint32m8_t next_line = __riscv_vle32_v_i32m8(lineBuf->next + j, vl);
// Condition 1: Horizontal/Vertical edges (compare left/right)
vbool4_t cond1_max = __riscv_vmand_mm_b4(
__riscv_vmsgt_vv_i32m8_b4(vm, prev_curr, vl),
__riscv_vmsge_vv_i32m8_b4(vm, next_curr, vl),
vl);
// Condition 2: Diagonal edges (compare top/bottom)
vbool4_t cond2_max = __riscv_vmand_mm_b4(
__riscv_vmsgt_vv_i32m8_b4(vm, prev_line, vl),
__riscv_vmsge_vv_i32m8_b4(vm, next_line, vl),
vl);
// Condition 3: Other diagonals (calculate s)
vint32m8_t vxor = __riscv_vxor_vv_i32m8(vdx, vdy, vl);
vbool4_t s_mask = __riscv_vmslt_vx_i32m8_b4(vxor, 0, vl);
vint32m8_t prev_s1 = __riscv_vle32_v_i32m8(lineBuf->prev + (j > 0 ? j - 1 : j), vl);
vint32m8_t prev_s2 = __riscv_vle32_v_i32m8(lineBuf->prev + j + 1, vl);
vint32m8_t next_s1 = __riscv_vle32_v_i32m8(lineBuf->next + j + 1, vl);
vint32m8_t next_s2 = __riscv_vle32_v_i32m8(lineBuf->next + (j > 0 ? j - 1 : j), vl);
vint32m8_t prev_sel = __riscv_vmerge_vvm_i32m8(prev_s1, prev_s2, s_mask, vl);
vint32m8_t next_sel = __riscv_vmerge_vvm_i32m8(next_s1, next_s2, s_mask, vl);
vbool4_t cond3_max = __riscv_vmand_mm_b4(
__riscv_vmsgt_vv_i32m8_b4(vm, prev_sel, vl),
__riscv_vmsgt_vv_i32m8_b4(vm, next_sel, vl),
vl);
// Combine results from all conditions
vbool4_t vmax = __riscv_vmor_mm_b4(
__riscv_vmand_mm_b4(mask1, cond1_max, vl),
__riscv_vmand_mm_b4(mask2, cond2_max, vl),
vl);
vmax = __riscv_vmor_mm_b4(
vmax,
__riscv_vmand_mm_b4(mask3, cond3_max, vl),
vl);
// Threshold checks
vbool4_t vlow = __riscv_vmsgt_vx_i32m8_b4(vm, low_thresh, vl);
vbool4_t vhigh = __riscv_vmsgt_vx_i32m8_b4(vm, high_thresh, vl);
vbool4_t valid_edges = __riscv_vmand_mm_b4(vmax, vlow, vl);
vbool4_t strong_edges = __riscv_vmand_mm_b4(valid_edges, vhigh, vl);
// Generate result map
vuint8m2_t vres = __riscv_vmv_v_x_u8m2(1, vl);
vres = __riscv_vmerge_vxm_u8m2(vres, 0, valid_edges, vl);
vres = __riscv_vmerge_vxm_u8m2(vres, 2, strong_edges, vl);
__riscv_vse8_v_u8m2(pmap + j, vres, vl);
// canny_Push strong edges to stack
int32_t vidx = __riscv_vfirst_m_b4(strong_edges, vl);
while (vidx >= 0)
{
canny_push(stack, pmap + j + vidx);
strong_edges = __riscv_vmand_mm_b4(strong_edges,
__riscv_vmclr_m_b4(vl), vl);
vidx = __riscv_vfirst_m_b4(strong_edges, vl);
}
}
rotateLineBuffer(lineBuf);
}
uint8_t *pmapLower = edge_map;
uint32_t pmapDiff = (uint32_t)((edge_map + mapsize) - pmapLower);
while (stack->size > 0)
{
uint8_t *m = canny_pop(stack);
if ((uint64_t)m < (uint64_t)pmapLower || (uint64_t)m >= (uint64_t)(pmapLower + pmapDiff))
continue;
const int offsets[8] = {
-mapstep - 1, -mapstep, -mapstep + 1,
-1, +1,
+mapstep - 1, +mapstep, +mapstep + 1};
for (int k = 0; k < 8; k++)
{
uint8_t *neighbor = m + offsets[k];
if ((uint64_t)neighbor < (uint64_t)edge_map ||
(uint64_t)neighbor >= (uint64_t)(edge_map + mapsize))
continue;
if (*neighbor == 0)
{
*neighbor = 2;
canny_push(stack, neighbor);
}
}
}
if (borderPeaksLocal->size > 0)
{
for (int i = 0; i < borderPeaksLocal->size; i++)
{
canny_push(stack, borderPeaksLocal->data[i]);
}
}
for (int i = 0; i < height; i++)
{
uint8_t *pdst = dst_data + i * dst_step;
uint8_t *pmap = map + i * mapstep + 1;
size_t vl = 0;
for (int j = 0; j < width; j += vl)
{
vl = __riscv_vsetvl_e8m8(width - j);
vuint8m8_t vres = __riscv_vle8_v_u8m8(pmap + j, vl);
vres = __riscv_vsrl_vx_u8m8(vres, 1, vl);
vres = __riscv_vsub_vv_u8m8(__riscv_vmv_v_x_u8m8(0, vl), vres, vl);
__riscv_vse8_v_u8m8(pdst + j, vres, vl);
}
}
freeLineBuffer(lineBuf);
freeStack(borderPeaksLocal);
freeStack(stack);
free(dx_data);
free(dy_data);
free(edge_map);
return CV_HAL_ERROR_OK;
}
}
}
#endif // CV_HAL_RVV_1P0_ENABLED
} // namespace cv::rvv_hal::imgproc

View File

@ -0,0 +1,883 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html .
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv
{
namespace rvv_hal
{
namespace imgproc
{
#if CV_HAL_RVV_1P0_ENABLED
#define borderValue 0
static inline int scharr_dx0(int start, int end, const uint8_t *src_data, size_t src_step,
uint8_t *dst_data, size_t dst_step,
int width, int height,
int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)),
int margin_left __attribute__((unused)), int margin_top __attribute__((unused)),
int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)),
double scale __attribute__((unused)), double delta __attribute__((unused)),
int border_type)
{
int dx0, dx1, dx2, dy0, dy1, dy2;
dy0 = -1;
dx0 = 3;
dy1 = 0;
dx1 = 10;
dy2 = 1;
dx2 = 3;
int16_t trow[width];
static thread_local std::vector<uint8_t> zero_row;
if (zero_row.size() < static_cast<size_t>(width))
{
zero_row.clear();
zero_row.resize(width, 0);
}
if (dst_depth == CV_8U)
{
for (int y = start; y < end; ++y)
{
const uint8_t *srow1 = src_data + y * src_step;
const uint8_t *srow0;
const uint8_t *srow2;
if (y != 0)
{
srow0 = src_data + (y - 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
if (y != height - 1)
{
srow2 = src_data + (y + 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
trow[0] = (dy0 * srow0[0] + dy1 * srow1[0] + dy2 * srow2[0]);
// Last pixel
trow[width - 1] = dy0 * srow0[width - 1] + dy1 * srow1[width - 1] + dy2 * srow2[width - 1];
size_t vl = 0;
// Vector processing
for (int x = 1; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e8m4(width - x - 1);
vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl));
vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl));
vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl);
__riscv_vse16_v_i16m8(trow + x, vrowx, vl);
}
int x = 0;
uint8_t *drow = (uint8_t *)(dst_data + y * dst_step);
int16_t temp = trow[0] * dx1 + trow[1] * dx2;
if (border_type == BORDER_REPLICATE)
{
temp += trow[0] * dx0;
}
drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
x = 1;
for (; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e16m8(width - x - 1);
vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl);
vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
vint16m8_t vres = __riscv_vmul_vx_i16m8(vcenter, dx1, vl);
vcenter = __riscv_vadd_vv_i16m8(vleft, vright, vl);
vres = __riscv_vmacc_vx_i16m8(vres, dx0, vcenter, vl);
vres = __riscv_vmax_vx_i16m8(vres, 0, vl);
vres = __riscv_vmin_vx_i16m8(vres, 255, vl);
__riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl);
}
x = width - 1;
temp = trow[width - 2] * dx0 + trow[width - 1] * dx1;
if (border_type == BORDER_REPLICATE)
{
temp += trow[width - 1] * dx2;
}
drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
}
}
else if (dst_depth == CV_16S)
{
for (int y = start; y < end; ++y)
{
const uint8_t *srow1 = src_data + y * src_step;
const uint8_t *srow0;
const uint8_t *srow2;
if (y != 0)
{
srow0 = src_data + (y - 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
if (y != height - 1)
{
srow2 = src_data + (y + 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
if (y == 0)
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
else if (y == height - 1)
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
size_t vl = 0;
// Vector processing
for (int x = 0; x < width; x += vl)
{
vl = __riscv_vsetvl_e8m4(width - x);
vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl));
vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl));
vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl);
__riscv_vse16_v_i16m8(trow + x, vrowx, vl);
}
int x = 0;
int16_t *drow = (int16_t *)(dst_data + y * dst_step);
drow[0] = trow[0] * dx1 + trow[1] * dx2;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[0] * dx0;
}
x = 1;
for (; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e16m8(width - x - 1);
vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl);
vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
vint16m8_t vres = __riscv_vmul_vx_i16m8(vcenter, dx1, vl);
vcenter = __riscv_vadd_vv_i16m8(vleft, vright, vl);
vres = __riscv_vmacc_vx_i16m8(vres, dx0, vcenter, vl);
__riscv_vse16_v_i16m8(drow + x, vres, vl);
}
x = width - 1;
drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[width - 1] * dx2;
}
}
}
else if (dst_depth == CV_32F)
{
for (int y = start; y < end; ++y)
{
const uint8_t *srow1 = src_data + y * src_step;
const uint8_t *srow0;
const uint8_t *srow2;
if (y != 0)
{
srow0 = src_data + (y - 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
if (y != height - 1)
{
srow2 = src_data + (y + 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
if (y == 0)
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
else if (y == height - 1)
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
size_t vl = 0;
// Vector processing
for (int x = 0; x < width; x += vl)
{
vl = __riscv_vsetvl_e8m4(width - x);
vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl));
vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl));
vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl);
__riscv_vse16_v_i16m8(trow + x, vrowx, vl);
}
int x = 0;
float *drow = (float *)(dst_data + y * dst_step);
drow[x] = trow[0] * dx1 + trow[1] * dx2;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[0] * dx0;
}
x = 1;
for (; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e16m4(width - x - 1);
vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl);
vint16m4_t vcenter = __riscv_vle16_v_i16m4(trow + x, vl);
vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl);
vint16m4_t vres = __riscv_vmul_vx_i16m4(vcenter, dx1, vl);
vcenter = __riscv_vadd_vv_i16m4(vleft, vright, vl);
vres = __riscv_vmacc_vx_i16m4(vres, dx0, vcenter, vl);
__riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl);
}
x = width - 1;
drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[width - 1] * dx2;
}
}
}
return CV_HAL_ERROR_OK;
}
static inline int scharr_dx1(int start, int end, const uint8_t *src_data, size_t src_step,
uint8_t *dst_data, size_t dst_step,
int width, int height,
int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)),
int margin_left __attribute__((unused)), int margin_top __attribute__((unused)),
int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)),
double scale __attribute__((unused)), double delta __attribute__((unused)),
int border_type)
{
int dx0, dx1, dx2, dy0, dy1, dy2;
dx0 = -1;
dy0 = 3;
dx1 = 0;
dy1 = 10;
dx2 = 1;
dy2 = 3;
int16_t trow[width];
static thread_local std::vector<uint8_t> zero_row;
if (zero_row.size() < static_cast<size_t>(width))
{
zero_row.clear();
zero_row.resize(width, 0);
}
if (dst_depth == CV_8U)
{
for (int y = start; y < end; ++y)
{
const uint8_t *srow1 = src_data + y * src_step;
const uint8_t *srow0;
const uint8_t *srow2;
if (y != 0)
{
srow0 = src_data + (y - 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
if (y != height - 1)
{
srow2 = src_data + (y + 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
if (y == 0)
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
else if (y == height - 1)
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
trow[0] = (dy0 * srow0[0] + dy1 * srow1[0] + dy2 * srow2[0]);
// Last pixel
trow[width - 1] = dy0 * srow0[width - 1] + dy1 * srow1[width - 1] + dy2 * srow2[width - 1];
size_t vl = 0;
// Vector processing
for (int x = 1; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e8m4(width - x - 1);
vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl);
vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl);
vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl);
vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl);
vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl);
vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl);
__riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl);
}
int x = 0;
uint8_t *drow = (uint8_t *)(dst_data + y * dst_step);
int16_t temp = trow[0] * dx1 + trow[1] * dx2;
if (border_type == BORDER_REPLICATE)
{
temp += trow[0] * dx0;
}
drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
x = 1;
for (; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e16m8(width - x - 1);
vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl);
vres = __riscv_vmax_vx_i16m8(vres, 0, vl);
vres = __riscv_vmin_vx_i16m8(vres, 255, vl);
__riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl);
}
x = width - 1;
temp = trow[width - 2] * dx0 + trow[width - 1] * dx1;
if (border_type == BORDER_REPLICATE)
{
temp += trow[width - 1] * dx2;
}
drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
}
}
else if (dst_depth == CV_16S)
{
for (int y = start; y < end; ++y)
{
const uint8_t *srow1 = src_data + y * src_step;
const uint8_t *srow0;
const uint8_t *srow2;
if (y != 0)
{
srow0 = src_data + (y - 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
if (y != height - 1)
{
srow2 = src_data + (y + 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
if (y == 0)
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
else if (y == height - 1)
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
size_t vl = 0;
// Vector processing
for (int x = 0; x < width; x += vl)
{
vl = __riscv_vsetvl_e8m4(width - x);
vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl);
vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl);
vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl);
vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl);
vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl);
vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl);
__riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl);
}
int x = 0;
int16_t *drow = (int16_t *)(dst_data + y * dst_step);
drow[x] = trow[0] * dx1 + trow[1] * dx2;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[0] * dx0;
}
x = 1;
for (; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e16m8(width - x - 1);
vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl);
__riscv_vse16_v_i16m8(drow + x, vres, vl);
}
x = width - 1;
drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[width - 1] * dx2;
}
}
}
else if (dst_depth == CV_32F)
{
for (int y = start; y < end; ++y)
{
const uint8_t *srow1 = src_data + y * src_step;
const uint8_t *srow0;
const uint8_t *srow2;
if (y != 0)
{
srow0 = src_data + (y - 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
if (y != height - 1)
{
srow2 = src_data + (y + 1) * src_step;
}
else
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
if (y == 0)
{
if (border_type == BORDER_REPLICATE)
{
srow0 = srow1;
}
else
{
srow0 = zero_row.data();
}
}
else if (y == height - 1)
{
if (border_type == BORDER_REPLICATE)
{
srow2 = srow1;
}
else
{
srow2 = zero_row.data();
}
}
size_t vl = 0;
// Vector processing
for (int x = 0; x < width; x += vl)
{
vl = __riscv_vsetvl_e8m4(width - x);
vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl);
vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl);
vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl);
vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl);
vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl);
vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl);
__riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl);
}
int x = 0;
float *drow = (float *)(dst_data + y * dst_step);
drow[x] = trow[0] * dx1 + trow[1] * dx2;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[0] * dx0;
}
x = 1;
for (; x < width - 1; x += vl)
{
vl = __riscv_vsetvl_e16m4(width - x - 1);
vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl);
vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl);
vint16m4_t vres = __riscv_vsub_vv_i16m4(vright, vleft, vl);
__riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl);
}
x = width - 1;
drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
if (border_type == BORDER_REPLICATE)
{
drow[x] += trow[width - 1] * dx2;
}
}
}
return CV_HAL_ERROR_OK;
}
class ScharrInvoker : public ParallelLoopBody
{
public:
explicit ScharrInvoker(std::function<int(int, int)> _func)
: func(std::move(_func)) {}
void operator()(const Range &range) const override
{
func(range.start, range.end);
}
private:
std::function<int(int, int)> func;
};
template <typename... Args>
inline int invoke(int height,
int (*f)(int, int, Args...),
Args... args)
{
using namespace std::placeholders;
auto bound = std::bind(f, _1, _2, std::forward<Args>(args)...);
cv::parallel_for_(Range(1, height), ScharrInvoker(bound), cv::getNumThreads());
return f(0, 1, std::forward<Args>(args)...);
}
int scharr(const uint8_t *src_data, size_t src_step,
uint8_t *dst_data, size_t dst_step,
int width, int height,
int src_depth, int dst_depth, int cn,
int margin_left, int margin_top,
int margin_right, int margin_bottom,
int dx, int dy __attribute__((unused)),
double scale, double delta,
int border_type)
{
if (src_depth != CV_8U ||
(dst_depth != CV_8U && dst_depth != CV_16S && dst_depth != CV_32F) ||
cn != 1 || width < 3)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (scale != 1 || delta != 0)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (border_type != BORDER_REPLICATE && border_type != BORDER_CONSTANT)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (margin_left != 0 || margin_top != 0 || margin_right != 0 || margin_bottom != 0)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
// This is copied from opencv/modules/imgproc/src/deriv.cpp
// if (!(dx >= 0 && dy >= 0 && dx + dy == 1))
// {
// return CV_HAL_ERROR_NOT_IMPLEMENTED;
// }
int size = 1;
if (dst_depth == CV_16S)
{
size = 2;
}
else if (dst_depth == CV_32F)
{
size = 4;
}
std::vector<uint8_t> temp_buffer;
uint8_t *actual_dst_data = dst_data;
if (src_data == dst_data)
{
temp_buffer.resize(height * dst_step);
actual_dst_data = temp_buffer.data();
}
int res = invoke(height,
dx == 1 ? &scharr_dx1 : &scharr_dx0,
src_data, src_step,
actual_dst_data, dst_step,
width, height,
src_depth, dst_depth, cn,
margin_left, margin_top,
margin_right, margin_bottom,
scale, delta,
border_type);
if (src_data == dst_data && res == CV_HAL_ERROR_OK)
{
for (int y = 0; y < height; ++y)
{
memcpy(dst_data + y * dst_step,
actual_dst_data + y * dst_step,
width * size);
}
}
return res;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}
}
} // cv::rvv_hal::imgproc

File diff suppressed because it is too large Load Diff