From ad22d482e6d3bafa8a059042c2acda68bb2f35fc Mon Sep 17 00:00:00 2001 From: damon-spacemit Date: Wed, 27 Aug 2025 01:05:08 +0800 Subject: [PATCH] Merge pull request #27378 from spacemit-com:4.x Add canny, scharr and sobel for riscv-rvv hal. #27378 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- hal/riscv-rvv/include/imgproc.hpp | 23 + hal/riscv-rvv/src/imgproc/canny.cpp | 426 ++++++++++ hal/riscv-rvv/src/imgproc/scharr.cpp | 883 ++++++++++++++++++++ hal/riscv-rvv/src/imgproc/sobel.cpp | 1128 ++++++++++++++++++++++++++ 4 files changed, 2460 insertions(+) create mode 100644 hal/riscv-rvv/src/imgproc/canny.cpp create mode 100644 hal/riscv-rvv/src/imgproc/scharr.cpp create mode 100644 hal/riscv-rvv/src/imgproc/sobel.cpp diff --git a/hal/riscv-rvv/include/imgproc.hpp b/hal/riscv-rvv/include/imgproc.hpp index 398e952802..ab95028e2a 100644 --- a/hal/riscv-rvv/include/imgproc.hpp +++ b/hal/riscv-rvv/include/imgproc.hpp @@ -241,6 +241,29 @@ int integral(int depth, int sdepth, int sqdepth, //#undef cv_hal_integral //#define cv_hal_integral cv::rvv_hal::imgproc::integral +/* ############ scharr ############ */ +int scharr(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, double scale, double delta, int border_type); + +#undef cv_hal_scharr +#define cv_hal_scharr cv::rvv_hal::imgproc::scharr + +/* ############ sobel ############ */ + +int sobel(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type); + +#undef cv_hal_sobel +#define cv_hal_sobel cv::rvv_hal::imgproc::sobel + +/* ############ canny ############ */ +int canny(const uint8_t *src_data, size_t src_step, + uint8_t *dst_data, size_t dst_step, + int width, int height, int cn, + double low_thresh, double high_thresh, + int ksize, bool L2gradient); + +#undef cv_hal_canny +#define cv_hal_canny cv::rvv_hal::imgproc::canny + #endif // CV_HAL_RVV_1P0_ENABLED #if CV_HAL_RVV_071_ENABLED diff --git a/hal/riscv-rvv/src/imgproc/canny.cpp b/hal/riscv-rvv/src/imgproc/canny.cpp new file mode 100644 index 0000000000..f6af73c7a7 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/canny.cpp @@ -0,0 +1,426 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv +{ + namespace rvv_hal + { + namespace imgproc + { +#if CV_HAL_RVV_1P0_ENABLED + + typedef struct + { + int *buffer; + int *prev; + int *curr; + int *next; + size_t size; + } LineBuffer; + + LineBuffer *createLineBuffer(size_t width, size_t align); + + void rotateLineBuffer(LineBuffer *buf); + + void freeLineBuffer(LineBuffer *buf); + + LineBuffer *createLineBuffer(size_t width, size_t align) + { + LineBuffer *buf = (LineBuffer *)malloc(sizeof(LineBuffer)); + if (!buf) + return NULL; + + size_t aligned_width = ((width + align - 1) & ~(align - 1)); + buf->buffer = (int *)aligned_alloc(align, 3 * aligned_width * sizeof(int)); + if (!buf->buffer) + { + free(buf); + return NULL; + } + + buf->prev = buf->buffer; + buf->curr = buf->prev + aligned_width; + buf->next = buf->curr + aligned_width; + buf->size = aligned_width; + + return buf; + } + + void rotateLineBuffer(LineBuffer *buf) + { + int *temp = buf->prev; + buf->prev = buf->curr; + buf->curr = buf->next; + buf->next = temp; + } + + void freeLineBuffer(LineBuffer *buf) + { + if (buf) + { + free(buf->buffer); + free(buf); + } + } + + typedef struct + { + uint8_t **data; + int capacity; + int size; + } Stack; + + static Stack *canny_createStack(int capacity); + static void canny_push(Stack *stack, uint8_t *item); + static uint8_t *canny_pop(Stack *stack); + static void freeStack(Stack *stack); + + static Stack *canny_createStack(int capacity) + { + Stack *stack = (Stack *)malloc(sizeof(Stack)); + stack->data = (uint8_t **)malloc(capacity * sizeof(uint8_t *)); + stack->capacity = capacity; + stack->size = 0; + return stack; + } + + static void canny_push(Stack *stack, uint8_t *item) + { + if (stack->size < stack->capacity) + { + stack->data[stack->size++] = item; + } + } + + static uint8_t *canny_pop(Stack *stack) + { + if (stack->size > 0) + { + return stack->data[--stack->size]; + } + return NULL; + } + + static void freeStack(Stack *stack) + { + free(stack->data); + free(stack); + } + + int sobel(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type); + + int canny(const uint8_t *src_data, size_t src_step, + uint8_t *dst_data, size_t dst_step, + int width, int height, int cn, + double low_thresh, double high_thresh, + int ksize, bool L2gradient) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (cn != 1 || (ksize != 3 && ksize != 5)) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if (L2gradient) + { + low_thresh = 32767.0 < low_thresh ? 32767.0 : low_thresh; + high_thresh = 32767.0 < high_thresh ? 32767.0 : high_thresh; + + if (low_thresh > 0) + low_thresh *= low_thresh; + if (high_thresh > 0) + high_thresh *= high_thresh; + } + + int low = (int)low_thresh; + low_thresh = low - (low > low_thresh); + int high = (int)high_thresh; + high_thresh = high - (high > high_thresh); + + int16_t *dx_data = (int16_t *)malloc(height * width * sizeof(int16_t)); + int16_t *dy_data = (int16_t *)malloc(height * width * sizeof(int16_t)); + if (!dx_data || !dy_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + const size_t grad_step = width * sizeof(int16_t); + + int ret = sobel(src_data, src_step, (uint8_t *)dx_data, grad_step, + width, height, CV_8U, CV_16S, 1, + 0, 0, 0, 0, 1, 0, ksize, 1.0, 0.0, BORDER_REPLICATE); + if (ret != CV_HAL_ERROR_OK) + { + free(dx_data); + free(dy_data); + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + ret = sobel(src_data, src_step, (uint8_t *)dy_data, grad_step, + width, height, CV_8U, CV_16S, 1, + 0, 0, 0, 0, 0, 1, ksize, 1.0, 0.0, BORDER_REPLICATE); + if (ret != CV_HAL_ERROR_OK) + { + free(dx_data); + free(dy_data); + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int mapstep = width + 2; + const size_t mapsize = (height + 2) * mapstep; + uint8_t *edge_map = (uint8_t *)calloc(mapsize, sizeof(uint8_t)); + if (!edge_map) + { + free(dx_data); + free(dy_data); + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + Stack *stack = canny_createStack(width * height); + Stack *borderPeaksLocal = canny_createStack(width * height); + if (!stack || !borderPeaksLocal) + { + free(dx_data); + free(dy_data); + free(edge_map); + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + memset(edge_map, 1, mapstep); + memset(edge_map + (height + 1) * mapstep, 1, mapstep); + + for (int i = 1; i <= height; i++) + { + edge_map[i * mapstep] = 1; + edge_map[i * mapstep + width + 1] = 1; + } + + uint8_t *map = edge_map + mapstep + 1; + + const int simd_align = 16; + LineBuffer *lineBuf = createLineBuffer(width, simd_align); + if (!lineBuf) + { + free(dx_data); + free(dy_data); + free(edge_map); + freeStack(stack); + freeStack(borderPeaksLocal); + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int TG22 = 13573; + for (int i = 0; i <= height; i++) + { + if (i < height) + { + int16_t *dx = (int16_t *)(dx_data + i * width); + int16_t *dy = (int16_t *)(dy_data + i * width); + + if (L2gradient) + { + size_t vl = 0; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m8(width - j); + vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl); + vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl); + vint32m8_t vres = __riscv_vmul_vv_i32m8(vdx, vdx, vl); + vres = __riscv_vmacc_vv_i32m8(vres, vdy, vdy, vl); + __riscv_vse32_v_i32m8(lineBuf->next + j, vres, vl); + } + } + else + { + size_t vl = 0; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m8(width - j); + vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl); + vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl); + vbool4_t mask = __riscv_vmslt_vx_i32m8_b4(vdx, 0, vl); + vdx = __riscv_vmul_vx_i32m8_m(mask, vdx, -1, vl); + mask = __riscv_vmslt_vx_i32m8_b4(vdy, 0, vl); + vdy = __riscv_vmul_vx_i32m8_m(mask, vdy, -1, vl); + vint32m8_t vres = __riscv_vadd_vv_i32m8(vdx, vdy, vl); + __riscv_vse32_v_i32m8(lineBuf->next + j, vres, vl); + } + } + } + else + { + memset(lineBuf->next, 0, width * sizeof(int)); + } + + uint8_t *pmap = map + i * mapstep; + + int16_t *dx = (int16_t *)(dx_data + i * width); + int16_t *dy = (int16_t *)(dy_data + i * width); + + size_t vl = 0; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m8(width - j); + + vint32m8_t vm = __riscv_vle32_v_i32m8(lineBuf->curr + j, vl); + + vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl); + vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl); + + vbool4_t mask_dx = __riscv_vmslt_vx_i32m8_b4(vdx, 0, vl); + vbool4_t mask_dy = __riscv_vmslt_vx_i32m8_b4(vdy, 0, vl); + vint32m8_t vdx_abs = __riscv_vmul_vx_i32m8_m(mask_dx, vdx, -1, vl); + vint32m8_t vdy_abs = __riscv_vmul_vx_i32m8_m(mask_dy, vdy, -1, vl); + + vdy_abs = __riscv_vsll_vx_i32m8(vdy_abs, 15, vl); + + // Calculate threshold values + vint32m8_t vtg22x = __riscv_vmul_vx_i32m8(vdx_abs, TG22, vl); + vint32m8_t vtg67x = __riscv_vadd_vv_i32m8(vtg22x, __riscv_vsll_vx_i32m8(vdx_abs, 16, vl), vl); + + // Create masks for different angle ranges + vbool4_t mask1 = __riscv_vmslt_vv_i32m8_b4(vdy_abs, vtg22x, vl); + vbool4_t mask2 = __riscv_vmsgt_vv_i32m8_b4(vdy_abs, vtg67x, vl); + vbool4_t mask3 = __riscv_vmnot_m_b4(__riscv_vmor_mm_b4(mask1, mask2, vl), vl); + + // Load neighbor pixels for all conditions + vint32m8_t prev_curr = __riscv_vle32_v_i32m8(lineBuf->curr + (j > 0 ? j - 1 : j), vl); + vint32m8_t next_curr = __riscv_vle32_v_i32m8(lineBuf->curr + j + 1, vl); + vint32m8_t prev_line = __riscv_vle32_v_i32m8(lineBuf->prev + j, vl); + vint32m8_t next_line = __riscv_vle32_v_i32m8(lineBuf->next + j, vl); + + // Condition 1: Horizontal/Vertical edges (compare left/right) + vbool4_t cond1_max = __riscv_vmand_mm_b4( + __riscv_vmsgt_vv_i32m8_b4(vm, prev_curr, vl), + __riscv_vmsge_vv_i32m8_b4(vm, next_curr, vl), + vl); + + // Condition 2: Diagonal edges (compare top/bottom) + vbool4_t cond2_max = __riscv_vmand_mm_b4( + __riscv_vmsgt_vv_i32m8_b4(vm, prev_line, vl), + __riscv_vmsge_vv_i32m8_b4(vm, next_line, vl), + vl); + + // Condition 3: Other diagonals (calculate s) + vint32m8_t vxor = __riscv_vxor_vv_i32m8(vdx, vdy, vl); + vbool4_t s_mask = __riscv_vmslt_vx_i32m8_b4(vxor, 0, vl); + + vint32m8_t prev_s1 = __riscv_vle32_v_i32m8(lineBuf->prev + (j > 0 ? j - 1 : j), vl); + vint32m8_t prev_s2 = __riscv_vle32_v_i32m8(lineBuf->prev + j + 1, vl); + vint32m8_t next_s1 = __riscv_vle32_v_i32m8(lineBuf->next + j + 1, vl); + vint32m8_t next_s2 = __riscv_vle32_v_i32m8(lineBuf->next + (j > 0 ? j - 1 : j), vl); + + vint32m8_t prev_sel = __riscv_vmerge_vvm_i32m8(prev_s1, prev_s2, s_mask, vl); + vint32m8_t next_sel = __riscv_vmerge_vvm_i32m8(next_s1, next_s2, s_mask, vl); + + vbool4_t cond3_max = __riscv_vmand_mm_b4( + __riscv_vmsgt_vv_i32m8_b4(vm, prev_sel, vl), + __riscv_vmsgt_vv_i32m8_b4(vm, next_sel, vl), + vl); + + // Combine results from all conditions + vbool4_t vmax = __riscv_vmor_mm_b4( + __riscv_vmand_mm_b4(mask1, cond1_max, vl), + __riscv_vmand_mm_b4(mask2, cond2_max, vl), + vl); + vmax = __riscv_vmor_mm_b4( + vmax, + __riscv_vmand_mm_b4(mask3, cond3_max, vl), + vl); + + // Threshold checks + vbool4_t vlow = __riscv_vmsgt_vx_i32m8_b4(vm, low_thresh, vl); + vbool4_t vhigh = __riscv_vmsgt_vx_i32m8_b4(vm, high_thresh, vl); + vbool4_t valid_edges = __riscv_vmand_mm_b4(vmax, vlow, vl); + vbool4_t strong_edges = __riscv_vmand_mm_b4(valid_edges, vhigh, vl); + + // Generate result map + vuint8m2_t vres = __riscv_vmv_v_x_u8m2(1, vl); + vres = __riscv_vmerge_vxm_u8m2(vres, 0, valid_edges, vl); + vres = __riscv_vmerge_vxm_u8m2(vres, 2, strong_edges, vl); + __riscv_vse8_v_u8m2(pmap + j, vres, vl); + + // canny_Push strong edges to stack + int32_t vidx = __riscv_vfirst_m_b4(strong_edges, vl); + while (vidx >= 0) + { + canny_push(stack, pmap + j + vidx); + strong_edges = __riscv_vmand_mm_b4(strong_edges, + __riscv_vmclr_m_b4(vl), vl); + vidx = __riscv_vfirst_m_b4(strong_edges, vl); + } + } + + rotateLineBuffer(lineBuf); + } + + uint8_t *pmapLower = edge_map; + uint32_t pmapDiff = (uint32_t)((edge_map + mapsize) - pmapLower); + + while (stack->size > 0) + { + uint8_t *m = canny_pop(stack); + + if ((uint64_t)m < (uint64_t)pmapLower || (uint64_t)m >= (uint64_t)(pmapLower + pmapDiff)) + continue; + + const int offsets[8] = { + -mapstep - 1, -mapstep, -mapstep + 1, + -1, +1, + +mapstep - 1, +mapstep, +mapstep + 1}; + + for (int k = 0; k < 8; k++) + { + uint8_t *neighbor = m + offsets[k]; + if ((uint64_t)neighbor < (uint64_t)edge_map || + (uint64_t)neighbor >= (uint64_t)(edge_map + mapsize)) + continue; + + if (*neighbor == 0) + { + *neighbor = 2; + canny_push(stack, neighbor); + } + } + } + + if (borderPeaksLocal->size > 0) + { + for (int i = 0; i < borderPeaksLocal->size; i++) + { + canny_push(stack, borderPeaksLocal->data[i]); + } + } + + for (int i = 0; i < height; i++) + { + uint8_t *pdst = dst_data + i * dst_step; + uint8_t *pmap = map + i * mapstep + 1; + size_t vl = 0; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m8(width - j); + vuint8m8_t vres = __riscv_vle8_v_u8m8(pmap + j, vl); + vres = __riscv_vsrl_vx_u8m8(vres, 1, vl); + vres = __riscv_vsub_vv_u8m8(__riscv_vmv_v_x_u8m8(0, vl), vres, vl); + __riscv_vse8_v_u8m8(pdst + j, vres, vl); + } + } + + freeLineBuffer(lineBuf); + freeStack(borderPeaksLocal); + freeStack(stack); + free(dx_data); + free(dy_data); + free(edge_map); + + return CV_HAL_ERROR_OK; + } + } + } +#endif // CV_HAL_RVV_1P0_ENABLED +} // namespace cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/scharr.cpp b/hal/riscv-rvv/src/imgproc/scharr.cpp new file mode 100644 index 0000000000..695c1e3f34 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/scharr.cpp @@ -0,0 +1,883 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html . + +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv +{ + namespace rvv_hal + { + namespace imgproc + { + +#if CV_HAL_RVV_1P0_ENABLED + +#define borderValue 0 + + static inline int scharr_dx0(int start, int end, const uint8_t *src_data, size_t src_step, + uint8_t *dst_data, size_t dst_step, + int width, int height, + int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)), + int margin_left __attribute__((unused)), int margin_top __attribute__((unused)), + int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)), + double scale __attribute__((unused)), double delta __attribute__((unused)), + int border_type) + { + int dx0, dx1, dx2, dy0, dy1, dy2; + + dy0 = -1; + dx0 = 3; + + dy1 = 0; + dx1 = 10; + + dy2 = 1; + dx2 = 3; + + int16_t trow[width]; + + static thread_local std::vector zero_row; + if (zero_row.size() < static_cast(width)) + { + zero_row.clear(); + zero_row.resize(width, 0); + } + + if (dst_depth == CV_8U) + { + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + trow[0] = (dy0 * srow0[0] + dy1 * srow1[0] + dy2 * srow2[0]); + + // Last pixel + + trow[width - 1] = dy0 * srow0[width - 1] + dy1 * srow1[width - 1] + dy2 * srow2[width - 1]; + + size_t vl = 0; + + // Vector processing + + for (int x = 1; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x - 1); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl)); + vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + + int x = 0; + + uint8_t *drow = (uint8_t *)(dst_data + y * dst_step); + + int16_t temp = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + + x = 1; + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl); + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + + vint16m8_t vres = __riscv_vmul_vx_i16m8(vcenter, dx1, vl); + vcenter = __riscv_vadd_vv_i16m8(vleft, vright, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx0, vcenter, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + + x = width - 1; + + temp = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx2; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + } + } + + else if (dst_depth == CV_16S) + { + + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + if (y == 0) + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + else if (y == height - 1) + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + size_t vl = 0; + + // Vector processing + + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl)); + vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + + int x = 0; + + int16_t *drow = (int16_t *)(dst_data + y * dst_step); + + drow[0] = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[0] * dx0; + } + + x = 1; + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl); + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + + vint16m8_t vres = __riscv_vmul_vx_i16m8(vcenter, dx1, vl); + vcenter = __riscv_vadd_vv_i16m8(vleft, vright, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx0, vcenter, vl); + + __riscv_vse16_v_i16m8(drow + x, vres, vl); + } + + x = width - 1; + + drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[width - 1] * dx2; + } + } + } + + else if (dst_depth == CV_32F) + { + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + if (y == 0) + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + else if (y == height - 1) + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + size_t vl = 0; + + // Vector processing + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl)); + vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + + int x = 0; + + float *drow = (float *)(dst_data + y * dst_step); + + drow[x] = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[0] * dx0; + } + + x = 1; + + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m4(width - x - 1); + vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl); + vint16m4_t vcenter = __riscv_vle16_v_i16m4(trow + x, vl); + vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl); + + vint16m4_t vres = __riscv_vmul_vx_i16m4(vcenter, dx1, vl); + vcenter = __riscv_vadd_vv_i16m4(vleft, vright, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx0, vcenter, vl); + + __riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + + x = width - 1; + + drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[width - 1] * dx2; + } + } + } + return CV_HAL_ERROR_OK; + } + + static inline int scharr_dx1(int start, int end, const uint8_t *src_data, size_t src_step, + uint8_t *dst_data, size_t dst_step, + int width, int height, + int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)), + int margin_left __attribute__((unused)), int margin_top __attribute__((unused)), + int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)), + double scale __attribute__((unused)), double delta __attribute__((unused)), + int border_type) + { + int dx0, dx1, dx2, dy0, dy1, dy2; + dx0 = -1; + dy0 = 3; + + dx1 = 0; + dy1 = 10; + + dx2 = 1; + dy2 = 3; + + int16_t trow[width]; + + static thread_local std::vector zero_row; + if (zero_row.size() < static_cast(width)) + { + zero_row.clear(); + zero_row.resize(width, 0); + } + + if (dst_depth == CV_8U) + { + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + if (y == 0) + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + else if (y == height - 1) + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + trow[0] = (dy0 * srow0[0] + dy1 * srow1[0] + dy2 * srow2[0]); + + // Last pixel + + trow[width - 1] = dy0 * srow0[width - 1] + dy1 * srow1[width - 1] + dy2 * srow2[width - 1]; + + size_t vl = 0; + + // Vector processing + + for (int x = 1; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x - 1); + + vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl); + vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl); + vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl); + + vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl); + vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl); + vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl); + + __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl); + } + + int x = 0; + + uint8_t *drow = (uint8_t *)(dst_data + y * dst_step); + + int16_t temp = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + + x = 1; + + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + + x = width - 1; + + temp = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx2; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + } + } + + else if (dst_depth == CV_16S) + { + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + if (y == 0) + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + else if (y == height - 1) + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + size_t vl = 0; + + // Vector processing + + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl); + vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl); + vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl); + + vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl); + vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl); + vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl); + + __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl); + } + + int x = 0; + + int16_t *drow = (int16_t *)(dst_data + y * dst_step); + + drow[x] = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[0] * dx0; + } + + x = 1; + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl); + + __riscv_vse16_v_i16m8(drow + x, vres, vl); + } + + x = width - 1; + + drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[width - 1] * dx2; + } + } + } + + else if (dst_depth == CV_32F) + { + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + if (y == 0) + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + else if (y == height - 1) + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + size_t vl = 0; + + // Vector processing + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl); + vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl); + vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl); + + vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl); + vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl); + vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl); + + __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl); + } + + int x = 0; + + float *drow = (float *)(dst_data + y * dst_step); + + drow[x] = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[0] * dx0; + } + + x = 1; + + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m4(width - x - 1); + vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl); + + vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl); + vint16m4_t vres = __riscv_vsub_vv_i16m4(vright, vleft, vl); + + __riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + + x = width - 1; + + drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[width - 1] * dx2; + } + } + } + + return CV_HAL_ERROR_OK; + } + + class ScharrInvoker : public ParallelLoopBody + { + public: + explicit ScharrInvoker(std::function _func) + : func(std::move(_func)) {} + + void operator()(const Range &range) const override + { + func(range.start, range.end); + } + + private: + std::function func; + }; + + template + inline int invoke(int height, + int (*f)(int, int, Args...), + Args... args) + { + using namespace std::placeholders; + auto bound = std::bind(f, _1, _2, std::forward(args)...); + cv::parallel_for_(Range(1, height), ScharrInvoker(bound), cv::getNumThreads()); + return f(0, 1, std::forward(args)...); + } + + int scharr(const uint8_t *src_data, size_t src_step, + uint8_t *dst_data, size_t dst_step, + int width, int height, + int src_depth, int dst_depth, int cn, + int margin_left, int margin_top, + int margin_right, int margin_bottom, + int dx, int dy __attribute__((unused)), + double scale, double delta, + int border_type) + { + if (src_depth != CV_8U || + (dst_depth != CV_8U && dst_depth != CV_16S && dst_depth != CV_32F) || + cn != 1 || width < 3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (scale != 1 || delta != 0) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (border_type != BORDER_REPLICATE && border_type != BORDER_CONSTANT) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (margin_left != 0 || margin_top != 0 || margin_right != 0 || margin_bottom != 0) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + // This is copied from opencv/modules/imgproc/src/deriv.cpp + // if (!(dx >= 0 && dy >= 0 && dx + dy == 1)) + // { + // return CV_HAL_ERROR_NOT_IMPLEMENTED; + // } + + int size = 1; + if (dst_depth == CV_16S) + { + size = 2; + } + else if (dst_depth == CV_32F) + { + size = 4; + } + + std::vector temp_buffer; + uint8_t *actual_dst_data = dst_data; + if (src_data == dst_data) + { + temp_buffer.resize(height * dst_step); + actual_dst_data = temp_buffer.data(); + } + + int res = invoke(height, + dx == 1 ? &scharr_dx1 : &scharr_dx0, + src_data, src_step, + actual_dst_data, dst_step, + width, height, + src_depth, dst_depth, cn, + margin_left, margin_top, + margin_right, margin_bottom, + scale, delta, + border_type); + if (src_data == dst_data && res == CV_HAL_ERROR_OK) + { + for (int y = 0; y < height; ++y) + { + memcpy(dst_data + y * dst_step, + actual_dst_data + y * dst_step, + width * size); + } + } + + return res; + } +#endif // CV_HAL_RVV_1P0_ENABLED + } + } +} // cv::rvv_hal::imgproc \ No newline at end of file diff --git a/hal/riscv-rvv/src/imgproc/sobel.cpp b/hal/riscv-rvv/src/imgproc/sobel.cpp new file mode 100644 index 0000000000..7abfe4b5ea --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/sobel.cpp @@ -0,0 +1,1128 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv +{ + namespace rvv_hal + { + namespace imgproc + { + +#if CV_HAL_RVV_1P0_ENABLED + +#define borderValue 0 + int sobel_3X3(int start, int end, const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)), int margin_left __attribute__((unused)), int margin_top __attribute__((unused)), int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)), int dx, int dy, double scale __attribute__((unused)), double delta __attribute__((unused)), int border_type); + + int sobel_3X3(int start, int end, const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)), int margin_left __attribute__((unused)), int margin_top __attribute__((unused)), int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)), int dx, int dy, double scale __attribute__((unused)), double delta __attribute__((unused)), int border_type) + { + if (border_type != BORDER_REPLICATE && border_type != BORDER_CONSTANT) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + int dx0, dx1, dx2, dy1; + + if (dx == 0) + { + dx0 = 1; + dx1 = 2; + dx2 = 1; + } + else if (dx == 1) + { + dx0 = -1; + dx1 = 0; + dx2 = 1; + } + else if (dx == 2) + { + dx0 = 1; + dx1 = -2; + dx2 = 1; + } + else + { + dx0 = 1; + dx1 = 1; + dx2 = 1; + + } + + if (dy == 0) + { + dy1 = 2; + } + else if (dy == 1) + { + dy1 = 0; + } + else if (dy == 2) + { + dy1 = -2; + } + else + { + dy1 = 0; + } + + // int16_t trow[width]; + + std::vector trow_buf(width); + int16_t *trow = trow_buf.data(); + + static thread_local std::vector zero_row; + if (zero_row.size() < static_cast(width)) + { + zero_row.clear(); + zero_row.resize(width, 0); + } + + for (int y = start; y < end; ++y) + { + const uint8_t *srow1 = src_data + y * src_step; + const uint8_t *srow0; + const uint8_t *srow2; + + if (y != 0) + { + srow0 = src_data + (y - 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow0 = srow1; + } + else + { + srow0 = zero_row.data(); + } + } + + if (y != height - 1) + { + srow2 = src_data + (y + 1) * src_step; + } + else + { + if (border_type == BORDER_REPLICATE) + { + srow2 = srow1; + } + else + { + srow2 = zero_row.data(); + } + } + + size_t vl = 0; + + // Vector processing + + if (dy == 1) + { + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl)); + vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + } + else + { + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl)); + vint16m8_t vsrow1 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow1 + x, vl), vl)); + vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vadd_vv_i16m8(vsrow0, vsrow2, vl); + vrowx = __riscv_vmacc_vx_i16m8(vrowx, dy1, vsrow1, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + } + + if (dst_depth == CV_8U) + { + int x = 0; + + uint8_t *drow = (uint8_t *)(dst_data + y * dst_step); + + int16_t temp = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + + x = 1; + + if (dx == 1) + { + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + + vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + } + else + { + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl); + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + + vint16m8_t vres = __riscv_vadd_vv_i16m8(vleft, vright, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx1, vcenter, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + } + + x = width - 1; + + temp = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx2; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + } + + else if (dst_depth == CV_16S) + { + int x = 0; + + int16_t *drow = (int16_t *)(dst_data + y * dst_step); + + drow[0] = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[0] * dx0; + } + + x = 1; + + if (dx == 1) + { + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + + vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl); + + __riscv_vse16_v_i16m8(drow + x, vres, vl); + } + } + else + { + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 1); + vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl); + vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl); + + vint16m8_t vres = __riscv_vadd_vv_i16m8(vleft, vright, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx1, vcenter, vl); + + __riscv_vse16_v_i16m8(drow + x, vres, vl); + } + } + + x = width - 1; + + drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[width - 1] * dx2; + } + } + + else if (dst_depth == CV_32F) + { + int x = 0; + + float *drow = (float *)(dst_data + y * dst_step); + + int temp = trow[0] * dx1 + trow[1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + + drow[0] = (float)temp; + + x = 1; + + if (dx == 1) + { + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m4(width - x - 1); + vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl); + vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl); + + vint16m4_t vres = __riscv_vsub_vv_i16m4(vright, vleft, vl); + + __riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + } + else + { + for (; x < width - 1; x += vl) + { + vl = __riscv_vsetvl_e16m4(width - x - 1); + vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl); + vint16m4_t vcenter = __riscv_vle16_v_i16m4(trow + x, vl); + vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl); + + vint16m4_t vres = __riscv_vadd_vv_i16m4(vleft, vright, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx1, vcenter, vl); + + __riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + } + + x = width - 1; + + drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1; + + if (border_type == BORDER_REPLICATE) + { + drow[x] += trow[width - 1] * dx2; + } + } + } + + return CV_HAL_ERROR_OK; + } + int sobel_5X5(int start, int end, const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, double scale, double delta, int border_type); + int sobel_5X5(int start, int end, const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)), int margin_left __attribute__((unused)), int margin_top __attribute__((unused)), int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)), int dx, int dy, double scale __attribute__((unused)), double delta __attribute__((unused)), int border_type) + { + if (border_type != BORDER_REPLICATE && border_type != BORDER_CONSTANT && border_type != BORDER_REFLECT && border_type != BORDER_REFLECT_101) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + int dx0, dx1, dx2, dx3, dx4, dy1, dy2, dy3; + + if (dx == 0) + { + dx0 = 1; + dx1 = 4; + dx2 = 6; + dx3 = 4; + dx4 = 1; + } + else if (dx == 1) + { + dx0 = -1; + dx1 = -2; + dx2 = 0; + dx3 = 2; + dx4 = 1; + } + else if (dx == 2) + { + dx0 = 1; + dx1 = 0; + dx2 = -2; + dx3 = 0; + dx4 = 1; + } + else + { + dx0 = 1; + dx1 = 1; + dx2 = 1; + dx3 = 1; + dx4 = 1; + } + + if (dy == 0) + { + dy1 = 4; + dy2 = 6; + dy3 = 4; + } + else if (dy == 1) + { + dy1 = -2; + dy2 = 0; + dy3 = 2; + } + else if (dy == 2) + { + dy1 = 0; + dy2 = -2; + dy3 = 0; + } + else + { + dy1 = 0; + dy2 = 0; + dy3 = 0; + } + + // int16_t trow[width]; + + std::vector trow_buf(width); + int16_t *trow = trow_buf.data(); + + static thread_local std::vector zero_row; + if (zero_row.size() < static_cast(width)) + { + zero_row.clear(); + zero_row.resize(width, 0); + } + + for (int y = start; y < end; ++y) + { + const uint8_t *srowm2, *srowm1, *srow0, *srowp1, *srowp2; + + if (border_type == BORDER_REPLICATE) + { + srowm2 = src_data + (y > 2 ? y - 2 : 0) * src_step; + srowm1 = src_data + (y > 1 ? y - 1 : 0) * src_step; + srow0 = src_data + y * src_step; + srowp1 = src_data + (y + 1 < height - 1 ? y + 1 : height - 1) * src_step; + srowp2 = src_data + (y + 2 < height - 1 ? y + 2 : height - 1) * src_step; + } + else if (border_type == BORDER_CONSTANT) + { + srowm2 = y >= 2 ? (src_data + (y - 2) * src_step) : zero_row.data(); + srowm1 = y >= 1 ? (src_data + (y - 1) * src_step) : zero_row.data(); + srow0 = src_data + y * src_step; + srowp1 = y + 1 <= height - 1 ? (src_data + (y + 1) * src_step) : zero_row.data(); + srowp2 = y + 2 <= height - 1 ? (src_data + (y + 2) * src_step) : zero_row.data(); + } + else if (border_type == BORDER_REFLECT) + { + if (y >= 2) + { + srowm2 = src_data + (y - 2) * src_step; + } + else if (y == 1) + { + srowm2 = src_data + (y - 1) * src_step; + } + else // y == 0 + { + srowm2 = src_data + (y + 1) * src_step; + } + + if (y >= 1) + { + srowm1 = src_data + (y - 1) * src_step; + } + else // y == 0 + { + srowm1 = src_data + y * src_step; + } + + srow0 = src_data + y * src_step; + + if (y + 1 <= height - 1) // height - 2 + { + srowp1 = src_data + (y + 1) * src_step; + } + else if (y + 1 == height) // height - 1 + { + srowp1 = src_data + y * src_step; + } + else + { + srowp1 = src_data; + } + + if (y + 2 <= height - 1) // height - 3 + { + srowp2 = src_data + (y + 2) * src_step; + } + else if (y + 2 == height) // height - 2 + { + srowp2 = src_data + (y + 1) * src_step; + } + else // y == height - 1 + { + srowp2 = src_data + (y - 1) * src_step; + } + } + else if (border_type == BORDER_REFLECT_101) + { + if (y >= 2) + { + srowm2 = src_data + (y - 2) * src_step; + } + else if (y == 1) + { + srowm2 = src_data + y * src_step; + } + else // y == 0 + { + srowm2 = src_data + (y + 2) * src_step; + } + + if (y >= 1) + { + srowm1 = src_data + (y - 1) * src_step; + } + else // y == 0 + { + srowm1 = src_data + (y + 1) * src_step; + } + + srow0 = src_data + y * src_step; + + if (y + 1 <= height - 1) // height - 2 + { + srowp1 = src_data + (y + 1) * src_step; + } + else if (y + 1 == height) // height - 1 + { + srowp1 = src_data + (y - 1) * src_step; + } + else + { + srowp1 = src_data; + } + + if (y + 2 <= height - 1) // height - 3 + { + srowp2 = src_data + (y + 2) * src_step; + } + else if (y + 2 == height) // height - 2 + { + srowp2 = src_data + y * src_step; + } + else // y == height - 1 + { + srowp2 = src_data + (y - 2) * src_step; + } + } + else + { + srowm1 = src_data; + } + + size_t vl = 0; + + // Vector processing + + if (dy == 0) + { + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srowm2 + x, vl); + vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srowm1 + x, vl); + vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow0 + x, vl); + vuint8m4_t vsrow3 = __riscv_vle8_v_u8m4(srowp1 + x, vl); + vuint8m4_t vsrow4 = __riscv_vle8_v_u8m4(srowp2 + x, vl); + + vuint16m8_t vrowx = __riscv_vwaddu_vv_u16m8(vsrow4, vsrow0, vl); + vrowx = __riscv_vwmaccu_vx_u16m8(vrowx, dy1, vsrow1, vl); + vrowx = __riscv_vwmaccu_vx_u16m8(vrowx, dy2, vsrow2, vl); + vrowx = __riscv_vwmaccu_vx_u16m8(vrowx, dy3, vsrow3, vl); + + __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl); + } + } + else if (dy == 1) + { + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srowm2 + x, vl), vl)); + vint16m8_t vsrow1 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srowm1 + x, vl), vl)); + vint16m8_t vsrow3 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srowp1 + x, vl), vl)); + vint16m8_t vsrow4 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srowp2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow4, vsrow0, vl); + vrowx = __riscv_vmacc_vx_i16m8(vrowx, dy1, vsrow1, vl); + vrowx = __riscv_vmacc_vx_i16m8(vrowx, dy3, vsrow3, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + } + else + { + for (int x = 0; x < width; x += vl) + { + vl = __riscv_vsetvl_e8m4(width - x); + + vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srowm2 + x, vl), vl)); + vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl)); + vint16m8_t vsrow4 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srowp2 + x, vl), vl)); + + vint16m8_t vrowx = __riscv_vadd_vv_i16m8(vsrow4, vsrow0, vl); + vrowx = __riscv_vmacc_vx_i16m8(vrowx, dy2, vsrow2, vl); + + __riscv_vse16_v_i16m8(trow + x, vrowx, vl); + } + } + + if (dst_depth == CV_8U) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + uint8_t *drow = (uint8_t *)(dst_data + y * dst_step); + + int16_t temp = trow[0] * dx2 + trow[1] * dx3 + trow[2] * dx4; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0 + trow[0] * dx1; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[1] * dx0 + trow[0] * dx1; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[2] * dx0 + trow[1] * dx1; + } + + drow[0] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + + temp = trow[0] * dx1 + trow[1] * dx2 + trow[2] * dx3 + trow[3] * dx4; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[0] * dx0; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[1] * dx0; + } + + drow[1] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + + if (dx == 0) + { + for (int x = 2; x < width - 2; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 2); + vint16m8_t vleft2 = __riscv_vle16_v_i16m8(trow + x - 2, vl); + vint16m8_t vleft1 = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl); + vint16m8_t vright1 = __riscv_vle16_v_i16m8(trow + x + 1, vl); + vint16m8_t vright2 = __riscv_vle16_v_i16m8(trow + x + 2, vl); + + vint16m8_t vres = __riscv_vadd_vv_i16m8(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx1, vleft1, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx2, vcenter, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx3, vright1, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + } + else if (dx == 1) + { + for (int x = 2; x < width - 2; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 2); + vint16m8_t vleft2 = __riscv_vle16_v_i16m8(trow + x - 2, vl); + vint16m8_t vleft1 = __riscv_vle16_v_i16m8(trow + x - 1, vl); + vint16m8_t vright1 = __riscv_vle16_v_i16m8(trow + x + 1, vl); + vint16m8_t vright2 = __riscv_vle16_v_i16m8(trow + x + 2, vl); + + vint16m8_t vres = __riscv_vsub_vv_i16m8(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx1, vleft1, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx3, vright1, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + } + else + { + for (int x = 2; x < width - 2; x += vl) + { + vl = __riscv_vsetvl_e16m8(width - x - 2); + vint16m8_t vleft2 = __riscv_vle16_v_i16m8(trow + x - 2, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl); + vint16m8_t vright2 = __riscv_vle16_v_i16m8(trow + x + 2, vl); + + vint16m8_t vres = __riscv_vadd_vv_i16m8(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx2, vcenter, vl); + + vres = __riscv_vmax_vx_i16m8(vres, 0, vl); + vres = __riscv_vmin_vx_i16m8(vres, 255, vl); + + __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl); + } + } + + size_t x = width - 2; + + temp = trow[x - 2] * dx0 + trow[x - 1] * dx1 + trow[x] * dx2 + trow[x + 1] * dx3; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[x + 1] * dx4; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[width - 2] * dx4; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + + x = width - 1; + + temp = trow[width - 3] * dx0 + trow[width - 2] * dx1 + trow[width - 1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx3 + trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[width - 1] * dx3 + trow[width - 2] * dx4; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[width - 2] * dx3 + trow[width - 3] * dx4; + } + + drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0); + } + + else if (dst_depth == CV_16S) + { + size_t x = 0; + + int16_t *drow = (int16_t *)(dst_data + y * dst_step); + + int16_t temp = trow[0] * dx2 + trow[1] * dx3 + trow[2] * dx4; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0 + trow[0] * dx1; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[1] * dx0 + trow[0] * dx1; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[2] * dx0 + trow[1] * dx1; + } + + drow[0] = temp; + + temp = trow[0] * dx1 + trow[1] * dx2 + trow[2] * dx3 + trow[3] * dx4; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[0] * dx0; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[1] * dx0; + } + + drow[1] = temp; + + x = 1; + + if (dx == 0) + { + for (int xx = 2; xx < width - 2; xx += vl) + { + vl = __riscv_vsetvl_e16m8(width - xx - 2); + vint16m8_t vleft2 = __riscv_vle16_v_i16m8(trow + xx - 2, vl); + vint16m8_t vleft1 = __riscv_vle16_v_i16m8(trow + xx - 1, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + xx, vl); + vint16m8_t vright1 = __riscv_vle16_v_i16m8(trow + xx + 1, vl); + vint16m8_t vright2 = __riscv_vle16_v_i16m8(trow + xx + 2, vl); + + vint16m8_t vres = __riscv_vadd_vv_i16m8(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx1, vleft1, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx2, vcenter, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx3, vright1, vl); + + __riscv_vse16_v_i16m8(drow + xx, vres, vl); + } + } + else if (dx == 1) + { + for (int xx = 2; xx < width - 2; xx += vl) + { + vl = __riscv_vsetvl_e16m8(width - xx - 2); + vint16m8_t vleft2 = __riscv_vle16_v_i16m8(trow + xx - 2, vl); + vint16m8_t vleft1 = __riscv_vle16_v_i16m8(trow + xx - 1, vl); + vint16m8_t vright1 = __riscv_vle16_v_i16m8(trow + xx + 1, vl); + vint16m8_t vright2 = __riscv_vle16_v_i16m8(trow + xx + 2, vl); + + vint16m8_t vres = __riscv_vsub_vv_i16m8(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx1, vleft1, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx3, vright1, vl); + + __riscv_vse16_v_i16m8(drow + xx, vres, vl); + } + } + else + { + for (int xx = 2; xx < width - 2; xx += vl) + { + vl = __riscv_vsetvl_e16m8(width - xx - 2); + vint16m8_t vleft2 = __riscv_vle16_v_i16m8(trow + xx - 2, vl); + vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + xx, vl); + vint16m8_t vright2 = __riscv_vle16_v_i16m8(trow + xx + 2, vl); + + vint16m8_t vres = __riscv_vadd_vv_i16m8(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m8(vres, dx2, vcenter, vl); + + __riscv_vse16_v_i16m8(drow + xx, vres, vl); + } + } + x = width - 2; + + temp = trow[x - 2] * dx0 + trow[x - 1] * dx1 + trow[x] * dx2 + trow[x + 1] * dx3; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[width - 2] * dx4; + } + + drow[x] = temp; + + x = width - 1; + + temp = trow[width - 3] * dx0 + trow[width - 2] * dx1 + trow[width - 1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx3 + trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[width - 1] * dx3 + trow[width - 2] * dx4; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[width - 2] * dx3 + trow[width - 3] * dx4; + } + + drow[x] = temp; + } + + else if (dst_depth == CV_32F) + { + size_t x = 0; + + float *drow = (float *)(dst_data + y * dst_step); + + int16_t temp = trow[0] * dx2 + trow[1] * dx3 + trow[2] * dx4; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0 + trow[0] * dx1; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[1] * dx0 + trow[0] * dx1; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[2] * dx0 + trow[1] * dx1; + } + + drow[0] = (float)(temp); + + temp = trow[0] * dx1 + trow[1] * dx2 + trow[2] * dx3 + trow[3] * dx4; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[0] * dx0; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[0] * dx0; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[1] * dx0; + } + + drow[1] = (float)temp; + + x = 1; + + if (dx == 0) + { + for (int xx = 2; xx < width - 2; xx += vl) + { + vl = __riscv_vsetvl_e16m4(width - xx - 2); + vint16m4_t vleft2 = __riscv_vle16_v_i16m4(trow + xx - 2, vl); + vint16m4_t vleft1 = __riscv_vle16_v_i16m4(trow + xx - 1, vl); + vint16m4_t vcenter = __riscv_vle16_v_i16m4(trow + xx, vl); + vint16m4_t vright1 = __riscv_vle16_v_i16m4(trow + xx + 1, vl); + vint16m4_t vright2 = __riscv_vle16_v_i16m4(trow + xx + 2, vl); + + vint16m4_t vres = __riscv_vadd_vv_i16m4(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx1, vleft1, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx2, vcenter, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx3, vright1, vl); + + __riscv_vse32_v_f32m8(drow + xx, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + } + else if (dx == 1) + { + for (int xx = 2; xx < width - 2; xx += vl) + { + vl = __riscv_vsetvl_e16m4(width - xx - 2); + vint16m4_t vleft2 = __riscv_vle16_v_i16m4(trow + xx - 2, vl); + vint16m4_t vleft1 = __riscv_vle16_v_i16m4(trow + xx - 1, vl); + vint16m4_t vright1 = __riscv_vle16_v_i16m4(trow + xx + 1, vl); + vint16m4_t vright2 = __riscv_vle16_v_i16m4(trow + xx + 2, vl); + + vint16m4_t vres = __riscv_vsub_vv_i16m4(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx1, vleft1, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx3, vright1, vl); + + __riscv_vse32_v_f32m8(drow + xx, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + } + else + { + for (int xx = 2; xx < width - 2; xx += vl) + { + vl = __riscv_vsetvl_e16m4(width - xx - 2); + vint16m4_t vleft2 = __riscv_vle16_v_i16m4(trow + xx - 2, vl); + vint16m4_t vcenter = __riscv_vle16_v_i16m4(trow + xx, vl); + vint16m4_t vright2 = __riscv_vle16_v_i16m4(trow + xx + 2, vl); + + vint16m4_t vres = __riscv_vadd_vv_i16m4(vright2, vleft2, vl); + vres = __riscv_vmacc_vx_i16m4(vres, dx2, vcenter, vl); + + __riscv_vse32_v_f32m8(drow + xx, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl); + } + } + + x = width - 2; + + temp = trow[width - 4] * dx0 + trow[width - 3] * dx1 + trow[width - 2] * dx2 + trow[width - 1] * dx3; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[width - 2] * dx4; + } + + drow[x] = (float)temp; + + x = width - 1; + + temp = trow[width - 3] * dx0 + trow[width - 2] * dx1 + trow[width - 1] * dx2; + + if (border_type == BORDER_REPLICATE) + { + temp += trow[width - 1] * dx3 + trow[width - 1] * dx4; + } + else if (border_type == BORDER_REFLECT) + { + temp += trow[width - 1] * dx3 + trow[width - 2] * dx4; + } + else if (border_type == BORDER_REFLECT_101) + { + temp += trow[width - 2] * dx3 + trow[width - 3] * dx4; + } + + drow[x] = (float)temp; + } + } + + return CV_HAL_ERROR_OK; + } + + class sobelInvoker : public ParallelLoopBody + { + public: + explicit sobelInvoker(std::function _func) + : func(std::move(_func)) {} + + void operator()(const Range &range) const override + { + func(range.start, range.end); + } + + private: + std::function func; + }; + + template + inline int invoke(int height, + int (*f)(int, int, Args...), + Args... args) + { + using namespace std::placeholders; + auto bound = std::bind(f, _1, _2, std::forward(args)...); + cv::parallel_for_(Range(1, height), sobelInvoker(bound), cv::getNumThreads()); + return f(0, 1, std::forward(args)...); + } + + int sobel(const uint8_t *src_data, size_t src_step, + uint8_t *dst_data, size_t dst_step, + int width, int height, + int src_depth, int dst_depth, int cn, + int margin_left, int margin_top, + int margin_right, int margin_bottom, + int dx, int dy, int ksize, + double scale, double delta, + int border_type) + { + if (src_depth != CV_8U || + (dst_depth != CV_8U && dst_depth != CV_16S && dst_depth != CV_32F) || + cn != 1 || width < 3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (scale != 1 || delta != 0) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (margin_left != 0 || margin_top != 0 || margin_right != 0 || margin_bottom != 0) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (height < ksize || width < ksize) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + int size = 1; + if (dst_depth == CV_16S) + { + size = 2; + } + else if (dst_depth == CV_32F) + { + size = 4; + } + + std::vector temp_buffer; + uint8_t *actual_dst_data = dst_data; + if (src_data == dst_data) + { + temp_buffer.resize(height * dst_step); + actual_dst_data = temp_buffer.data(); + } + + int res = 0; + + if (ksize == 3) + { + res = invoke(height, + &sobel_3X3, + src_data, src_step, + actual_dst_data, dst_step, + width, height, + src_depth, dst_depth, cn, + margin_left, margin_top, + margin_right, margin_bottom, + dx, dy, + scale, delta, + border_type); + } + else if (ksize == 5) + { + res = invoke(height, + &sobel_5X5, + src_data, src_step, + actual_dst_data, dst_step, + width, height, + src_depth, dst_depth, cn, + margin_left, margin_top, + margin_right, margin_bottom, + dx, dy, + scale, delta, + border_type); + } + else + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if (src_data == dst_data && res == CV_HAL_ERROR_OK) + { + for (int y = 0; y < height; ++y) + { + memcpy(dst_data + y * dst_step, + actual_dst_data + y * dst_step, + width * size); + } + } + + return res; + } + +#endif // CV_HAL_RVV_1P0_ENABLED + + } + } +} // cv::rvv_hal::imgproc