Merge pull request #27378 from spacemit-com:4.x

Add canny, scharr and sobel for riscv-rvv hal. #27378 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-12-06 00:19:46 +01:00 · 2025-08-27 01:05:08 +08:00 · 2025-08-27 01:05:08 +08:00 · ad22d482e6
commit ad22d482e6
parent ad560f69f4
4 changed files with 2460 additions and 0 deletions
--- a/hal/riscv-rvv/include/imgproc.hpp
+++ b/hal/riscv-rvv/include/imgproc.hpp
@ -241,6 +241,29 @@ int integral(int depth, int sdepth, int sqdepth,
 //#undef cv_hal_integral
 //#define cv_hal_integral cv::rvv_hal::imgproc::integral

+/* ############ scharr ############ */
+int scharr(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, double scale, double delta, int border_type);
+
+#undef cv_hal_scharr
+#define cv_hal_scharr cv::rvv_hal::imgproc::scharr
+
+/* ############ sobel ############ */
+
+int sobel(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type);
+
+#undef cv_hal_sobel
+#define cv_hal_sobel cv::rvv_hal::imgproc::sobel
+
+/* ############ canny ############ */
+int canny(const uint8_t *src_data, size_t src_step,
+         uint8_t *dst_data, size_t dst_step,
+         int width, int height, int cn,
+         double low_thresh, double high_thresh,
+         int ksize, bool L2gradient);
+
+#undef cv_hal_canny
+#define cv_hal_canny cv::rvv_hal::imgproc::canny
+
 #endif // CV_HAL_RVV_1P0_ENABLED

 #if CV_HAL_RVV_071_ENABLED
--- a/hal/riscv-rvv/src/imgproc/canny.cpp
+++ b/hal/riscv-rvv/src/imgproc/canny.cpp
@ -0,0 +1,426 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
+
+#include "rvv_hal.hpp"
+#include "common.hpp"
+
+namespace cv
+{
+    namespace rvv_hal
+    {
+        namespace imgproc
+        {
+#if CV_HAL_RVV_1P0_ENABLED
+
+            typedef struct
+            {
+                int *buffer;
+                int *prev;
+                int *curr;
+                int *next;
+                size_t size;
+            } LineBuffer;
+
+            LineBuffer *createLineBuffer(size_t width, size_t align);
+
+            void rotateLineBuffer(LineBuffer *buf);
+
+            void freeLineBuffer(LineBuffer *buf);
+
+            LineBuffer *createLineBuffer(size_t width, size_t align)
+            {
+                LineBuffer *buf = (LineBuffer *)malloc(sizeof(LineBuffer));
+                if (!buf)
+                    return NULL;
+
+                size_t aligned_width = ((width + align - 1) & ~(align - 1));
+                buf->buffer = (int *)aligned_alloc(align, 3 * aligned_width * sizeof(int));
+                if (!buf->buffer)
+                {
+                    free(buf);
+                    return NULL;
+                }
+
+                buf->prev = buf->buffer;
+                buf->curr = buf->prev + aligned_width;
+                buf->next = buf->curr + aligned_width;
+                buf->size = aligned_width;
+
+                return buf;
+            }
+
+            void rotateLineBuffer(LineBuffer *buf)
+            {
+                int *temp = buf->prev;
+                buf->prev = buf->curr;
+                buf->curr = buf->next;
+                buf->next = temp;
+            }
+
+            void freeLineBuffer(LineBuffer *buf)
+            {
+                if (buf)
+                {
+                    free(buf->buffer);
+                    free(buf);
+                }
+            }
+
+            typedef struct
+            {
+                uint8_t **data;
+                int capacity;
+                int size;
+            } Stack;
+
+            static Stack *canny_createStack(int capacity);
+            static void canny_push(Stack *stack, uint8_t *item);
+            static uint8_t *canny_pop(Stack *stack);
+            static void freeStack(Stack *stack);
+
+            static Stack *canny_createStack(int capacity)
+            {
+                Stack *stack = (Stack *)malloc(sizeof(Stack));
+                stack->data = (uint8_t **)malloc(capacity * sizeof(uint8_t *));
+                stack->capacity = capacity;
+                stack->size = 0;
+                return stack;
+            }
+
+            static void canny_push(Stack *stack, uint8_t *item)
+            {
+                if (stack->size < stack->capacity)
+                {
+                    stack->data[stack->size++] = item;
+                }
+            }
+
+            static uint8_t *canny_pop(Stack *stack)
+            {
+                if (stack->size > 0)
+                {
+                    return stack->data[--stack->size];
+                }
+                return NULL;
+            }
+
+            static void freeStack(Stack *stack)
+            {
+                free(stack->data);
+                free(stack);
+            }
+
+            int sobel(const uint8_t *src_data, size_t src_step, uint8_t *dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type);
+
+            int canny(const uint8_t *src_data, size_t src_step,
+                      uint8_t *dst_data, size_t dst_step,
+                      int width, int height, int cn,
+                      double low_thresh, double high_thresh,
+                      int ksize, bool L2gradient)
+            {
+                return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                if (cn != 1 || (ksize != 3 && ksize != 5))
+                {
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                if (L2gradient)
+                {
+                    low_thresh = 32767.0 < low_thresh ? 32767.0 : low_thresh;
+                    high_thresh = 32767.0 < high_thresh ? 32767.0 : high_thresh;
+
+                    if (low_thresh > 0)
+                        low_thresh *= low_thresh;
+                    if (high_thresh > 0)
+                        high_thresh *= high_thresh;
+                }
+
+                int low = (int)low_thresh;
+                low_thresh = low - (low > low_thresh);
+                int high = (int)high_thresh;
+                high_thresh = high - (high > high_thresh);
+
+                int16_t *dx_data = (int16_t *)malloc(height * width * sizeof(int16_t));
+                int16_t *dy_data = (int16_t *)malloc(height * width * sizeof(int16_t));
+                if (!dx_data || !dy_data)
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+                const size_t grad_step = width * sizeof(int16_t);
+
+                int ret = sobel(src_data, src_step, (uint8_t *)dx_data, grad_step,
+                                width, height, CV_8U, CV_16S, 1,
+                                0, 0, 0, 0, 1, 0, ksize, 1.0, 0.0, BORDER_REPLICATE);
+                if (ret != CV_HAL_ERROR_OK)
+                {
+                    free(dx_data);
+                    free(dy_data);
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                ret = sobel(src_data, src_step, (uint8_t *)dy_data, grad_step,
+                            width, height, CV_8U, CV_16S, 1,
+                            0, 0, 0, 0, 0, 1, ksize, 1.0, 0.0, BORDER_REPLICATE);
+                if (ret != CV_HAL_ERROR_OK)
+                {
+                    free(dx_data);
+                    free(dy_data);
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                const int mapstep = width + 2;
+                const size_t mapsize = (height + 2) * mapstep;
+                uint8_t *edge_map = (uint8_t *)calloc(mapsize, sizeof(uint8_t));
+                if (!edge_map)
+                {
+                    free(dx_data);
+                    free(dy_data);
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                Stack *stack = canny_createStack(width * height);
+                Stack *borderPeaksLocal = canny_createStack(width * height);
+                if (!stack || !borderPeaksLocal)
+                {
+                    free(dx_data);
+                    free(dy_data);
+                    free(edge_map);
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                memset(edge_map, 1, mapstep);
+                memset(edge_map + (height + 1) * mapstep, 1, mapstep);
+
+                for (int i = 1; i <= height; i++)
+                {
+                    edge_map[i * mapstep] = 1;
+                    edge_map[i * mapstep + width + 1] = 1;
+                }
+
+                uint8_t *map = edge_map + mapstep + 1;
+
+                const int simd_align = 16;
+                LineBuffer *lineBuf = createLineBuffer(width, simd_align);
+                if (!lineBuf)
+                {
+                    free(dx_data);
+                    free(dy_data);
+                    free(edge_map);
+                    freeStack(stack);
+                    freeStack(borderPeaksLocal);
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                const int TG22 = 13573;
+                for (int i = 0; i <= height; i++)
+                {
+                    if (i < height)
+                    {
+                        int16_t *dx = (int16_t *)(dx_data + i * width);
+                        int16_t *dy = (int16_t *)(dy_data + i * width);
+
+                        if (L2gradient)
+                        {
+                            size_t vl = 0;
+                            for (int j = 0; j < width; j += vl)
+                            {
+                                vl = __riscv_vsetvl_e32m8(width - j);
+                                vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl);
+                                vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl);
+                                vint32m8_t vres = __riscv_vmul_vv_i32m8(vdx, vdx, vl);
+                                vres = __riscv_vmacc_vv_i32m8(vres, vdy, vdy, vl);
+                                __riscv_vse32_v_i32m8(lineBuf->next + j, vres, vl);
+                            }
+                        }
+                        else
+                        {
+                            size_t vl = 0;
+                            for (int j = 0; j < width; j += vl)
+                            {
+                                vl = __riscv_vsetvl_e32m8(width - j);
+                                vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl);
+                                vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl);
+                                vbool4_t mask = __riscv_vmslt_vx_i32m8_b4(vdx, 0, vl);
+                                vdx = __riscv_vmul_vx_i32m8_m(mask, vdx, -1, vl);
+                                mask = __riscv_vmslt_vx_i32m8_b4(vdy, 0, vl);
+                                vdy = __riscv_vmul_vx_i32m8_m(mask, vdy, -1, vl);
+                                vint32m8_t vres = __riscv_vadd_vv_i32m8(vdx, vdy, vl);
+                                __riscv_vse32_v_i32m8(lineBuf->next + j, vres, vl);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        memset(lineBuf->next, 0, width * sizeof(int));
+                    }
+
+                    uint8_t *pmap = map + i * mapstep;
+
+                    int16_t *dx = (int16_t *)(dx_data + i * width);
+                    int16_t *dy = (int16_t *)(dy_data + i * width);
+
+                    size_t vl = 0;
+                    for (int j = 0; j < width; j += vl)
+                    {
+                        vl = __riscv_vsetvl_e32m8(width - j);
+
+                        vint32m8_t vm = __riscv_vle32_v_i32m8(lineBuf->curr + j, vl);
+
+                        vint32m8_t vdx = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dx + j, vl), vl);
+                        vint32m8_t vdy = __riscv_vwcvt_x_x_v_i32m8(__riscv_vle16_v_i16m4(dy + j, vl), vl);
+
+                        vbool4_t mask_dx = __riscv_vmslt_vx_i32m8_b4(vdx, 0, vl);
+                        vbool4_t mask_dy = __riscv_vmslt_vx_i32m8_b4(vdy, 0, vl);
+                        vint32m8_t vdx_abs = __riscv_vmul_vx_i32m8_m(mask_dx, vdx, -1, vl);
+                        vint32m8_t vdy_abs = __riscv_vmul_vx_i32m8_m(mask_dy, vdy, -1, vl);
+
+                        vdy_abs = __riscv_vsll_vx_i32m8(vdy_abs, 15, vl);
+
+                        // Calculate threshold values
+                        vint32m8_t vtg22x = __riscv_vmul_vx_i32m8(vdx_abs, TG22, vl);
+                        vint32m8_t vtg67x = __riscv_vadd_vv_i32m8(vtg22x, __riscv_vsll_vx_i32m8(vdx_abs, 16, vl), vl);
+
+                        // Create masks for different angle ranges
+                        vbool4_t mask1 = __riscv_vmslt_vv_i32m8_b4(vdy_abs, vtg22x, vl);
+                        vbool4_t mask2 = __riscv_vmsgt_vv_i32m8_b4(vdy_abs, vtg67x, vl);
+                        vbool4_t mask3 = __riscv_vmnot_m_b4(__riscv_vmor_mm_b4(mask1, mask2, vl), vl);
+
+                        // Load neighbor pixels for all conditions
+                        vint32m8_t prev_curr = __riscv_vle32_v_i32m8(lineBuf->curr + (j > 0 ? j - 1 : j), vl);
+                        vint32m8_t next_curr = __riscv_vle32_v_i32m8(lineBuf->curr + j + 1, vl);
+                        vint32m8_t prev_line = __riscv_vle32_v_i32m8(lineBuf->prev + j, vl);
+                        vint32m8_t next_line = __riscv_vle32_v_i32m8(lineBuf->next + j, vl);
+
+                        // Condition 1: Horizontal/Vertical edges (compare left/right)
+                        vbool4_t cond1_max = __riscv_vmand_mm_b4(
+                            __riscv_vmsgt_vv_i32m8_b4(vm, prev_curr, vl),
+                            __riscv_vmsge_vv_i32m8_b4(vm, next_curr, vl),
+                            vl);
+
+                        // Condition 2: Diagonal edges (compare top/bottom)
+                        vbool4_t cond2_max = __riscv_vmand_mm_b4(
+                            __riscv_vmsgt_vv_i32m8_b4(vm, prev_line, vl),
+                            __riscv_vmsge_vv_i32m8_b4(vm, next_line, vl),
+                            vl);
+
+                        // Condition 3: Other diagonals (calculate s)
+                        vint32m8_t vxor = __riscv_vxor_vv_i32m8(vdx, vdy, vl);
+                        vbool4_t s_mask = __riscv_vmslt_vx_i32m8_b4(vxor, 0, vl);
+
+                        vint32m8_t prev_s1 = __riscv_vle32_v_i32m8(lineBuf->prev + (j > 0 ? j - 1 : j), vl);
+                        vint32m8_t prev_s2 = __riscv_vle32_v_i32m8(lineBuf->prev + j + 1, vl);
+                        vint32m8_t next_s1 = __riscv_vle32_v_i32m8(lineBuf->next + j + 1, vl);
+                        vint32m8_t next_s2 = __riscv_vle32_v_i32m8(lineBuf->next + (j > 0 ? j - 1 : j), vl);
+
+                        vint32m8_t prev_sel = __riscv_vmerge_vvm_i32m8(prev_s1, prev_s2, s_mask, vl);
+                        vint32m8_t next_sel = __riscv_vmerge_vvm_i32m8(next_s1, next_s2, s_mask, vl);
+
+                        vbool4_t cond3_max = __riscv_vmand_mm_b4(
+                            __riscv_vmsgt_vv_i32m8_b4(vm, prev_sel, vl),
+                            __riscv_vmsgt_vv_i32m8_b4(vm, next_sel, vl),
+                            vl);
+
+                        // Combine results from all conditions
+                        vbool4_t vmax = __riscv_vmor_mm_b4(
+                            __riscv_vmand_mm_b4(mask1, cond1_max, vl),
+                            __riscv_vmand_mm_b4(mask2, cond2_max, vl),
+                            vl);
+                        vmax = __riscv_vmor_mm_b4(
+                            vmax,
+                            __riscv_vmand_mm_b4(mask3, cond3_max, vl),
+                            vl);
+
+                        // Threshold checks
+                        vbool4_t vlow = __riscv_vmsgt_vx_i32m8_b4(vm, low_thresh, vl);
+                        vbool4_t vhigh = __riscv_vmsgt_vx_i32m8_b4(vm, high_thresh, vl);
+                        vbool4_t valid_edges = __riscv_vmand_mm_b4(vmax, vlow, vl);
+                        vbool4_t strong_edges = __riscv_vmand_mm_b4(valid_edges, vhigh, vl);
+
+                        // Generate result map
+                        vuint8m2_t vres = __riscv_vmv_v_x_u8m2(1, vl);
+                        vres = __riscv_vmerge_vxm_u8m2(vres, 0, valid_edges, vl);
+                        vres = __riscv_vmerge_vxm_u8m2(vres, 2, strong_edges, vl);
+                        __riscv_vse8_v_u8m2(pmap + j, vres, vl);
+
+                        // canny_Push strong edges to stack
+                        int32_t vidx = __riscv_vfirst_m_b4(strong_edges, vl);
+                        while (vidx >= 0)
+                        {
+                            canny_push(stack, pmap + j + vidx);
+                            strong_edges = __riscv_vmand_mm_b4(strong_edges,
+                                                               __riscv_vmclr_m_b4(vl), vl);
+                            vidx = __riscv_vfirst_m_b4(strong_edges, vl);
+                        }
+                    }
+
+                    rotateLineBuffer(lineBuf);
+                }
+
+                uint8_t *pmapLower = edge_map;
+                uint32_t pmapDiff = (uint32_t)((edge_map + mapsize) - pmapLower);
+
+                while (stack->size > 0)
+                {
+                    uint8_t *m = canny_pop(stack);
+
+                    if ((uint64_t)m < (uint64_t)pmapLower || (uint64_t)m >= (uint64_t)(pmapLower + pmapDiff))
+                        continue;
+
+                    const int offsets[8] = {
+                        -mapstep - 1, -mapstep, -mapstep + 1,
+                        -1, +1,
+                        +mapstep - 1, +mapstep, +mapstep + 1};
+
+                    for (int k = 0; k < 8; k++)
+                    {
+                        uint8_t *neighbor = m + offsets[k];
+                        if ((uint64_t)neighbor < (uint64_t)edge_map ||
+                            (uint64_t)neighbor >= (uint64_t)(edge_map + mapsize))
+                            continue;
+
+                        if (*neighbor == 0)
+                        {
+                            *neighbor = 2;
+                            canny_push(stack, neighbor);
+                        }
+                    }
+                }
+
+                if (borderPeaksLocal->size > 0)
+                {
+                    for (int i = 0; i < borderPeaksLocal->size; i++)
+                    {
+                        canny_push(stack, borderPeaksLocal->data[i]);
+                    }
+                }
+
+                for (int i = 0; i < height; i++)
+                {
+                    uint8_t *pdst = dst_data + i * dst_step;
+                    uint8_t *pmap = map + i * mapstep + 1;
+                    size_t vl = 0;
+                    for (int j = 0; j < width; j += vl)
+                    {
+                        vl = __riscv_vsetvl_e8m8(width - j);
+                        vuint8m8_t vres = __riscv_vle8_v_u8m8(pmap + j, vl);
+                        vres = __riscv_vsrl_vx_u8m8(vres, 1, vl);
+                        vres = __riscv_vsub_vv_u8m8(__riscv_vmv_v_x_u8m8(0, vl), vres, vl);
+                        __riscv_vse8_v_u8m8(pdst + j, vres, vl);
+                    }
+                }
+
+                freeLineBuffer(lineBuf);
+                freeStack(borderPeaksLocal);
+                freeStack(stack);
+                free(dx_data);
+                free(dy_data);
+                free(edge_map);
+
+                return CV_HAL_ERROR_OK;
+            }
+        }
+    }
+#endif // CV_HAL_RVV_1P0_ENABLED
+} // namespace cv::rvv_hal::imgproc
--- a/hal/riscv-rvv/src/imgproc/scharr.cpp
+++ b/hal/riscv-rvv/src/imgproc/scharr.cpp
@ -0,0 +1,883 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html  .
+
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
+
+#include "rvv_hal.hpp"
+#include "common.hpp"
+
+namespace cv
+{
+    namespace rvv_hal
+    {
+        namespace imgproc
+        {
+
+#if CV_HAL_RVV_1P0_ENABLED
+
+#define borderValue 0
+
+            static inline int scharr_dx0(int start, int end, const uint8_t *src_data, size_t src_step,
+                                         uint8_t *dst_data, size_t dst_step,
+                                         int width, int height,
+                                         int src_depth  __attribute__((unused)), int dst_depth, int cn __attribute__((unused)),
+                                         int margin_left __attribute__((unused)), int margin_top __attribute__((unused)),
+                                         int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)),
+                                         double scale __attribute__((unused)), double delta __attribute__((unused)),
+                                         int border_type)
+            {
+                int dx0, dx1, dx2, dy0, dy1, dy2;
+
+                dy0 = -1;
+                dx0 = 3;
+
+                dy1 = 0;
+                dx1 = 10;
+
+                dy2 = 1;
+                dx2 = 3;
+
+                int16_t trow[width];
+
+                static thread_local std::vector<uint8_t> zero_row;
+                if (zero_row.size() < static_cast<size_t>(width))
+                {
+                    zero_row.clear();
+                    zero_row.resize(width, 0);
+                }
+
+                if (dst_depth == CV_8U)
+                {
+                    for (int y = start; y < end; ++y)
+                    {
+                        const uint8_t *srow1 = src_data + y * src_step;
+                        const uint8_t *srow0;
+                        const uint8_t *srow2;
+
+                        if (y != 0)
+                        {
+                            srow0 = src_data + (y - 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+
+                        if (y != height - 1)
+                        {
+                            srow2 = src_data + (y + 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        trow[0] = (dy0 * srow0[0] + dy1 * srow1[0] + dy2 * srow2[0]);
+
+                        // Last pixel
+
+                        trow[width - 1] = dy0 * srow0[width - 1] + dy1 * srow1[width - 1] + dy2 * srow2[width - 1];
+
+                        size_t vl = 0;
+
+                        // Vector processing
+
+                        for (int x = 1; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e8m4(width - x - 1);
+
+                            vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl));
+                            vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl));
+
+                            vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl);
+
+                            __riscv_vse16_v_i16m8(trow + x, vrowx, vl);
+                        }
+
+                        int x = 0;
+
+                        uint8_t *drow = (uint8_t *)(dst_data + y * dst_step);
+
+                        int16_t temp = trow[0] * dx1 + trow[1] * dx2;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            temp += trow[0] * dx0;
+                        }
+
+                        drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
+
+                        x = 1;
+                        for (; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e16m8(width - x - 1);
+                            vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
+                            vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl);
+                            vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
+
+                            vint16m8_t vres = __riscv_vmul_vx_i16m8(vcenter, dx1, vl);
+                            vcenter = __riscv_vadd_vv_i16m8(vleft, vright, vl);
+                            vres = __riscv_vmacc_vx_i16m8(vres, dx0, vcenter, vl);
+
+                            vres = __riscv_vmax_vx_i16m8(vres, 0, vl);
+                            vres = __riscv_vmin_vx_i16m8(vres, 255, vl);
+
+                            __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl);
+                        }
+
+                        x = width - 1;
+
+                        temp = trow[width - 2] * dx0 + trow[width - 1] * dx1;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            temp += trow[width - 1] * dx2;
+                        }
+
+                        drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
+                    }
+                }
+
+                else if (dst_depth == CV_16S)
+                {
+
+                    for (int y = start; y < end; ++y)
+                    {
+                        const uint8_t *srow1 = src_data + y * src_step;
+                        const uint8_t *srow0;
+                        const uint8_t *srow2;
+
+                        if (y != 0)
+                        {
+                            srow0 = src_data + (y - 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+
+                        if (y != height - 1)
+                        {
+                            srow2 = src_data + (y + 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        if (y == 0)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+                        else if (y == height - 1)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        size_t vl = 0;
+
+                        // Vector processing
+
+                        for (int x = 0; x < width; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e8m4(width - x);
+
+                            vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl));
+                            vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl));
+
+                            vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl);
+
+                            __riscv_vse16_v_i16m8(trow + x, vrowx, vl);
+                        }
+
+                        int x = 0;
+
+                        int16_t *drow = (int16_t *)(dst_data + y * dst_step);
+
+                        drow[0] = trow[0] * dx1 + trow[1] * dx2;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[0] * dx0;
+                        }
+
+                        x = 1;
+                        for (; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e16m8(width - x - 1);
+                            vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
+                            vint16m8_t vcenter = __riscv_vle16_v_i16m8(trow + x, vl);
+                            vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
+
+                            vint16m8_t vres = __riscv_vmul_vx_i16m8(vcenter, dx1, vl);
+                            vcenter = __riscv_vadd_vv_i16m8(vleft, vright, vl);
+                            vres = __riscv_vmacc_vx_i16m8(vres, dx0, vcenter, vl);
+
+                            __riscv_vse16_v_i16m8(drow + x, vres, vl);
+                        }
+
+                        x = width - 1;
+
+                        drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[width - 1] * dx2;
+                        }
+                    }
+                }
+
+                else if (dst_depth == CV_32F)
+                {
+                    for (int y = start; y < end; ++y)
+                    {
+                        const uint8_t *srow1 = src_data + y * src_step;
+                        const uint8_t *srow0;
+                        const uint8_t *srow2;
+
+                        if (y != 0)
+                        {
+                            srow0 = src_data + (y - 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+
+                        if (y != height - 1)
+                        {
+                            srow2 = src_data + (y + 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        if (y == 0)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+                        else if (y == height - 1)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        size_t vl = 0;
+
+                        // Vector processing
+                        for (int x = 0; x < width; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e8m4(width - x);
+
+                            vint16m8_t vsrow0 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow0 + x, vl), vl));
+                            vint16m8_t vsrow2 = __riscv_vreinterpret_v_u16m8_i16m8(__riscv_vwcvtu_x_x_v_u16m8(__riscv_vle8_v_u8m4(srow2 + x, vl), vl));
+
+                            vint16m8_t vrowx = __riscv_vsub_vv_i16m8(vsrow2, vsrow0, vl);
+
+                            __riscv_vse16_v_i16m8(trow + x, vrowx, vl);
+                        }
+
+                        int x = 0;
+
+                        float *drow = (float *)(dst_data + y * dst_step);
+
+                        drow[x] = trow[0] * dx1 + trow[1] * dx2;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[0] * dx0;
+                        }
+
+                        x = 1;
+
+                        for (; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e16m4(width - x - 1);
+                            vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl);
+                            vint16m4_t vcenter = __riscv_vle16_v_i16m4(trow + x, vl);
+                            vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl);
+
+                            vint16m4_t vres = __riscv_vmul_vx_i16m4(vcenter, dx1, vl);
+                            vcenter = __riscv_vadd_vv_i16m4(vleft, vright, vl);
+                            vres = __riscv_vmacc_vx_i16m4(vres, dx0, vcenter, vl);
+
+                            __riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl);
+                        }
+
+                        x = width - 1;
+
+                        drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[width - 1] * dx2;
+                        }
+                    }
+                }
+                return CV_HAL_ERROR_OK;
+            }
+
+            static inline int scharr_dx1(int start, int end, const uint8_t *src_data, size_t src_step,
+                                         uint8_t *dst_data, size_t dst_step,
+                                         int width, int height,
+                                         int src_depth __attribute__((unused)), int dst_depth, int cn __attribute__((unused)),
+                                         int margin_left __attribute__((unused)), int margin_top __attribute__((unused)),
+                                         int margin_right __attribute__((unused)), int margin_bottom __attribute__((unused)),
+                                         double scale __attribute__((unused)), double delta __attribute__((unused)),
+                                         int border_type)
+            {
+                int dx0, dx1, dx2, dy0, dy1, dy2;
+                dx0 = -1;
+                dy0 = 3;
+
+                dx1 = 0;
+                dy1 = 10;
+
+                dx2 = 1;
+                dy2 = 3;
+
+                int16_t trow[width];
+
+                static thread_local std::vector<uint8_t> zero_row;
+                if (zero_row.size() < static_cast<size_t>(width))
+                {
+                    zero_row.clear();
+                    zero_row.resize(width, 0);
+                }
+
+                if (dst_depth == CV_8U)
+                {
+                    for (int y = start; y < end; ++y)
+                    {
+                        const uint8_t *srow1 = src_data + y * src_step;
+                        const uint8_t *srow0;
+                        const uint8_t *srow2;
+
+                        if (y != 0)
+                        {
+                            srow0 = src_data + (y - 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+
+                        if (y != height - 1)
+                        {
+                            srow2 = src_data + (y + 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        if (y == 0)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+                        else if (y == height - 1)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        trow[0] = (dy0 * srow0[0] + dy1 * srow1[0] + dy2 * srow2[0]);
+
+                        // Last pixel
+
+                        trow[width - 1] = dy0 * srow0[width - 1] + dy1 * srow1[width - 1] + dy2 * srow2[width - 1];
+
+                        size_t vl = 0;
+
+                        // Vector processing
+
+                        for (int x = 1; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e8m4(width - x - 1);
+
+                            vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl);
+                            vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl);
+                            vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl);
+
+                            vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl);
+                            vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl);
+                            vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl);
+
+                            __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl);
+                        }
+
+                        int x = 0;
+
+                        uint8_t *drow = (uint8_t *)(dst_data + y * dst_step);
+
+                        int16_t temp = trow[0] * dx1 + trow[1] * dx2;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            temp += trow[0] * dx0;
+                        }
+
+                        drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
+
+                        x = 1;
+
+                        for (; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e16m8(width - x - 1);
+                            vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
+
+                            vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
+                            vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl);
+
+                            vres = __riscv_vmax_vx_i16m8(vres, 0, vl);
+                            vres = __riscv_vmin_vx_i16m8(vres, 255, vl);
+
+                            __riscv_vse8_v_u8m4(drow + x, __riscv_vncvt_x_x_w_u8m4(__riscv_vreinterpret_v_i16m8_u16m8(vres), vl), vl);
+                        }
+
+                        x = width - 1;
+
+                        temp = trow[width - 2] * dx0 + trow[width - 1] * dx1;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            temp += trow[width - 1] * dx2;
+                        }
+
+                        drow[x] = (uint8_t)(temp > 0 ? (temp < 255 ? temp : 255) : 0);
+                    }
+                }
+
+                else if (dst_depth == CV_16S)
+                {
+                    for (int y = start; y < end; ++y)
+                    {
+                        const uint8_t *srow1 = src_data + y * src_step;
+                        const uint8_t *srow0;
+                        const uint8_t *srow2;
+
+                        if (y != 0)
+                        {
+                            srow0 = src_data + (y - 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+
+                        if (y != height - 1)
+                        {
+                            srow2 = src_data + (y + 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        if (y == 0)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+                        else if (y == height - 1)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        size_t vl = 0;
+
+                        // Vector processing
+
+                        for (int x = 0; x < width; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e8m4(width - x);
+
+                            vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl);
+                            vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl);
+                            vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl);
+
+                            vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl);
+                            vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl);
+                            vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl);
+
+                            __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl);
+                        }
+
+                        int x = 0;
+
+                        int16_t *drow = (int16_t *)(dst_data + y * dst_step);
+
+                        drow[x] = trow[0] * dx1 + trow[1] * dx2;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[0] * dx0;
+                        }
+
+                        x = 1;
+                        for (; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e16m8(width - x - 1);
+                            vint16m8_t vleft = __riscv_vle16_v_i16m8(trow + x - 1, vl);
+
+                            vint16m8_t vright = __riscv_vle16_v_i16m8(trow + x + 1, vl);
+                            vint16m8_t vres = __riscv_vsub_vv_i16m8(vright, vleft, vl);
+
+                            __riscv_vse16_v_i16m8(drow + x, vres, vl);
+                        }
+
+                        x = width - 1;
+
+                        drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[width - 1] * dx2;
+                        }
+                    }
+                }
+
+                else if (dst_depth == CV_32F)
+                {
+                    for (int y = start; y < end; ++y)
+                    {
+                        const uint8_t *srow1 = src_data + y * src_step;
+                        const uint8_t *srow0;
+                        const uint8_t *srow2;
+
+                        if (y != 0)
+                        {
+                            srow0 = src_data + (y - 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+
+                        if (y != height - 1)
+                        {
+                            srow2 = src_data + (y + 1) * src_step;
+                        }
+                        else
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        if (y == 0)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow0 = srow1;
+                            }
+                            else
+                            {
+                                srow0 = zero_row.data();
+                            }
+                        }
+                        else if (y == height - 1)
+                        {
+                            if (border_type == BORDER_REPLICATE)
+                            {
+                                srow2 = srow1;
+                            }
+                            else
+                            {
+                                srow2 = zero_row.data();
+                            }
+                        }
+
+                        size_t vl = 0;
+
+                        // Vector processing
+                        for (int x = 0; x < width; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e8m4(width - x);
+
+                            vuint8m4_t vsrow0 = __riscv_vle8_v_u8m4(srow0 + x, vl);
+                            vuint8m4_t vsrow1 = __riscv_vle8_v_u8m4(srow1 + x, vl);
+                            vuint8m4_t vsrow2 = __riscv_vle8_v_u8m4(srow2 + x, vl);
+
+                            vuint16m8_t vrowx = __riscv_vwmulu_vx_u16m8(vsrow1, dy1, vl);
+                            vuint16m8_t vtemp = __riscv_vwaddu_vv_u16m8(vsrow0, vsrow2, vl);
+                            vrowx = __riscv_vmacc_vx_u16m8(vrowx, dy0, vtemp, vl);
+
+                            __riscv_vse16_v_i16m8(trow + x, __riscv_vreinterpret_v_u16m8_i16m8(vrowx), vl);
+                        }
+
+                        int x = 0;
+
+                        float *drow = (float *)(dst_data + y * dst_step);
+
+                        drow[x] = trow[0] * dx1 + trow[1] * dx2;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[0] * dx0;
+                        }
+
+                        x = 1;
+
+                        for (; x < width - 1; x += vl)
+                        {
+                            vl = __riscv_vsetvl_e16m4(width - x - 1);
+                            vint16m4_t vleft = __riscv_vle16_v_i16m4(trow + x - 1, vl);
+
+                            vint16m4_t vright = __riscv_vle16_v_i16m4(trow + x + 1, vl);
+                            vint16m4_t vres = __riscv_vsub_vv_i16m4(vright, vleft, vl);
+
+                            __riscv_vse32_v_f32m8(drow + x, __riscv_vfwcvt_f_x_v_f32m8(vres, vl), vl);
+                        }
+
+                        x = width - 1;
+
+                        drow[x] = trow[width - 2] * dx0 + trow[width - 1] * dx1;
+
+                        if (border_type == BORDER_REPLICATE)
+                        {
+                            drow[x] += trow[width - 1] * dx2;
+                        }
+                    }
+                }
+
+                return CV_HAL_ERROR_OK;
+            }
+
+            class ScharrInvoker : public ParallelLoopBody
+            {
+            public:
+                explicit ScharrInvoker(std::function<int(int, int)> _func)
+                    : func(std::move(_func)) {}
+
+                void operator()(const Range &range) const override
+                {
+                    func(range.start, range.end);
+                }
+
+            private:
+                std::function<int(int, int)> func;
+            };
+
+            template <typename... Args>
+            inline int invoke(int height,
+                              int (*f)(int, int, Args...),
+                              Args... args)
+            {
+                using namespace std::placeholders;
+                auto bound = std::bind(f, _1, _2, std::forward<Args>(args)...);
+                cv::parallel_for_(Range(1, height), ScharrInvoker(bound), cv::getNumThreads());
+                return f(0, 1, std::forward<Args>(args)...);
+            }
+
+            int scharr(const uint8_t *src_data, size_t src_step,
+                       uint8_t *dst_data, size_t dst_step,
+                       int width, int height,
+                       int src_depth, int dst_depth, int cn,
+                       int margin_left, int margin_top,
+                       int margin_right, int margin_bottom,
+                       int dx, int dy __attribute__((unused)),
+                       double scale, double delta,
+                       int border_type)
+            {
+                if (src_depth != CV_8U ||
+                    (dst_depth != CV_8U && dst_depth != CV_16S && dst_depth != CV_32F) ||
+                    cn != 1 || width < 3)
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+                if (scale != 1 || delta != 0)
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+                if (border_type != BORDER_REPLICATE && border_type != BORDER_CONSTANT)
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+                if (margin_left != 0 || margin_top != 0 || margin_right != 0 || margin_bottom != 0)
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+                // This is copied from opencv/modules/imgproc/src/deriv.cpp
+                // if (!(dx >= 0 && dy >= 0 && dx + dy == 1))
+                // {
+                //     return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                // }
+
+                int size = 1;
+                if (dst_depth == CV_16S)
+                {
+                    size = 2;
+                }
+                else if (dst_depth == CV_32F)
+                {
+                    size = 4;
+                }
+
+                std::vector<uint8_t> temp_buffer;
+                uint8_t *actual_dst_data = dst_data;
+                if (src_data == dst_data)
+                {
+                    temp_buffer.resize(height * dst_step);
+                    actual_dst_data = temp_buffer.data();
+                }
+
+                int res = invoke(height,
+                                 dx == 1 ? &scharr_dx1 : &scharr_dx0,
+                                 src_data, src_step,
+                                 actual_dst_data, dst_step,
+                                 width, height,
+                                 src_depth, dst_depth, cn,
+                                 margin_left, margin_top,
+                                 margin_right, margin_bottom,
+                                 scale, delta,
+                                 border_type);
+                if (src_data == dst_data && res == CV_HAL_ERROR_OK)
+                {
+                    for (int y = 0; y < height; ++y)
+                    {
+                        memcpy(dst_data + y * dst_step,
+                               actual_dst_data + y * dst_step,
+                               width * size);
+                    }
+                }
+
+                return res;
+            }
+#endif // CV_HAL_RVV_1P0_ENABLED
+        }
+    }
+} // cv::rvv_hal::imgproc
--- a/hal/riscv-rvv/src/imgproc/sobel.cpp
+++ b/hal/riscv-rvv/src/imgproc/sobel.cpp