Merge pull request #27391 from Haosonn:pr-rvv-hal-fast

hal/riscv-rvv: implement FAST keypoint detection #27391 An implementation of FAST keypoint detection with NMS/noNMS versions. A new perf test is written, and the perf test is evaluated on two platforms: K1/K230. Acceleration is achieved when the threshold is high; however, the measurements show that there is no speed-up when the threshold is low (i.e. when the number of keypoint candidates is high). K1: ``` # GCC Name of Test scalar rvv rvv vs scalar (x-factor) detect::Fast_Params::(20, 2, false, "cv/cameracalibration/chess9.png") 22.113 23.721 0.93 detect::Fast_Params::(20, 2, false, "cv/inpaint/orig.png") 4.605 7.168 0.64 detect::Fast_Params::(20, 2, true, "cv/cameracalibration/chess9.png") 26.228 24.689 1.06 detect::Fast_Params::(20, 2, true, "cv/inpaint/orig.png") 7.134 7.561 0.94 detect::Fast_Params::(30, 2, false, "cv/cameracalibration/chess9.png") 19.488 21.407 0.91 detect::Fast_Params::(30, 2, false, "cv/inpaint/orig.png") 3.481 5.404 0.64 detect::Fast_Params::(30, 2, true, "cv/cameracalibration/chess9.png") 22.309 22.145 1.01 detect::Fast_Params::(30, 2, true, "cv/inpaint/orig.png") 4.826 5.654 0.85 detect::Fast_Params::(100, 2, false, "cv/cameracalibration/chess9.png") 14.108 8.205 1.72 detect::Fast_Params::(100, 2, false, "cv/inpaint/orig.png") 2.520 1.072 2.35 detect::Fast_Params::(100, 2, true, "cv/cameracalibration/chess9.png") 14.133 8.410 1.68 detect::Fast_Params::(100, 2, true, "cv/inpaint/orig.png") 2.556 1.097 2.33 # Clang Name of Test scalar rvv rvv vs scalar (x-factor) detect::Fast_Params::(20, 2, false, "cv/cameracalibration/chess9.png") 25.130 23.695 1.06 detect::Fast_Params::(20, 2, false, "cv/inpaint/orig.png") 4.987 7.168 0.70 detect::Fast_Params::(20, 2, true, "cv/cameracalibration/chess9.png") 28.035 24.467 1.15 detect::Fast_Params::(20, 2, true, "cv/inpaint/orig.png") 6.760 7.503 0.90 detect::Fast_Params::(30, 2, false, "cv/cameracalibration/chess9.png") 22.954 21.373 1.07 detect::Fast_Params::(30, 2, false, "cv/inpaint/orig.png") 3.838 5.330 0.72 
detect::Fast_Params::(30, 2, true, "cv/cameracalibration/chess9.png") 24.523 21.998 1.11 detect::Fast_Params::(30, 2, true, "cv/inpaint/orig.png") 4.795 5.543 0.87 detect::Fast_Params::(100, 2, false, "cv/cameracalibration/chess9.png") 16.799 8.102 2.07 detect::Fast_Params::(100, 2, false, "cv/inpaint/orig.png") 2.874 1.024 2.81 detect::Fast_Params::(100, 2, true, "cv/cameracalibration/chess9.png") 16.950 8.073 2.10 detect::Fast_Params::(100, 2, true, "cv/inpaint/orig.png") 2.899 1.027 2.82 ``` K230 ``` # GCC Name of Test scalar rvv rvv vs scalar (x-factor) detect::Fast_Params::(20, 2, false, "cv/cameracalibration/chess9.png") 21.082 32.090 0.66 detect::Fast_Params::(20, 2, false, "cv/inpaint/orig.png") 4.837 9.157 0.53 detect::Fast_Params::(20, 2, true, "cv/cameracalibration/chess9.png") 25.479 33.576 0.76 detect::Fast_Params::(20, 2, true, "cv/inpaint/orig.png") 7.549 9.716 0.78 detect::Fast_Params::(30, 2, false, "cv/cameracalibration/chess9.png") 18.463 30.087 0.61 detect::Fast_Params::(30, 2, false, "cv/inpaint/orig.png") 3.716 6.544 0.57 detect::Fast_Params::(30, 2, true, "cv/cameracalibration/chess9.png") 21.548 31.374 0.69 detect::Fast_Params::(30, 2, true, "cv/inpaint/orig.png") 5.107 6.928 0.74 detect::Fast_Params::(100, 2, false, "cv/cameracalibration/chess9.png") 13.763 8.712 1.58 detect::Fast_Params::(100, 2, false, "cv/inpaint/orig.png") 2.578 1.284 2.01 detect::Fast_Params::(100, 2, true, "cv/cameracalibration/chess9.png") 13.804 8.831 1.56 detect::Fast_Params::(100, 2, true, "cv/inpaint/orig.png") 2.615 1.289 2.03 # Clang Name of Test scalar rvv rvv vs scalar (x-factor) detect::Fast_Params::(20, 2, false, "cv/cameracalibration/chess9.png") 23.424 35.072 0.67 detect::Fast_Params::(20, 2, false, "cv/inpaint/orig.png") 5.284 10.107 0.52 detect::Fast_Params::(20, 2, true, "cv/cameracalibration/chess9.png") 26.487 35.978 0.74 detect::Fast_Params::(20, 2, true, "cv/inpaint/orig.png") 7.146 10.612 0.67 detect::Fast_Params::(30, 2, false, 
"cv/cameracalibration/chess9.png") 21.155 32.858 0.64 detect::Fast_Params::(30, 2, false, "cv/inpaint/orig.png") 4.101 7.153 0.57 detect::Fast_Params::(30, 2, true, "cv/cameracalibration/chess9.png") 23.321 33.505 0.70 detect::Fast_Params::(30, 2, true, "cv/inpaint/orig.png") 5.106 7.415 0.69 detect::Fast_Params::(100, 2, false, "cv/cameracalibration/chess9.png") 15.597 8.792 1.77 detect::Fast_Params::(100, 2, false, "cv/inpaint/orig.png") 2.922 1.228 2.38 detect::Fast_Params::(100, 2, true, "cv/cameracalibration/chess9.png") 15.626 8.817 1.77 detect::Fast_Params::(100, 2, true, "cv/inpaint/orig.png") 2.963 1.240 2.39 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-12-06 12:19:50 +01:00 · 2025-08-11 18:03:34 +08:00 · 2025-08-11 18:03:34 +08:00 · 75d9ac3964
commit 75d9ac3964
parent a783a1e2d8
8 changed files with 383 additions and 5 deletions
--- a/hal/riscv-rvv/CMakeLists.txt
+++ b/hal/riscv-rvv/CMakeLists.txt
@ -17,7 +17,8 @@ endif()
 target_include_directories(${HAL_LIB_NAME} PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_SOURCE_DIR}/modules/core/include
-  ${CMAKE_SOURCE_DIR}/modules/imgproc/include) #   ${CMAKE_SOURCE_DIR}/modules/features2d/include
+  ${CMAKE_SOURCE_DIR}/modules/imgproc/include
+  ${CMAKE_SOURCE_DIR}/modules/features2d/include)

 set(RVV_HAL_FOUND TRUE CACHE INTERNAL "")
 set(RVV_HAL_VERSION "0.0.1" CACHE INTERNAL "")
--- a/hal/riscv-rvv/include/features2d.hpp
+++ b/hal/riscv-rvv/include/features2d.hpp
@ -0,0 +1,26 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_RVV_HAL_FEATURES2D_HPP
+#define OPENCV_RVV_HAL_FEATURES2D_HPP
+
+struct cvhalFilter2D;
+
+namespace cv { namespace rvv_hal { namespace features2d {
+
+#if CV_HAL_RVV_1P0_ENABLED
+
+/**
+ * @brief RVV HAL entry point for FAST corner detection; bound to cv_hal_FASTv2 below.
+ *
+ * @param src_data           Pointer to the source image data
+ *                           (presumably 8-bit single-channel -- confirm against the caller).
+ * @param src_step           Source image step (row stride) in bytes.
+ * @param width              Source image width in pixels.
+ * @param height             Source image height in pixels.
+ * @param keypoints_data     In/out: address of the pointer to the keypoint buffer. The pointer
+ *                           is replaced with the result of realloc_func when more keypoints
+ *                           are found than the buffer can hold.
+ * @param keypoints_count    In: capacity of *keypoints_data in keypoints.
+ *                           Out: number of keypoints written.
+ * @param threshold          Intensity difference threshold between the center pixel and the
+ *                           pixels on the surrounding circle.
+ * @param nonmax_suppression If true, non-maximum suppression is applied to the detected corners.
+ * @param detector_type      FAST variant; only CV_HAL_TYPE_9_16 is implemented, the 5_8 and
+ *                           7_12 variants report CV_HAL_ERROR_NOT_IMPLEMENTED.
+ * @param realloc_func       realloc-compatible function used to grow *keypoints_data.
+ * @return CV_HAL_ERROR_OK on success, otherwise a CV_HAL_ERROR_* code.
+ */
+int FAST(const uchar* src_data, size_t src_step, int width, int height,
+          void** keypoints_data, size_t* keypoints_count,
+          int threshold, bool nonmax_suppression, int detector_type, void* (*realloc_func)(void*, size_t));
+
+#undef cv_hal_FASTv2
+#define cv_hal_FASTv2 cv::rvv_hal::features2d::FAST
+
+#endif // CV_HAL_RVV_1P0_ENABLED
+
+
+}}} // cv::rvv_hal::features2d
+
+#endif // OPENCV_RVV_HAL_FEATURES2D_HPP
--- a/hal/riscv-rvv/rvv_hal.hpp
+++ b/hal/riscv-rvv/rvv_hal.hpp
@ -27,5 +27,6 @@
 #include "include/types.hpp"
 #include "include/core.hpp"
 #include "include/imgproc.hpp"
+#include "include/features2d.hpp"

 #endif // OPENCV_HAL_RVV_HPP_INCLUDED
--- a/hal/riscv-rvv/src/features2d/common.hpp
+++ b/hal/riscv-rvv/src/features2d/common.hpp
@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
+// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
+// Third party copyrights are property of their respective owners.
+
+#ifndef OPENCV_HAL_RVV_FEATURES2D_COMMON_HPP_INCLUDED
+#define OPENCV_HAL_RVV_FEATURES2D_COMMON_HPP_INCLUDED
+
+#include <riscv_vector.h>
+#include "opencv2/features2d/hal/interface.h"
+
+namespace cv { namespace rvv_hal { namespace features2d { namespace common {
+
+#if CV_HAL_RVV_1P0_ENABLED
+
+#endif // CV_HAL_RVV_1P0_ENABLED
+
+}}}} // cv::rvv_hal::features2d::common
+
+#endif // OPENCV_HAL_RVV_FEATURES2D_COMMON_HPP_INCLUDED
--- a/hal/riscv-rvv/src/features2d/fast.cpp
+++ b/hal/riscv-rvv/src/features2d/fast.cpp
@ -0,0 +1,256 @@
+#include "rvv_hal.hpp"
+#include "common.hpp"
+#include <cfloat>
+
+namespace cv { namespace rvv_hal { namespace features2d {
+
+/**
+ * Computes the FAST-9/16 corner score of the pixel at `ptr`.
+ *
+ * d[k] holds the signed difference between the center pixel and the k-th ring
+ * pixel; the ring is stored with its first K + 1 entries repeated so that every
+ * 9-pixel arc is a contiguous run of d. For each start lane the code takes the
+ * minimum (ak0) and maximum (bk0) of the run d[k..k+8]; the score is the largest
+ * "arc minimum" or negated "arc maximum", minus one.
+ *
+ * @param ptr   Pointer to the center pixel.
+ * @param pixel Offsets (relative to ptr) of the N = 25 ring samples.
+ * @return The corner score truncated to 8 bits.
+ */
+static inline uint8_t cornerScore(const uint8_t* ptr, const int* pixel)
+{
+    constexpr int K = 8, N = 16 + K + 1;
+    int v = ptr[0];
+    int16_t d[32] = {0};
+    for (int k = 0; k < N; k++)
+        d[k] = (int16_t)(v - ptr[pixel[k]]);
+    auto vlenb = __riscv_vlenb();
+    // LMUL is picked from the vector register size so that a single register
+    // group can hold all N 16-bit lanes.
+    switch (vlenb) {
+        /* slide1down shifts zeros in at the top, so runs that extend past lane */ \
+        /* N - 1 can only contribute values <= 0 to the final reduction.        */
+        #define CV_RVV_HAL_FAST_CORNERSCORE16_CASE(lmul) \
+            size_t vl = __riscv_vsetvl_e16m##lmul(N); \
+            vint16m##lmul##_t vd = __riscv_vle16_v_i16m##lmul(d, vl); \
+            vint16m##lmul##_t q0 = __riscv_vmv_v_x_i16m##lmul((int16_t)(-1000), vl); \
+            vint16m##lmul##_t q1 = __riscv_vmv_v_x_i16m##lmul((int16_t)(1000), vl); \
+            vint16m##lmul##_t vds = vd, ak0 = vd, bk0 = vd; \
+            for (int i = 0; i < K; i++) { \
+                vds = __riscv_vslide1down(vds, 0, vl); \
+                ak0 = __riscv_vmin(ak0, vds, vl); \
+                bk0 = __riscv_vmax(bk0, vds, vl); \
+            } \
+            q0 = __riscv_vmax(q0, __riscv_vmin(ak0, vd, vl), vl); \
+            q1 = __riscv_vmin(q1, __riscv_vmax(bk0, vd, vl), vl); \
+            vds = __riscv_vslide1down(vds, 0, vl); \
+            q0 = __riscv_vmax(q0, __riscv_vmin(ak0, vds, vl), vl); \
+            q1 = __riscv_vmin(q1, __riscv_vmax(bk0, vds, vl), vl); \
+            q0 = __riscv_vmax(q0, __riscv_vrsub(q1, 0, vl), vl); \
+            return (uint8_t)(__riscv_vmv_x(__riscv_vredmax(q0, __riscv_vmv_s_x_i16m1(0, vl), vl)) - 1);
+        case 16: { // 128-bit
+            CV_RVV_HAL_FAST_CORNERSCORE16_CASE(4)
+        } break;
+        case 32: { // 256-bit
+            CV_RVV_HAL_FAST_CORNERSCORE16_CASE(2)
+        } break;
+        default: { // >=512-bit
+            CV_RVV_HAL_FAST_CORNERSCORE16_CASE(1)
+        }
+    }
+    // Keep the helper macro local to this function.
+    #undef CV_RVV_HAL_FAST_CORNERSCORE16_CASE
+}
+
+
+/**
+ * FAST-9/16 corner detector vectorized with RVV.
+ *
+ * Rows are processed top to bottom. For every row the candidate corners and
+ * (when non-maximum suppression is requested) their scores are stored in a
+ * rolling set of three row buffers; keypoints for row i - 1 are emitted while
+ * row i is being processed, once the scores of both neighbouring rows are known.
+ *
+ * @param src_data           Pointer to the source image data (read as 8-bit pixels).
+ * @param src_step           Source image step (row stride) in bytes.
+ * @param width              Source image width in pixels.
+ * @param height             Source image height in pixels.
+ * @param keypoints          Output vector; detected keypoints are appended to it.
+ * @param threshold          Intensity difference threshold; clamped to [0, 255].
+ * @param nonmax_suppression If true, only local score maxima in a 3x3 neighbourhood are kept.
+ * @return CV_HAL_ERROR_OK.
+ */
+inline int fast_16(const uchar* src_data, size_t src_step,
+                   int width, int height,
+                   std::vector<cvhalKeyPoint> &keypoints,
+                   int threshold, bool nonmax_suppression)
+{
+
+    constexpr int patternSize = 16;
+    constexpr int K = patternSize/2, N = patternSize + K + 1;
+    constexpr int quarterPatternSize = patternSize/4;
+
+    // The comparisons below work on 8-bit lanes: clamp the threshold instead of
+    // letting an out-of-range value be truncated to its low 8 bits.
+    const int thr = threshold < 0 ? 0 : (threshold > 255 ? 255 : threshold);
+
+    int i, j, k;
+    // Offsets of the 16 ring pixels relative to the center pixel; the first
+    // K + 1 entries are repeated at the end so every 9-pixel arc is contiguous.
+    int pixel[N] = {0};
+    pixel[0] = 0 + (int)src_step * 3;
+    pixel[1] = 1 + (int)src_step * 3;
+    pixel[2] = 2 + (int)src_step * 2;
+    pixel[3] = 3 + (int)src_step * 1;
+    pixel[4] = 3 + (int)src_step * 0;
+    pixel[5] = 3 + (int)src_step * -1;
+    pixel[6] = 2 + (int)src_step * -2;
+    pixel[7] = 1 + (int)src_step * -3;
+    pixel[8] = 0 + (int)src_step * -3;
+    pixel[9] = -1 + (int)src_step * -3;
+    pixel[10] = -2 + (int)src_step * -2;
+    pixel[11] = -3 + (int)src_step * -1;
+    pixel[12] = -3 + (int)src_step * 0;
+    pixel[13] = -3 + (int)src_step * 1;
+    pixel[14] = -2 + (int)src_step * 2;
+    pixel[15] = -1 + (int)src_step * 3;
+    for (k = 16; k < N; k++)
+    {
+        pixel[k] = pixel[k - 16];
+    }
+
+    // One allocation backs three score rows (buf) followed by three corner
+    // position lists (cpbuf); slot [-1] of each list stores its corner count.
+    std::vector<uchar> _buf((width+16)*3*(sizeof(ptrdiff_t) + sizeof(uchar)) + 128);
+    uchar* buf[3];
+    buf[0] = &_buf[0]; buf[1] = buf[0] + width; buf[2] = buf[1] + width;
+    ptrdiff_t* cpbuf[3];
+    cpbuf[0] = (ptrdiff_t*)alignPtr(buf[2] + width, sizeof(ptrdiff_t)) + 1;
+    cpbuf[1] = cpbuf[0] + width + 1;
+    cpbuf[2] = cpbuf[1] + width + 1;
+    memset(buf[0], 0, width*3);
+
+    int vlmax = __riscv_vsetvlmax_e8m4();
+    vuint8m4_t v_c_delta = __riscv_vmv_v_x_u8m4(0x80, vlmax);
+    vuint8m4_t v_c_threshold = __riscv_vmv_v_x_u8m4((uint8_t)thr, vlmax);
+    vuint8m4_t v_c_k = __riscv_vmv_v_x_u8m4((uint8_t)K, vlmax);
+    vuint8m4_t v_c_zero = __riscv_vmv_v_x_u8m4(0, vlmax);
+
+    // Scratch storage for the candidate mask, one bit per lane. It is sized from
+    // vlmax so that it is large enough for any vector register length.
+    std::vector<uint8_t> maskBytes((vlmax + 7) / 8);
+    uint8_t* m = maskBytes.data();
+
+    for( i = 3; i < height - 2; i++)
+    {
+
+        const uchar* ptr = src_data + i * src_step + 3;
+        uchar* curr = buf[(i - 3)%3];
+        ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
+        memset(curr, 0, width);
+        ptrdiff_t ncorners = 0;
+
+        // The last iteration (i == height - 3) only flushes the previous row.
+        if( i < height - 3 )
+        {
+            j = 3;
+            {
+                int margin = width - 3;
+                int vl = __riscv_vsetvl_e8m4(margin - j);
+                for (; j < margin; j += vl, ptr += vl)
+                {
+                    vl = __riscv_vsetvl_e8m4(margin - j);
+                    vuint8m4_t v_pixels = __riscv_vle8_v_u8m4(ptr, vl);
+                    // pixels add threshold
+                    vuint8m4_t v_pat = __riscv_vsaddu(v_pixels, v_c_threshold, vl);
+                    // pixels sub threshold
+                    vuint8m4_t v_pst = __riscv_vssubu(v_pixels, v_c_threshold, vl);
+                    // Flipping the top bit maps unsigned order onto signed order,
+                    // so signed compares can be used from here on.
+                    vint8m4_t v0 = __riscv_vreinterpret_i8m4(__riscv_vxor(v_pat, v_c_delta, vl));
+                    vint8m4_t v1 = __riscv_vreinterpret_i8m4(__riscv_vxor(v_pst, v_c_delta, vl));
+
+                    // Quick rejection test on the four ring pixels a quarter turn apart.
+                    v_pixels = __riscv_vle8_v_u8m4(ptr + pixel[0], vl);
+                    vint8m4_t x0 = __riscv_vreinterpret_i8m4(__riscv_vsub(v_pixels, v_c_delta, vl));
+                    v_pixels = __riscv_vle8_v_u8m4(ptr + pixel[quarterPatternSize], vl);
+                    vint8m4_t x1 = __riscv_vreinterpret_i8m4(__riscv_vsub(v_pixels, v_c_delta, vl));
+                    v_pixels = __riscv_vle8_v_u8m4(ptr + pixel[2*quarterPatternSize], vl);
+                    vint8m4_t x2 = __riscv_vreinterpret_i8m4(__riscv_vsub(v_pixels, v_c_delta, vl));
+                    v_pixels = __riscv_vle8_v_u8m4(ptr + pixel[3*quarterPatternSize], vl);
+                    vint8m4_t x3 = __riscv_vreinterpret_i8m4(__riscv_vsub(v_pixels, v_c_delta, vl));
+
+                    // m0: two adjacent quarter pixels are brighter than center + threshold,
+                    // m1: two adjacent quarter pixels are darker than center - threshold.
+                    vbool2_t m0, m1;
+                    m0 = __riscv_vmand(__riscv_vmslt(v0, x0, vl), __riscv_vmslt(v0, x1, vl), vl);
+                    m1 = __riscv_vmand(__riscv_vmslt(x0, v1, vl), __riscv_vmslt(x1, v1, vl), vl);
+                    m0 = __riscv_vmor(m0, __riscv_vmand(__riscv_vmslt(v0, x1, vl), __riscv_vmslt(v0, x2, vl), vl), vl);
+                    m1 = __riscv_vmor(m1, __riscv_vmand(__riscv_vmslt(x1, v1, vl), __riscv_vmslt(x2, v1, vl), vl), vl);
+                    m0 = __riscv_vmor(m0, __riscv_vmand(__riscv_vmslt(v0, x2, vl), __riscv_vmslt(v0, x3, vl), vl), vl);
+                    m1 = __riscv_vmor(m1, __riscv_vmand(__riscv_vmslt(x2, v1, vl), __riscv_vmslt(x3, v1, vl), vl), vl);
+                    m0 = __riscv_vmor(m0, __riscv_vmand(__riscv_vmslt(v0, x3, vl), __riscv_vmslt(v0, x0, vl), vl), vl);
+                    m1 = __riscv_vmor(m1, __riscv_vmand(__riscv_vmslt(x3, v1, vl), __riscv_vmslt(x0, v1, vl), vl), vl);
+                    m0 = __riscv_vmor(m0, m1, vl);
+
+                    // No lane of this strip can be a corner: skip the full ring test.
+                    unsigned long mask_cnt = __riscv_vcpop(m0, vl);
+                    if(!mask_cnt)
+                        continue;
+
+                    // TODO: Test whether skipping to the first possible keypoint pixel
+                    // (found with __riscv_vfirst(m0, vl)) is faster. Memory access may be
+                    // expensive since the data would not be aligned.
+
+                    // c0/c1: length of the current run of brighter/darker ring pixels,
+                    // max0/max1: longest run seen so far.
+                    vuint8m4_t c0 = __riscv_vmv_v_x_u8m4(0, vl);
+                    vuint8m4_t c1 = __riscv_vmv_v_x_u8m4(0, vl);
+                    vuint8m4_t max0 = __riscv_vmv_v_x_u8m4(0, vl);
+                    vuint8m4_t max1 = __riscv_vmv_v_x_u8m4(0, vl);
+
+                    for( k = 0; k < N; k++ )
+                    {
+                        vint8m4_t x = __riscv_vreinterpret_i8m4(__riscv_vxor(__riscv_vle8_v_u8m4(ptr + pixel[k], vl), v_c_delta, vl));
+
+                        m0 = __riscv_vmslt(v0, x, vl);
+                        m1 = __riscv_vmslt(x, v1, vl);
+
+                        // Extend the run where the test passes, reset it to zero elsewhere.
+                        c0 = __riscv_vadd_mu(m0, c0, c0, (uint8_t)1, vl);
+                        c1 = __riscv_vadd_mu(m1, c1, c1, (uint8_t)1, vl);
+                        c0 = __riscv_vmerge(v_c_zero, c0, m0, vl);
+                        c1 = __riscv_vmerge(v_c_zero, c1, m1, vl);
+
+                        max0 = __riscv_vmaxu(max0, c0, vl);
+                        max1 = __riscv_vmaxu(max1, c1, vl);
+                    }
+
+                    // A corner needs a run longer than K (i.e. at least 9) ring pixels.
+                    vbool2_t v_comparek = __riscv_vmsltu(v_c_k, __riscv_vmaxu(max0, max1, vl), vl);
+                    // Mask store: writes exactly ceil(vl / 8) bytes, one bit per lane.
+                    __riscv_vsm_v_b2(m, v_comparek, vl);
+
+                    for( k = 0; k < vl; k++ )
+                    {
+                        if( (m[k / 8] >> (k % 8)) & 1 )
+                        {
+                            cornerpos[ncorners++] = j + k;
+                            if(nonmax_suppression) {
+                                curr[j + k] = (uchar)cornerScore(ptr + k, pixel);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        cornerpos[-1] = ncorners;
+
+        // The first processed row has no previous row to flush yet.
+        if( i == 3 )            continue;
+
+        const uchar* prev = buf[(i - 4 + 3)%3];
+        const uchar* pprev = buf[(i - 5 + 3)%3];
+        cornerpos = cpbuf[(i - 4 + 3)%3]; // cornerpos[-1] is used to store a value
+        ncorners = cornerpos[-1];
+        for( k = 0; k < ncorners; k++ )
+        {
+            j = cornerpos[k];
+            int score = prev[j];
+            // Keep the corner if suppression is off, or if its score is strictly
+            // greater than all eight neighbours.
+            if(!nonmax_suppression ||
+               (score > prev[j+1] && score > prev[j-1] &&
+                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
+                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
+            {
+                cvhalKeyPoint kp;
+                kp.x = (float)j;
+                kp.y = (float)(i-1);
+                kp.size = 7.f;
+                kp.angle = -1.f;
+                kp.response = (float)score;
+                kp.octave = 0; // Not used in FAST
+                kp.class_id = -1; // Not used in FAST
+                keypoints.push_back(kp);
+            }
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+/**
+ * HAL entry point for FAST corner detection.
+ *
+ * Only CV_HAL_TYPE_9_16 is implemented. Detected keypoints are copied into
+ * *keypoints_data; when the buffer is too small (or not allocated) it is grown
+ * through realloc_func first.
+ *
+ * @param src_data           Pointer to the source image data.
+ * @param src_step           Source image step (row stride) in bytes.
+ * @param width              Source image width in pixels.
+ * @param height             Source image height in pixels.
+ * @param keypoints_data     In/out: address of the pointer to the keypoint buffer.
+ * @param keypoints_count    In: capacity of *keypoints_data in keypoints.
+ *                           Out: number of keypoints written. Left untouched on failure.
+ * @param threshold          Intensity difference threshold.
+ * @param nonmax_suppression If true, non-maximum suppression is applied.
+ * @param detector_type      FAST variant (CV_HAL_TYPE_*).
+ * @param realloc_func       realloc-compatible function used to grow *keypoints_data.
+ * @return CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED for the 5_8 and 7_12
+ *         variants, CV_HAL_ERROR_UNKNOWN for an unknown type, missing output arguments
+ *         or a failed reallocation.
+ */
+int FAST(const uchar* src_data, size_t src_step,
+         int width, int height, void** keypoints_data,
+         size_t* keypoints_count, int threshold,
+         bool nonmax_suppression, int detector_type, void* (*realloc_func)(void*, size_t))
+{
+    switch(detector_type) {
+        case CV_HAL_TYPE_5_8:
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+        case CV_HAL_TYPE_7_12:
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+        case CV_HAL_TYPE_9_16: {
+            if (!keypoints_data || !keypoints_count)
+                return CV_HAL_ERROR_UNKNOWN;
+
+            std::vector<cvhalKeyPoint> keypoints;
+            const int res = fast_16(src_data, src_step, width, height, keypoints, threshold, nonmax_suppression);
+            if (res != CV_HAL_ERROR_OK)
+                return res;
+
+            const size_t found = keypoints.size();
+            const size_t bytes = sizeof(cvhalKeyPoint) * found;
+            if (found > 0) {
+                if (found > *keypoints_count || !*keypoints_data) {
+                    if (!realloc_func)
+                        return CV_HAL_ERROR_UNKNOWN;
+                    // On failure the old buffer is still valid and owned by the
+                    // caller, so neither the pointer nor the count is modified.
+                    void* grown = realloc_func(*keypoints_data, bytes);
+                    if (!grown)
+                        return CV_HAL_ERROR_UNKNOWN;
+                    *keypoints_data = grown;
+                }
+                memcpy(*keypoints_data, keypoints.data(), bytes);
+            }
+            *keypoints_count = found;
+            return CV_HAL_ERROR_OK;
+        }
+        default:
+            return CV_HAL_ERROR_UNKNOWN;
+    }
+}
+
+}}} // namespace cv::rvv_hal::features2d
--- a/modules/features2d/perf/perf_fast.cpp
+++ b/modules/features2d/perf/perf_fast.cpp
@ -0,0 +1,43 @@
+#include "perf_precomp.hpp"
+#include "perf_feature2d.hpp"
+
+namespace opencv_test
+{
+using namespace perf;
+
+typedef tuple<int, int, bool, string> Fast_Params_t;
+typedef perf::TestBaseWithParam<Fast_Params_t> Fast_Params;
+
+// Benchmarks cv::FAST() for every combination of threshold, detector type,
+// non-maximum suppression flag and input image listed below.
+PERF_TEST_P(Fast_Params, detect,
+    testing::Combine(
+        // threshold
+        testing::Values(20,30,100),
+        // detector_type (TYPE_5_8 and TYPE_7_12 are currently not benchmarked)
+        testing::Values((int)FastFeatureDetector::TYPE_9_16),
+        // nonmaxSuppression
+        testing::Bool(),
+        // input image, relative to the test data root
+        testing::Values("cv/inpaint/orig.png",
+                        "cv/cameracalibration/chess9.png")
+    ))
+{
+    const Fast_Params_t params = GetParam();
+    const int thresholdValue = get<0>(params);
+    const FastFeatureDetector::DetectorType detectorType =
+        (FastFeatureDetector::DetectorType)get<1>(params);
+    const bool useNonmaxSuppression = get<2>(params);
+    const string imagePath = getDataPath(get<3>(params));
+
+    Mat image = imread(imagePath, IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty()) << "Failed to load image: " << imagePath;
+
+    // Reused across cycles; FAST() fills it on every call.
+    vector<KeyPoint> detected;
+
+    declare.in(image);
+    TEST_CYCLE()
+    {
+        FAST(image, detected, thresholdValue, useNonmaxSuppression, detectorType);
+    }
+
+    // Timing-only test: the detected keypoints are not validated.
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace opencv_test
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@ -435,11 +435,23 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
    cv::Mat img = _img.getMat();
    CALL_HAL(fast_dense, hal_FAST, img, keypoints, threshold, nonmax_suppression, type);

-    size_t keypoints_count = 10000;
+    size_t keypoints_count = 1;
+    keypoints.clear();
+    KeyPoint* kps = (KeyPoint*)malloc(sizeof(KeyPoint) * keypoints_count);
+    int hal_ret = cv_hal_FASTv2(img.data, img.step, img.cols, img.rows, (void**)&kps,
+                                &keypoints_count, threshold, nonmax_suppression, type, realloc);
+    if (hal_ret == CV_HAL_ERROR_OK) {
+        keypoints.assign(kps, kps + keypoints_count);
+        free(kps);
+        return;
+    } else {
+        free(kps);
+        keypoints_count = 10000;
        keypoints.clear();
        keypoints.resize(keypoints_count);
        CALL_HAL(fast, cv_hal_FAST, img.data, img.step, img.cols, img.rows,
                (uchar*)(keypoints.data()), &keypoints_count, threshold, nonmax_suppression, type);
+    }

    switch(type) {
    case FastFeatureDetector::TYPE_5_8:
--- a/modules/features2d/src/hal_replacement.hpp
+++ b/modules/features2d/src/hal_replacement.hpp
@ -104,8 +104,24 @@ inline int hal_ni_FAST_NMS(const uchar* src_data, size_t src_step, uchar* dst_da
 */
 inline int hal_ni_FAST(const uchar* src_data, size_t src_step, int width, int height, uchar* keypoints_data, size_t* keypoints_count, int threshold, bool nonmax_suppression, int /*cv::FastFeatureDetector::DetectorType*/ type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

+/**
+   @brief Detects corners using the FAST algorithm, writing them to a caller-provided buffer that the implementation may grow.
+   @param src_data Source image data
+   @param src_step Source image step
+   @param width Source image width
+   @param height Source image height
+   @param keypoints_data In/out: address of the pointer to the keypoints buffer; the pointer may be replaced by the result of realloc_func
+   @param keypoints_count In: capacity of the keypoints buffer (in keypoints); out: number of detected keypoints
+   @param threshold Threshold for keypoint
+   @param nonmax_suppression Indicates whether non-maximum suppression is applied.
+   @param type FAST type
+   @param realloc_func realloc-compatible function used to grow the keypoints buffer
+   @return This default stub always returns CV_HAL_ERROR_NOT_IMPLEMENTED.
+*/
+inline int hal_ni_FASTv2(const uchar* src_data, size_t src_step, int width, int height, void** keypoints_data, size_t* keypoints_count, int threshold, bool nonmax_suppression, int /*cv::FastFeatureDetector::DetectorType*/ type, void* (*realloc_func)(void*, size_t)) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 //! @cond IGNORED
 #define cv_hal_FAST hal_ni_FAST
+#define cv_hal_FASTv2 hal_ni_FASTv2
 //! @endcond

 //! @}