Merge pull request #27060 from YooLc:hal-rvv-integral

[hal_rvv] Add cv::integral implementation and more types of input for test #27060

This patch introduces an RVV-optimized implementation of `cv::integral()` in hal_rvv, along with performance and accuracy tests for all valid input/output type combinations specified in `modules/imgproc/src/hal_replacement.hpp`:
2a8d4b8e43/modules/imgproc/src/hal_replacement.hpp (L960-L974)

The vectorized prefix sum algorithm follows the approach described in [Prefix Sum with SIMD - Algorithmica](https://en.algorithmica.org/hpc/algorithms/prefix/).

I intentionally omitted support for the following cases by returning `CV_HAL_ERROR_NOT_IMPLEMENTED`, as they are harder to implement or show limited performance gains:
1. **Tilted Sum**: The data access pattern for tilted sums requires multi-row operations, making effective vectorization difficult.
2. **3-channel images (`cn == 3`)**: Current implementation requires `VLEN/SEW` (a.k.a. number of elements in a vector register) to be a multiple of channel count, which 3-channel formats typically cannot satisfy.
    - Support for 1, 2 and 4 channel images is implemented
3. **Small images (`!(width >> 8 || height >> 8)`)**: The scalar implementation demonstrates better performance for images with limited dimensions.
    - This is the same as `3rdparty/ndsrvp/src/integral.cpp` 09c71aed14/3rdparty/ndsrvp/src/integral.cpp (L24-L26)

Test configuration:

- Platform: SpacemiT Muse Pi (K1 @ 1.60 GHz)
- Toolchain: GCC 14.2.0
- `integral_sqsum_full` test is disabled by default, so `--gtest_also_run_disabled_tests` is needed

Test results:

```plaintext
Geometric mean (ms)

                                     Name of Test                                       imgproc-gcc-scalar imgproc-gcc-hal  imgproc-gcc-hal  
                                                                                                                                   vs        
                                                                                                                           imgproc-gcc-scalar
                                                                                                                               (x-factor)      
integral::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32F)                                   1.973             1.415             1.39       
integral::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32S)                                   1.343             1.351             0.99       
integral::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_64F)                                   2.021             2.756             0.73       
integral::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32F)                                   4.695             2.874             1.63       
integral::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32S)                                   4.028             2.801             1.44       
integral::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_64F)                                   5.965             4.926             1.21       
integral::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32F)                                   9.970             4.440             2.25       
integral::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32S)                                   7.934             4.244             1.87       
integral::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_64F)                                   14.696            8.431             1.74       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32F)                                  5.949             4.108             1.45       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32S)                                  4.064             4.080             1.00       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_64F)                                  6.137             7.975             0.77       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32F)                                  13.896            8.721             1.59       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32S)                                  10.948            8.513             1.29       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_64F)                                  18.046           15.234             1.18       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32F)                                  35.105           13.778             2.55       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32S)                                  27.135           13.417             2.02       
integral::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_64F)                                  43.477           25.616             1.70       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32F)                                 13.386            9.281             1.44       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32S)                                 9.159             9.194             1.00       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_64F)                                 13.776           17.836             0.77       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32F)                                 31.943           19.435             1.64       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32S)                                 24.747           18.946             1.31       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_64F)                                 35.925           33.943             1.06       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32F)                                 66.493           29.692             2.24       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32S)                                 54.737           28.250             1.94       
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_64F)                                 91.880           57.495             1.60            
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32F)                             4.384             4.016             1.09       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32S)                             3.676             3.960             0.93       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_64F)                             5.620             5.224             1.08       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32F)                             9.971             7.696             1.30       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32S)                             8.934             7.632             1.17       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_64F)                             9.927             9.759             1.02       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32F)                             21.556           12.288             1.75       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32S)                             21.261           12.089             1.76       
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_64F)                             23.989           16.278             1.47       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32F)                            15.232           11.752             1.30       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32S)                            12.976           11.721             1.11       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_64F)                            16.450           15.627             1.05       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32F)                            25.932           23.243             1.12       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32S)                            24.750           23.019             1.08       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_64F)                            28.228           29.605             0.95       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32F)                            61.665           37.477             1.65       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32S)                            61.536           37.126             1.66       
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_64F)                            73.989           48.994             1.51       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32F)                           49.640           26.529             1.87       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32S)                           35.869           26.417             1.36       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_64F)                           34.378           35.056             0.98       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32F)                           82.138           52.661             1.56       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32S)                           54.644           52.089             1.05       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_64F)                           75.073           66.670             1.13       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32F)                          143.283           83.943             1.71       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32S)                          156.851           82.378             1.90       
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_64F)                          521.594           111.375            4.68            
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32F_32F))          3.529             2.787             1.27       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32F_64F))          4.396             3.998             1.10       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32S_32F))          3.229             2.774             1.16       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32S_32S))          2.945             2.780             1.06       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32S_64F))          3.857             3.995             0.97       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_64F_64F))          5.872             5.228             1.12       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16UC1, DEPTH_64F_64F))         6.075             5.277             1.15       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16SC1, DEPTH_64F_64F))         5.680             5.296             1.07       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC1, DEPTH_32F_32F))         3.355             2.896             1.16       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC1, DEPTH_32F_64F))         4.183             4.000             1.05       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC1, DEPTH_64F_64F))         6.237             5.143             1.21       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (64FC1, DEPTH_64F_64F))         4.753             4.783             0.99       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32F_32F))          8.021             5.793             1.38       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32F_64F))          9.963             7.704             1.29       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32S_32F))          7.864             5.720             1.37       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32S_32S))          7.141             5.699             1.25       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32S_64F))          9.228             7.646             1.21       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_64F_64F))          9.940             9.759             1.02       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16UC2, DEPTH_64F_64F))         10.606            9.716             1.09       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16SC2, DEPTH_64F_64F))         9.933             9.751             1.02       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC2, DEPTH_32F_32F))         7.986             5.962             1.34       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC2, DEPTH_32F_64F))         9.243             7.598             1.22       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC2, DEPTH_64F_64F))         10.573            9.425             1.12       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (64FC2, DEPTH_64F_64F))         11.029            8.977             1.23       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32F_32F))          17.236            8.881             1.94       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32F_64F))          20.905           12.322             1.70       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32S_32F))          16.011            8.666             1.85       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32S_32S))          15.932            8.507             1.87       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32S_64F))          20.713           12.115             1.71       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_64F_64F))          23.953           16.284             1.47       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16UC4, DEPTH_64F_64F))         25.127           16.341             1.54       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16SC4, DEPTH_64F_64F))         24.950           16.441             1.52       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC4, DEPTH_32F_32F))         17.261            8.906             1.94       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC4, DEPTH_32F_64F))         21.944           12.073             1.82       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC4, DEPTH_64F_64F))         25.921           15.539             1.67       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (64FC4, DEPTH_64F_64F))         27.938           14.824             1.88       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32F_32F))         11.156            8.260             1.35       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32F_64F))         14.777           11.869             1.24       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32S_32F))         9.693             8.221             1.18       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32S_32S))         9.023             8.256             1.09       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32S_64F))         13.276           11.821             1.12       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_64F_64F))         15.406           15.618             0.99       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16UC1, DEPTH_64F_64F))        16.799           15.749             1.07       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16SC1, DEPTH_64F_64F))        15.054           15.806             0.95       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC1, DEPTH_32F_32F))        10.055            7.999             1.26       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC1, DEPTH_32F_64F))        13.506           11.253             1.20       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC1, DEPTH_64F_64F))        14.952           15.021             1.00       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (64FC1, DEPTH_64F_64F))        13.761           14.002             0.98       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32F_32F))         22.677           17.330             1.31       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32F_64F))         26.283           23.237             1.13       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32S_32F))         20.126           17.118             1.18       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32S_32S))         19.337           17.041             1.13       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32S_64F))         24.973           23.004             1.09       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_64F_64F))         29.959           29.585             1.01       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16UC2, DEPTH_64F_64F))        33.598           29.599             1.14       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16SC2, DEPTH_64F_64F))        46.213           29.741             1.55       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC2, DEPTH_32F_32F))        33.077           17.556             1.88       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC2, DEPTH_32F_64F))        33.960           22.991             1.48       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC2, DEPTH_64F_64F))        41.792           28.803             1.45       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (64FC2, DEPTH_64F_64F))        34.660           28.532             1.21       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32F_32F))         52.989           27.659             1.92       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32F_64F))         62.418           37.515             1.66       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32S_32F))         50.902           27.310             1.86       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32S_32S))         47.301           27.019             1.75       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32S_64F))         61.982           37.140             1.67       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_64F_64F))         79.403           49.041             1.62       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16UC4, DEPTH_64F_64F))        86.550           49.180             1.76       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16SC4, DEPTH_64F_64F))        85.715           49.468             1.73       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC4, DEPTH_32F_32F))        63.932           28.019             2.28       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC4, DEPTH_32F_64F))        68.180           36.858             1.85       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC4, DEPTH_64F_64F))        83.063           46.483             1.79       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (64FC4, DEPTH_64F_64F))        91.990           44.545             2.07       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32F_32F))        25.503           18.609             1.37       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32F_64F))        29.544           26.635             1.11       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32S_32F))        22.581           18.514             1.22       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32S_32S))        20.860           18.547             1.12       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32S_64F))        26.046           26.373             0.99       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_64F_64F))        34.831           34.997             1.00       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16UC1, DEPTH_64F_64F))       36.428           35.214             1.03       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16SC1, DEPTH_64F_64F))       32.435           35.314             0.92       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC1, DEPTH_32F_32F))       22.548           18.845             1.20       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC1, DEPTH_32F_64F))       28.589           25.790             1.11       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC1, DEPTH_64F_64F))       32.625           33.791             0.97       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (64FC1, DEPTH_64F_64F))       30.158           31.889             0.95       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32F_32F))        53.374           38.938             1.37       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32F_64F))        73.892           52.747             1.40       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32S_32F))        47.392           38.572             1.23       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32S_32S))        45.638           38.225             1.19       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32S_64F))        69.966           52.156             1.34       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_64F_64F))        68.560           66.963             1.02       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16UC2, DEPTH_64F_64F))       71.487           65.420             1.09       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16SC2, DEPTH_64F_64F))       68.127           65.718             1.04       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC2, DEPTH_32F_32F))       72.967           39.987             1.82       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC2, DEPTH_32F_64F))       63.933           51.408             1.24       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC2, DEPTH_64F_64F))       73.334           63.354             1.16       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (64FC2, DEPTH_64F_64F))       80.983           60.778             1.33       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32F_32F))       116.981           59.908             1.95       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32F_64F))       155.085           83.974             1.85       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32S_32F))       109.567           58.525             1.87       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32S_32S))       105.457           57.124             1.85       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32S_64F))       157.325           82.485             1.91       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_64F_64F))       265.776           111.577            2.38       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16UC4, DEPTH_64F_64F))      585.218           110.583            5.29       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16SC4, DEPTH_64F_64F))      585.418           111.302            5.26       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC4, DEPTH_32F_32F))      126.456           60.415             2.09       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC4, DEPTH_32F_64F))      169.278           81.460             2.08       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC4, DEPTH_64F_64F))      281.256           104.732            2.69       
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (64FC4, DEPTH_64F_64F))      620.885           99.953             6.21       
```

The vectorized implementation shows progressively better acceleration for larger image sizes and higher channel counts, achieving up to 6.21× speedup for 64FC4 (1920×1080) inputs with `DEPTH_64F_64F` configuration.

This is my first time proposing a patch for the OpenCV Project 🥹; if there's anything that can be improved, please tell me.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
YooLc 2025-04-21 14:50:13 +08:00 committed by GitHub
parent 11e46cda86
commit f20facc60a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 510 additions and 20 deletions

View File

@ -57,6 +57,7 @@
#include "hal_rvv_1p0/thresh.hpp" // imgproc
#include "hal_rvv_1p0/histogram.hpp" // imgproc
#include "hal_rvv_1p0/resize.hpp" // imgproc
#include "hal_rvv_1p0/integral.hpp" // imgproc
#endif
#endif

View File

@ -0,0 +1,173 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED
#define OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED
#include <riscv_vector.h>
#include "types.hpp"
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_integral
#define cv_hal_integral cv::cv_hal_rvv::integral
/**
 * @brief Tile the last @p n active elements of a vector across the whole vector.
 *
 * The trailing @p n elements of @p vs (positions [vl - n, vl)) are moved to the
 * front and then replicated with period @p n over the first @p vl lanes.
 *
 * @tparam vec_t Vector wrapper type providing VecType, vslidedown, vslideup,
 *               vmv and vmv_x.
 * @param vs Source vector.
 * @param n  Number of trailing elements to repeat.
 * @param vl Number of active elements.
 * @return Vector whose first @p vl lanes repeat the last @p n elements of @p vs.
 */
template <typename vec_t>
inline typename vec_t::VecType repeat_last_n(typename vec_t::VecType vs, int n, size_t vl) {
    // Move the trailing n elements down to lanes [0, n).
    auto tail = vec_t::vslidedown(vs, vl - n, vl);

    if (n != 1) {
        // Double the populated prefix on every pass: n, 2n, 4n, ... lanes.
        size_t filled = n;
        while (filled < vl) {
            tail = vec_t::vslideup(tail, tail, filled, vl);
            filled <<= 1;
        }
        return tail;
    }

    // A single element only needs a scalar broadcast.
    return vec_t::vmv(vec_t::vmv_x(tail), vl);
}
/**
 * @brief Accumulate one integral (or squared-integral) image, row by row.
 *
 * For each source row y, output row y + 1 is produced as: in-row prefix sum of
 * the (optionally squared) source values, plus output row y. The first @p cn
 * accumulators of every produced row are set to zero. Output row 0 is only
 * read here, never written, so the caller must have initialised it.
 *
 * @tparam data_vec_t Vector wrapper for the source element type.
 * @tparam acc_vec_t  Vector wrapper for the accumulator (output) element type.
 * @tparam sqsum      When true, each source value is squared before accumulation.
 * @param src_data Source image data.
 * @param src_step Source row stride in bytes.
 * @param sum_data Output image data.
 * @param sum_step Output row stride in bytes.
 * @param width    Number of source elements per row (the visible caller passes
 *                 pixel width multiplied by @p cn).
 * @param height   Number of source rows.
 * @param cn       Number of interleaved channels.
 * @return CV_HAL_ERROR_OK.
 */
template <typename data_vec_t, typename acc_vec_t, bool sqsum = false>
inline int integral_inner(const uchar* src_data, size_t src_step,
                          uchar* sum_data, size_t sum_step,
                          int width, int height, int cn) {
    using data_t = typename data_vec_t::ElemType;
    using acc_t = typename acc_vec_t::ElemType;

    const size_t row_len = static_cast<size_t>(width);

    for (int row = 0; row < height; ++row) {
        const data_t* in_row = reinterpret_cast<const data_t*>(src_data + src_step * row);
        acc_t* out_above = reinterpret_cast<acc_t*>(sum_data + sum_step * row);
        acc_t* out_row = reinterpret_cast<acc_t*>(sum_data + sum_step * (row + 1));

        // Leading border: one zero accumulator per channel.
        memset(out_row, 0, cn * sizeof(acc_t));

        size_t vl = acc_vec_t::setvlmax();
        // Running total carried over from the chunks already processed in this row.
        auto carry = acc_vec_t::vmv(0, vl);

        size_t col = 0;
        while (col < row_len) {
            vl = acc_vec_t::setvl(row_len - col);
            __builtin_prefetch(&in_row[col + vl], 0);
            __builtin_prefetch(&out_above[col + cn], 0);

            auto pixels = data_vec_t::vload(&in_row[col], vl);
            auto scan = acc_vec_t::cast(pixels, vl);
            if (sqsum) { // squared-sum variant
                scan = acc_vec_t::vmul(scan, scan, vl);
            }

            // Log-step prefix sum inside the chunk: the slide distance starts at
            // cn and doubles, so lane i only ever accumulates lanes i - k * cn.
            auto zeros = acc_vec_t::vmv(0, vl);
            for (size_t stride = cn; stride < vl; stride <<= 1) {
                auto shifted = acc_vec_t::vslideup(zeros, scan, stride, vl);
                scan = acc_vec_t::vadd(scan, shifted, vl);
            }

            // Chunk totals must be captured before the row above and the carry
            // are mixed into the scan result.
            auto chunk_total = repeat_last_n<acc_vec_t>(scan, cn, vl);

            auto above = acc_vec_t::vload(&out_above[col + cn], vl);
            scan = acc_vec_t::vadd(scan, above, vl);
            scan = acc_vec_t::vadd(scan, carry, vl);
            acc_vec_t::vstore(&out_row[col + cn], scan, vl);

            carry = acc_vec_t::vadd(carry, chunk_total, vl);
            col += vl;
        }
    }

    return CV_HAL_ERROR_OK;
}
/**
 * @brief Compute the integral image and, when requested, the squared integral image.
 *
 * Zeroes the first output row of each requested image and then delegates the
 * accumulation to integral_inner.
 *
 * @tparam data_vec_t   Vector wrapper for the source element type.
 * @tparam acc_vec_t    Vector wrapper for the sum element type.
 * @tparam sq_acc_vec_t Vector wrapper for the squared-sum element type.
 * @param src_data   Source image data.
 * @param src_step   Source row stride in bytes.
 * @param sum_data   Sum image data.
 * @param sum_step   Sum image row stride in bytes.
 * @param sqsum_data Squared-sum image data, or nullptr to skip that pass.
 * @param sqsum_step Squared-sum image row stride in bytes.
 * @param width      Number of source elements per row (pixel width times @p cn).
 * @param height     Number of source rows.
 * @param cn         Number of interleaved channels.
 * @return Status of the last pass that ran (CV_HAL_ERROR_OK on success).
 */
template <typename data_vec_t, typename acc_vec_t, typename sq_acc_vec_t>
inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, size_t sum_step, uchar* sqsum_data, size_t sqsum_step, int width, int height, int cn) {
    // integral_inner reads and writes exactly width + cn accumulators per output
    // row. Clear only that span: zeroing a full row stride would also overwrite
    // the padding after the row, which belongs to the parent matrix when the
    // destination is a view into a larger buffer.
    const size_t row_elems = static_cast<size_t>(width) + static_cast<size_t>(cn);

    memset(sum_data, 0, row_elems * sizeof(typename acc_vec_t::ElemType));
    const int result = integral_inner<data_vec_t, acc_vec_t, false>(src_data, src_step, sum_data, sum_step, width, height, cn);
    if (result != CV_HAL_ERROR_OK || sqsum_data == nullptr) {
        return result;
    }

    memset(sqsum_data, 0, row_elems * sizeof(typename sq_acc_vec_t::ElemType));
    return integral_inner<data_vec_t, sq_acc_vec_t, true>(src_data, src_step, sqsum_data, sqsum_step, width, height, cn);
}
/**
@brief Calculate integral image
@param depth Depth of source image
@param sdepth Depth of sum image
@param sqdepth Depth of square sum image
@param src_data Source image data
@param src_step Source image step
@param sum_data Sum image data
@param sum_step Sum image step
@param sqsum_data Square sum image data
@param sqsum_step Square sum image step
@param tilted_data Tilted sum image data
@param tilted_step Tilted sum image step
@param width Source image width
@param height Source image height
@param cn Number of channels
@note Following combinations of image depths are used:
Source | Sum | Square sum
-------|-----|-----------
CV_8U | CV_32S | CV_64F
CV_8U | CV_32S | CV_32F
CV_8U | CV_32S | CV_32S
CV_8U | CV_32F | CV_64F
CV_8U | CV_32F | CV_32F
CV_8U | CV_64F | CV_64F
CV_16U | CV_64F | CV_64F
CV_16S | CV_64F | CV_64F
CV_32F | CV_32F | CV_64F
CV_32F | CV_32F | CV_32F
CV_32F | CV_64F | CV_64F
CV_64F | CV_64F | CV_64F
*/
inline int integral(int depth, int sdepth, int sqdepth,
const uchar* src_data, size_t src_step,
uchar* sum_data, size_t sum_step,
uchar* sqsum_data, size_t sqsum_step,
uchar* tilted_data, [[maybe_unused]] size_t tilted_step,
int width, int height, int cn) {
// tilted sum and cn == 3 cases are not supported
if (tilted_data || cn == 3) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
// Skip images that are too small
if (!(width >> 8 || height >> 8)) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
int result = CV_HAL_ERROR_NOT_IMPLEMENTED;
width *= cn;
if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
result = integral<RVV<uint8_t, LMUL_1>, RVV<int32_t, LMUL_4>, RVV<double, LMUL_8>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32F )
result = integral<RVV<uint8_t, LMUL_1>, RVV<int32_t, LMUL_4>, RVV<float, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S )
result = integral<RVV<uint8_t, LMUL_1>, RVV<int32_t, LMUL_4>, RVV<int32_t, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F )
result = integral<RVV<uint8_t, LMUL_1>, RVV<float, LMUL_4>, RVV<double, LMUL_8>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_32F )
result = integral<RVV<uint8_t, LMUL_1>, RVV<float, LMUL_4>, RVV<float, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F )
result = integral<RVV<uint8_t, LMUL_1>, RVV<double, LMUL_8>, RVV<double, LMUL_8>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F )
result = integral<RVV<uint16_t, LMUL_1>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F )
result = integral<RVV<int16_t, LMUL_1>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F )
result = integral<RVV<float, LMUL_2>, RVV<float, LMUL_2>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F )
result = integral<RVV<float, LMUL_4>, RVV<float, LMUL_4>, RVV<float, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_32F && sdepth == CV_64F && sqdepth == CV_64F )
result = integral<RVV<float, LMUL_2>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F ) {
result = integral<RVV<double, LMUL_4>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
}
return result;
}
}}
#endif

View File

@ -153,6 +153,12 @@ static inline VecType vmv(ElemType a, size_t vl) {
static inline VecType vmv_s(ElemType a, size_t vl) { \
return __riscv_v##IS_F##mv_s_##X_OR_F##_##TYPE##LMUL(a, vl); \
} \
static inline VecType vslideup(VecType vs2, VecType vs1, size_t n, size_t vl) { \
return __riscv_vslideup_vx_##TYPE##LMUL(vs2, vs1, n, vl); \
} \
static inline VecType vslidedown(VecType vs, size_t n, size_t vl) { \
return __riscv_vslidedown_vx_##TYPE##LMUL(vs, n, vl); \
} \
HAL_RVV_SIZE_RELATED_CUSTOM(EEW, TYPE, LMUL)
#define HAL_RVV_SIZE_UNRELATED(S_OR_F, X_OR_F, IS_U, IS_F, IS_O) \
@ -380,7 +386,7 @@ template <> struct RVV_ToFloatHelper<8> {using type = double;};
template <> \
inline ONE::VecType ONE::cast(TWO::VecType v, size_t vl) { return __riscv_vncvt_x(v, vl); } \
template <> \
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vwcvt_x(v, vl); }
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vsext_vf2(v, vl); }
HAL_RVV_CVT(RVV_I8M4, RVV_I16M8)
HAL_RVV_CVT(RVV_I8M2, RVV_I16M4)
@ -406,7 +412,7 @@ HAL_RVV_CVT(RVV_I32MF2, RVV_I64M1)
template <> \
inline ONE::VecType ONE::cast(TWO::VecType v, size_t vl) { return __riscv_vncvt_x(v, vl); } \
template <> \
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vwcvtu_x(v, vl); }
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vzext_vf2(v, vl); }
HAL_RVV_CVT(RVV_U8M4, RVV_U16M8)
HAL_RVV_CVT(RVV_U8M2, RVV_U16M4)
@ -592,6 +598,277 @@ HAL_RVV_CVT( uint8_t, int8_t, u8, i8, LMUL_f8, mf8)
#undef HAL_RVV_CVT
// Casts between unsigned and signed vectors of the same element width and LMUL.
// Both directions are plain __riscv_vreinterpret calls, so `vl` is not used.
#define HAL_RVV_CVT(U_ELEM, S_ELEM, U_SUFFIX, S_SUFFIX, LMUL_TAG, LMUL_SUFFIX) \
    template <> \
    inline RVV<S_ELEM, LMUL_TAG>::VecType RVV<S_ELEM, LMUL_TAG>::cast(RVV<U_ELEM, LMUL_TAG>::VecType src, [[maybe_unused]] size_t vl) { \
        return __riscv_vreinterpret_##S_SUFFIX##LMUL_SUFFIX(src); \
    } \
    template <> \
    inline RVV<U_ELEM, LMUL_TAG>::VecType RVV<U_ELEM, LMUL_TAG>::cast(RVV<S_ELEM, LMUL_TAG>::VecType src, [[maybe_unused]] size_t vl) { \
        return __riscv_vreinterpret_##U_SUFFIX##LMUL_SUFFIX(src); \
    }

// Instantiates the cast pair above for every whole-register LMUL (m1, m2, m4, m8).
#define HAL_RVV_CVT2(U_ELEM, S_ELEM, U_SUFFIX, S_SUFFIX) \
    HAL_RVV_CVT(U_ELEM, S_ELEM, U_SUFFIX, S_SUFFIX, LMUL_1, m1) \
    HAL_RVV_CVT(U_ELEM, S_ELEM, U_SUFFIX, S_SUFFIX, LMUL_2, m2) \
    HAL_RVV_CVT(U_ELEM, S_ELEM, U_SUFFIX, S_SUFFIX, LMUL_4, m4) \
    HAL_RVV_CVT(U_ELEM, S_ELEM, U_SUFFIX, S_SUFFIX, LMUL_8, m8)

HAL_RVV_CVT2( uint8_t,  int8_t,  u8,  i8)
HAL_RVV_CVT2(uint16_t, int16_t, u16, i16)
HAL_RVV_CVT2(uint32_t, int32_t, u32, i32)
HAL_RVV_CVT2(uint64_t, int64_t, u64, i64)

#undef HAL_RVV_CVT2
#undef HAL_RVV_CVT
// Two-step casts: SRC <-> DST is expressed as a cast to MID followed by a cast
// from MID, in both directions. Each step relies on a cast specialization that
// is declared elsewhere in this header.
#define HAL_RVV_CVT(SRC, MID, DST) \
    template <> \
    inline SRC::VecType SRC::cast(DST::VecType v, size_t vl) { \
        const MID::VecType via = MID::cast(v, vl); \
        return SRC::cast(via, vl); \
    } \
    template <> \
    inline DST::VecType DST::cast(SRC::VecType v, size_t vl) { \
        const MID::VecType via = MID::cast(v, vl); \
        return DST::cast(via, vl); \
    }

// Integer and Float conversions
HAL_RVV_CVT(RVV_I8M1, RVV_I32M4, RVV_F32M4)
HAL_RVV_CVT(RVV_I8M2, RVV_I32M8, RVV_F32M8)
HAL_RVV_CVT(RVV_I8M1, RVV_I64M8, RVV_F64M8)

HAL_RVV_CVT(RVV_I16M1, RVV_I32M2, RVV_F32M2)
HAL_RVV_CVT(RVV_I16M2, RVV_I32M4, RVV_F32M4)
HAL_RVV_CVT(RVV_I16M4, RVV_I32M8, RVV_F32M8)
HAL_RVV_CVT(RVV_I16M1, RVV_I64M4, RVV_F64M4)
HAL_RVV_CVT(RVV_I16M2, RVV_I64M8, RVV_F64M8)

HAL_RVV_CVT(RVV_I32M1, RVV_I64M2, RVV_F64M2)
HAL_RVV_CVT(RVV_I32M2, RVV_I64M4, RVV_F64M4)
HAL_RVV_CVT(RVV_I32M4, RVV_I64M8, RVV_F64M8)

HAL_RVV_CVT(RVV_U8M1, RVV_U32M4, RVV_F32M4)
HAL_RVV_CVT(RVV_U8M2, RVV_U32M8, RVV_F32M8)
HAL_RVV_CVT(RVV_U8M1, RVV_U64M8, RVV_F64M8)

HAL_RVV_CVT(RVV_U16M1, RVV_U32M2, RVV_F32M2)
HAL_RVV_CVT(RVV_U16M2, RVV_U32M4, RVV_F32M4)
HAL_RVV_CVT(RVV_U16M4, RVV_U32M8, RVV_F32M8)
HAL_RVV_CVT(RVV_U16M1, RVV_U64M4, RVV_F64M4)
HAL_RVV_CVT(RVV_U16M2, RVV_U64M8, RVV_F64M8)

HAL_RVV_CVT(RVV_U32M1, RVV_U64M2, RVV_F64M2)
HAL_RVV_CVT(RVV_U32M2, RVV_U64M4, RVV_F64M4)
HAL_RVV_CVT(RVV_U32M4, RVV_U64M8, RVV_F64M8)

// Signed and Unsigned conversions
HAL_RVV_CVT(RVV_U8M1, RVV_U16M2, RVV_I16M2)
HAL_RVV_CVT(RVV_U8M2, RVV_U16M4, RVV_I16M4)
HAL_RVV_CVT(RVV_U8M4, RVV_U16M8, RVV_I16M8)

HAL_RVV_CVT(RVV_U8M1, RVV_U32M4, RVV_I32M4)
HAL_RVV_CVT(RVV_U8M2, RVV_U32M8, RVV_I32M8)

HAL_RVV_CVT(RVV_U8M1, RVV_U64M8, RVV_I64M8)

#undef HAL_RVV_CVT
// ---------------------------- Define Register Group Operations -------------------------------
#if defined(__clang__) && __clang_major__ <= 17
#define HAL_RVV_GROUP(ONE, TWO, TYPE, ONE_LMUL, TWO_LMUL) \
template <size_t idx> \
inline ONE::VecType vget(TWO::VecType v) { \
return __riscv_vget_v_##TYPE##TWO_LMUL##_##TYPE##ONE_LMUL(v, idx); \
} \
template <size_t idx> \
inline void vset(TWO::VecType v, ONE::VecType val) { \
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, idx, val); \
} \
inline TWO::VecType vcreate(ONE::VecType v0, ONE::VecType v1) { \
TWO::VecType v{}; \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, 0, v0); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, 1, v1); \
return v; \
}
#else
#define HAL_RVV_GROUP(ONE, TWO, TYPE, ONE_LMUL, TWO_LMUL) \
template <size_t idx> \
inline ONE::VecType vget(TWO::VecType v) { \
return __riscv_vget_v_##TYPE##TWO_LMUL##_##TYPE##ONE_LMUL(v, idx); \
} \
template <size_t idx> \
inline void vset(TWO::VecType v, ONE::VecType val) { \
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, idx, val); \
} \
inline TWO::VecType vcreate(ONE::VecType v0, ONE::VecType v1) { \
return __riscv_vcreate_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v0, v1); \
}
#endif
HAL_RVV_GROUP(RVV_I8M1, RVV_I8M2, i8, m1, m2)
HAL_RVV_GROUP(RVV_I8M2, RVV_I8M4, i8, m2, m4)
HAL_RVV_GROUP(RVV_I8M4, RVV_I8M8, i8, m4, m8)
HAL_RVV_GROUP(RVV_I16M1, RVV_I16M2, i16, m1, m2)
HAL_RVV_GROUP(RVV_I16M2, RVV_I16M4, i16, m2, m4)
HAL_RVV_GROUP(RVV_I16M4, RVV_I16M8, i16, m4, m8)
HAL_RVV_GROUP(RVV_I32M1, RVV_I32M2, i32, m1, m2)
HAL_RVV_GROUP(RVV_I32M2, RVV_I32M4, i32, m2, m4)
HAL_RVV_GROUP(RVV_I32M4, RVV_I32M8, i32, m4, m8)
HAL_RVV_GROUP(RVV_I64M1, RVV_I64M2, i64, m1, m2)
HAL_RVV_GROUP(RVV_I64M2, RVV_I64M4, i64, m2, m4)
HAL_RVV_GROUP(RVV_I64M4, RVV_I64M8, i64, m4, m8)
HAL_RVV_GROUP(RVV_U8M1, RVV_U8M2, u8, m1, m2)
HAL_RVV_GROUP(RVV_U8M2, RVV_U8M4, u8, m2, m4)
HAL_RVV_GROUP(RVV_U8M4, RVV_U8M8, u8, m4, m8)
HAL_RVV_GROUP(RVV_U16M1, RVV_U16M2, u16, m1, m2)
HAL_RVV_GROUP(RVV_U16M2, RVV_U16M4, u16, m2, m4)
HAL_RVV_GROUP(RVV_U16M4, RVV_U16M8, u16, m4, m8)
HAL_RVV_GROUP(RVV_U32M1, RVV_U32M2, u32, m1, m2)
HAL_RVV_GROUP(RVV_U32M2, RVV_U32M4, u32, m2, m4)
HAL_RVV_GROUP(RVV_U32M4, RVV_U32M8, u32, m4, m8)
HAL_RVV_GROUP(RVV_U64M1, RVV_U64M2, u64, m1, m2)
HAL_RVV_GROUP(RVV_U64M2, RVV_U64M4, u64, m2, m4)
HAL_RVV_GROUP(RVV_U64M4, RVV_U64M8, u64, m4, m8)
HAL_RVV_GROUP(RVV_F32M1, RVV_F32M2, f32, m1, m2)
HAL_RVV_GROUP(RVV_F32M2, RVV_F32M4, f32, m2, m4)
HAL_RVV_GROUP(RVV_F32M4, RVV_F32M8, f32, m4, m8)
HAL_RVV_GROUP(RVV_F64M1, RVV_F64M2, f64, m1, m2)
HAL_RVV_GROUP(RVV_F64M2, RVV_F64M4, f64, m2, m4)
HAL_RVV_GROUP(RVV_F64M4, RVV_F64M8, f64, m4, m8)
#undef HAL_RVV_GROUP
#if defined(__clang__) && __clang_major__ <= 17
#define HAL_RVV_GROUP(ONE, FOUR, TYPE, ONE_LMUL, FOUR_LMUL) \
template <size_t idx> \
inline ONE::VecType vget(FOUR::VecType v) { \
return __riscv_vget_v_##TYPE##FOUR_LMUL##_##TYPE##ONE_LMUL(v, idx); \
} \
template <size_t idx> \
inline void vset(FOUR::VecType v, ONE::VecType val) { \
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, idx, val); \
} \
inline FOUR::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3) { \
FOUR::VecType v{}; \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 0, v0); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 1, v1); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 2, v2); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 3, v3); \
return v; \
}
#else
#define HAL_RVV_GROUP(ONE, FOUR, TYPE, ONE_LMUL, FOUR_LMUL) \
template <size_t idx> \
inline ONE::VecType vget(FOUR::VecType v) { \
return __riscv_vget_v_##TYPE##FOUR_LMUL##_##TYPE##ONE_LMUL(v, idx); \
} \
template <size_t idx> \
inline void vset(FOUR::VecType v, ONE::VecType val) { \
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, idx, val); \
} \
inline FOUR::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3) { \
return __riscv_vcreate_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v0, v1, v2, v3); \
}
#endif
HAL_RVV_GROUP(RVV_I8M1, RVV_I8M4, i8, m1, m4)
HAL_RVV_GROUP(RVV_I8M2, RVV_I8M8, i8, m2, m8)
HAL_RVV_GROUP(RVV_U8M1, RVV_U8M4, u8, m1, m4)
HAL_RVV_GROUP(RVV_U8M2, RVV_U8M8, u8, m2, m8)
HAL_RVV_GROUP(RVV_I16M1, RVV_I16M4, i16, m1, m4)
HAL_RVV_GROUP(RVV_I16M2, RVV_I16M8, i16, m2, m8)
HAL_RVV_GROUP(RVV_U16M1, RVV_U16M4, u16, m1, m4)
HAL_RVV_GROUP(RVV_U16M2, RVV_U16M8, u16, m2, m8)
HAL_RVV_GROUP(RVV_I32M1, RVV_I32M4, i32, m1, m4)
HAL_RVV_GROUP(RVV_I32M2, RVV_I32M8, i32, m2, m8)
HAL_RVV_GROUP(RVV_U32M1, RVV_U32M4, u32, m1, m4)
HAL_RVV_GROUP(RVV_U32M2, RVV_U32M8, u32, m2, m8)
HAL_RVV_GROUP(RVV_I64M1, RVV_I64M4, i64, m1, m4)
HAL_RVV_GROUP(RVV_I64M2, RVV_I64M8, i64, m2, m8)
HAL_RVV_GROUP(RVV_U64M1, RVV_U64M4, u64, m1, m4)
HAL_RVV_GROUP(RVV_U64M2, RVV_U64M8, u64, m2, m8)
HAL_RVV_GROUP(RVV_F32M1, RVV_F32M4, f32, m1, m4)
HAL_RVV_GROUP(RVV_F32M2, RVV_F32M8, f32, m2, m8)
HAL_RVV_GROUP(RVV_F64M1, RVV_F64M4, f64, m1, m4)
HAL_RVV_GROUP(RVV_F64M2, RVV_F64M8, f64, m2, m8)
#undef HAL_RVV_GROUP
#if defined(__clang__) && __clang_major__ <= 17
#define HAL_RVV_GROUP(ONE, EIGHT, TYPE, ONE_LMUL, EIGHT_LMUL) \
template <size_t idx> \
inline ONE::VecType vget(EIGHT::VecType v) { \
return __riscv_vget_v_##TYPE##EIGHT_LMUL##_##TYPE##ONE_LMUL(v, idx); \
} \
template <size_t idx> \
inline void vset(EIGHT::VecType v, ONE::VecType val) { \
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, idx, val); \
} \
inline EIGHT::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3, \
ONE::VecType v4, ONE::VecType v5, ONE::VecType v6, ONE::VecType v7) { \
EIGHT::VecType v{}; \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 0, v0); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 1, v1); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 2, v2); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 3, v3); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 4, v4); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 5, v5); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 6, v6); \
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 7, v7); \
return v; \
}
#else
#define HAL_RVV_GROUP(ONE, EIGHT, TYPE, ONE_LMUL, EIGHT_LMUL) \
template <size_t idx> \
inline ONE::VecType vget(EIGHT::VecType v) { \
return __riscv_vget_v_##TYPE##EIGHT_LMUL##_##TYPE##ONE_LMUL(v, idx); \
} \
template <size_t idx> \
inline void vset(EIGHT::VecType v, ONE::VecType val) { \
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, idx, val); \
} \
inline EIGHT::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3, \
ONE::VecType v4, ONE::VecType v5, ONE::VecType v6, ONE::VecType v7) { \
return __riscv_vcreate_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v0, v1, v2, v3, v4, v5, v6, v7); \
}
#endif
HAL_RVV_GROUP(RVV_I8M1, RVV_I8M8, i8, m1, m8)
HAL_RVV_GROUP(RVV_U8M1, RVV_U8M8, u8, m1, m8)
HAL_RVV_GROUP(RVV_I16M1, RVV_I16M8, i16, m1, m8)
HAL_RVV_GROUP(RVV_U16M1, RVV_U16M8, u16, m1, m8)
HAL_RVV_GROUP(RVV_I32M1, RVV_I32M8, i32, m1, m8)
HAL_RVV_GROUP(RVV_U32M1, RVV_U32M8, u32, m1, m8)
HAL_RVV_GROUP(RVV_I64M1, RVV_I64M8, i64, m1, m8)
HAL_RVV_GROUP(RVV_U64M1, RVV_U64M8, u64, m1, m8)
HAL_RVV_GROUP(RVV_F32M1, RVV_F32M8, f32, m1, m8)
HAL_RVV_GROUP(RVV_F64M1, RVV_F64M8, f64, m1, m8)
#undef HAL_RVV_GROUP
}} // namespace cv::cv_hal_rvv
#endif //OPENCV_HAL_RVV_TYPES_HPP_INCLUDED

View File

@ -20,7 +20,7 @@ static int extraOutputDepths[6][2] = {{CV_32S, CV_32S}, {CV_32S, CV_32F}, {CV_32
typedef tuple<Size, MatType, MatDepth> Size_MatType_OutMatDepth_t;
typedef perf::TestBaseWithParam<Size_MatType_OutMatDepth_t> Size_MatType_OutMatDepth;
typedef tuple<Size, MatType, IntegralOutputDepths> Size_MatType_OutMatDepthArray_t;
typedef tuple<Size, std::tuple<MatType, IntegralOutputDepths>> Size_MatType_OutMatDepthArray_t;
typedef perf::TestBaseWithParam<Size_MatType_OutMatDepthArray_t> Size_MatType_OutMatDepthArray;
PERF_TEST_P(Size_MatType_OutMatDepth, integral,
@ -83,19 +83,42 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum,
SANITY_CHECK(sqsum, 1e-6);
}
static std::vector<std::tuple<MatType, IntegralOutputDepths>> GetFullSqsumDepthPairs() {
static int extraDepths[12][2] = {
{CV_8U, DEPTH_32S_64F},
{CV_8U, DEPTH_32S_32F},
{CV_8U, DEPTH_32S_32S},
{CV_8U, DEPTH_32F_64F},
{CV_8U, DEPTH_32F_32F},
{CV_8U, DEPTH_64F_64F},
{CV_16U, DEPTH_64F_64F},
{CV_16S, DEPTH_64F_64F},
{CV_32F, DEPTH_32F_64F},
{CV_32F, DEPTH_32F_32F},
{CV_32F, DEPTH_64F_64F},
{CV_64F, DEPTH_64F_64F}
};
std::vector<std::tuple<MatType, IntegralOutputDepths>> valid_pairs;
for (size_t i = 0; i < 12; i++) {
for (int cn = 1; cn <= 4; cn++) {
valid_pairs.emplace_back(CV_MAKETYPE(extraDepths[i][0], cn), extraDepths[i][1]);
}
}
return valid_pairs;
}
PERF_TEST_P(Size_MatType_OutMatDepthArray, DISABLED_integral_sqsum_full,
testing::Combine(
testing::Values(TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4),
testing::Values(DEPTH_32S_32S, DEPTH_32S_32F, DEPTH_32S_64F, DEPTH_32F_32F, DEPTH_32F_64F, DEPTH_64F_64F)
testing::ValuesIn(GetFullSqsumDepthPairs())
)
)
{
Size sz = get<0>(GetParam());
int matType = get<1>(GetParam());
int *outputDepths = (int *)extraOutputDepths[get<2>(GetParam())];
int sdepth = outputDepths[0];
int sqdepth = outputDepths[1];
auto depths = get<1>(GetParam());
int matType = get<0>(depths);
int sdepth = extraOutputDepths[get<1>(depths)][0];
int sqdepth = extraOutputDepths[get<1>(depths)][1];
Mat src(sz, matType);
Mat sum(sz, sdepth);

View File

@ -486,7 +486,8 @@ cvIntegral( const CvArr* image, CvArr* sumImage,
ptilted = &tilted;
}
cv::integral( src, sum, psqsum ? cv::_OutputArray(*psqsum) : cv::_OutputArray(),
ptilted ? cv::_OutputArray(*ptilted) : cv::_OutputArray(), sum.depth() );
ptilted ? cv::_OutputArray(*ptilted) : cv::_OutputArray(), sum.depth(),
psqsum ? psqsum->depth() : -1 );
CV_Assert( sum.data == sum0.data && sqsum.data == sqsum0.data && tilted.data == tilted0.data );
}

View File

@ -1684,19 +1684,34 @@ void CV_IntegralTest::get_test_array_types_and_sizes( int test_case_idx,
vector<vector<Size> >& sizes, vector<vector<int> >& types )
{
RNG& rng = ts->get_rng();
int depth = cvtest::randInt(rng) % 2, sum_depth;
int cn = cvtest::randInt(rng) % 4 + 1;
cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
Size sum_size;
depth = depth == 0 ? CV_8U : CV_32F;
int b = (cvtest::randInt(rng) & 1) != 0;
sum_depth = depth == CV_8U && b ? CV_32S : b ? CV_32F : CV_64F;
const int depths[12][3] = {
{CV_8U, CV_32S, CV_64F},
{CV_8U, CV_32S, CV_32F},
{CV_8U, CV_32S, CV_32S},
{CV_8U, CV_32F, CV_64F},
{CV_8U, CV_32F, CV_32F},
{CV_8U, CV_64F, CV_64F},
{CV_16U, CV_64F, CV_64F},
{CV_16S, CV_64F, CV_64F},
{CV_32F, CV_32F, CV_64F},
{CV_32F, CV_32F, CV_32F},
{CV_32F, CV_64F, CV_64F},
{CV_64F, CV_64F, CV_64F},
};
int random_choice = cvtest::randInt(rng) % 12;
int depth = depths[random_choice][0];
int sum_depth = depths[random_choice][1];
int sqsum_depth = depths[random_choice][2];
types[INPUT][0] = CV_MAKETYPE(depth, cn);
types[OUTPUT][0] = types[REF_OUTPUT][0] =
types[OUTPUT][2] = types[REF_OUTPUT][2] = CV_MAKETYPE(sum_depth, cn);
types[OUTPUT][1] = types[REF_OUTPUT][1] = CV_MAKETYPE(CV_64F, cn);
types[OUTPUT][1] = types[REF_OUTPUT][1] = CV_MAKETYPE(sqsum_depth, cn);
sum_size.width = sizes[INPUT][0].width + 1;
sum_size.height = sizes[INPUT][0].height + 1;
@ -1738,7 +1753,7 @@ void CV_IntegralTest::run_func()
static void test_integral( const Mat& img, Mat* sum, Mat* sqsum, Mat* tilted )
{
CV_Assert( img.depth() == CV_32F );
CV_Assert( img.depth() == CV_64F );
sum->create(img.rows+1, img.cols+1, CV_64F);
if( sqsum )
@ -1746,7 +1761,7 @@ static void test_integral( const Mat& img, Mat* sum, Mat* sqsum, Mat* tilted )
if( tilted )
tilted->create(img.rows+1, img.cols+1, CV_64F);
const float* data = img.ptr<float>();
const double* data = img.ptr<double>();
double* sdata = sum->ptr<double>();
double* sqdata = sqsum ? sqsum->ptr<double>() : 0;
double* tdata = tilted ? tilted->ptr<double>() : 0;
@ -1788,7 +1803,7 @@ static void test_integral( const Mat& img, Mat* sum, Mat* sqsum, Mat* tilted )
else
{
ts += tdata[x-tstep-1];
if( data > img.ptr<float>() )
if( data > img.ptr<double>() )
{
ts += data[x-step-1];
if( x < size.width )
@ -1824,7 +1839,7 @@ void CV_IntegralTest::prepare_to_validation( int /*test_case_idx*/ )
{
if( cn > 1 )
cvtest::extract(src, plane, i);
plane.convertTo(srcf, CV_32F);
plane.convertTo(srcf, CV_64F);
test_integral( srcf, &psum, sqsum0 ? &psqsum : 0, tsum0 ? &ptsum : 0 );
psum.convertTo(psum2, sum0->depth());