Merge pull request #27060 from YooLc:hal-rvv-integral
[hal_rvv] Add cv::integral implementation and more types of input for test #27060

This patch introduces an RVV-optimized implementation of `cv::integral()` in hal_rvv, along with performance and accuracy tests for all valid input/output type combinations specified in `modules/imgproc/src/hal_replacement.hpp` (commit 2a8d4b8e43, lines 960-974).

The vectorized prefix sum algorithm follows the approach described in [Prefix Sum with SIMD - Algorithmica](https://en.algorithmica.org/hpc/algorithms/prefix/); a small illustrative sketch of the pattern is included at the end of this description. I intentionally omitted support for the following cases by returning `CV_HAL_ERROR_NOT_IMPLEMENTED`, as they are harder to implement or show limited performance gains:

1. **Tilted sum**: The data access pattern for tilted sums requires multi-row operations, making effective vectorization difficult.
2. **3-channel images (`cn == 3`)**: The current implementation requires `VLEN/SEW` (i.e. the number of elements in a vector register) to be a multiple of the channel count, which 3-channel formats typically cannot satisfy.
   - Support for 1-, 2- and 4-channel images is implemented.
3. **Small images (`!(width >> 8 || height >> 8)`, i.e. both width and height below 256)**: The scalar implementation performs better for images with limited dimensions.
   - This is the same threshold as in `3rdparty/ndsrvp/src/integral.cpp` (commit 09c71aed14, lines 24-26).

Test configuration:

- Platform: SpacemiT Muse Pi (K1 @ 1.60 GHz)
- Toolchain: GCC 14.2.0
- The `integral_sqsum_full` test is disabled by default, so `--gtest_also_run_disabled_tests` is needed.

Test results:

```plaintext
Geometric mean (ms)

Name of Test  imgproc-gcc-scalar  imgproc-gcc-hal  imgproc-gcc-hal vs imgproc-gcc-scalar (x-factor)
integral::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32F) 1.973 1.415 1.39
integral::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32S) 1.343 1.351 0.99
integral::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_64F) 2.021 2.756 0.73
integral::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32F) 4.695 2.874 1.63
integral::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32S) 4.028 2.801 1.44
integral::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_64F) 5.965 4.926 1.21
integral::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32F) 9.970 4.440 2.25
integral::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32S) 7.934 4.244 1.87
integral::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_64F) 14.696 8.431 1.74
integral::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32F) 5.949 4.108 1.45
integral::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32S) 4.064 4.080 1.00
integral::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_64F) 6.137 7.975 0.77
integral::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32F) 13.896 8.721 1.59
integral::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32S) 10.948 8.513 1.29
integral::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_64F) 18.046 15.234 1.18
integral::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32F) 35.105 13.778 2.55
integral::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32S) 27.135 13.417 2.02
integral::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_64F) 43.477 25.616 1.70
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32F) 13.386 9.281 1.44
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32S) 9.159 9.194 1.00
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_64F) 13.776 17.836 0.77
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32F) 31.943 19.435 1.64
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32S) 24.747 18.946 1.31
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_64F) 35.925 33.943 1.06
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32F) 66.493 29.692 2.24
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32S) 54.737 28.250 1.94
integral::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_64F) 91.880 57.495 1.60
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32F) 4.384 4.016 1.09
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_32S) 3.676 3.960 0.93
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC1, CV_64F) 5.620 5.224 1.08
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32F) 9.971 7.696 1.30
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_32S) 8.934 7.632 1.17
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC2, CV_64F) 9.927 9.759 1.02
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32F) 21.556 12.288 1.75
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_32S) 21.261 12.089 1.76
integral_sqsum::Size_MatType_OutMatDepth::(640x480, 8UC4, CV_64F) 23.989 16.278 1.47
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32F) 15.232 11.752 1.30
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_32S) 12.976 11.721 1.11
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC1, CV_64F) 16.450 15.627 1.05
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32F) 25.932 23.243 1.12
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_32S) 24.750 23.019 1.08
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC2, CV_64F) 28.228 29.605 0.95
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32F) 61.665 37.477 1.65
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_32S) 61.536 37.126 1.66
integral_sqsum::Size_MatType_OutMatDepth::(1280x720, 8UC4, CV_64F) 73.989 48.994 1.51
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32F) 49.640 26.529 1.87
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_32S) 35.869 26.417 1.36
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC1, CV_64F) 34.378 35.056 0.98
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32F) 82.138 52.661 1.56
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_32S) 54.644 52.089 1.05
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC2, CV_64F) 75.073 66.670 1.13
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32F) 143.283 83.943 1.71
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_32S) 156.851 82.378 1.90
integral_sqsum::Size_MatType_OutMatDepth::(1920x1080, 8UC4, CV_64F) 521.594 111.375 4.68
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32F_32F)) 3.529 2.787 1.27
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32F_64F)) 4.396 3.998 1.10
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32S_32F)) 3.229 2.774 1.16
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32S_32S)) 2.945 2.780 1.06
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_32S_64F)) 3.857 3.995 0.97
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC1, DEPTH_64F_64F)) 5.872 5.228 1.12
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16UC1, DEPTH_64F_64F)) 6.075 5.277 1.15
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16SC1, DEPTH_64F_64F)) 5.680 5.296 1.07
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC1, DEPTH_32F_32F)) 3.355 2.896 1.16
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC1, DEPTH_32F_64F)) 4.183 4.000 1.05
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC1, DEPTH_64F_64F)) 6.237 5.143 1.21
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (64FC1, DEPTH_64F_64F)) 4.753 4.783 0.99
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32F_32F)) 8.021 5.793 1.38
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32F_64F)) 9.963 7.704 1.29
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32S_32F)) 7.864 5.720 1.37
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32S_32S)) 7.141 5.699 1.25
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_32S_64F)) 9.228 7.646 1.21
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC2, DEPTH_64F_64F)) 9.940 9.759 1.02
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16UC2, DEPTH_64F_64F)) 10.606 9.716 1.09
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16SC2, DEPTH_64F_64F)) 9.933 9.751 1.02
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC2, DEPTH_32F_32F)) 7.986 5.962 1.34
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC2, DEPTH_32F_64F)) 9.243 7.598 1.22
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC2, DEPTH_64F_64F)) 10.573 9.425 1.12
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (64FC2, DEPTH_64F_64F)) 11.029 8.977 1.23
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32F_32F)) 17.236 8.881 1.94
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32F_64F)) 20.905 12.322 1.70
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32S_32F)) 16.011 8.666 1.85
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32S_32S)) 15.932 8.507 1.87
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_32S_64F)) 20.713 12.115 1.71
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (8UC4, DEPTH_64F_64F)) 23.953 16.284 1.47
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16UC4, DEPTH_64F_64F)) 25.127 16.341 1.54
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (16SC4, DEPTH_64F_64F)) 24.950 16.441 1.52
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC4, DEPTH_32F_32F)) 17.261 8.906 1.94
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC4, DEPTH_32F_64F)) 21.944 12.073 1.82
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (32FC4, DEPTH_64F_64F)) 25.921 15.539 1.67
integral_sqsum_full::Size_MatType_OutMatDepthArray::(640x480, (64FC4, DEPTH_64F_64F)) 27.938 14.824 1.88
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32F_32F)) 11.156 8.260 1.35
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32F_64F)) 14.777 11.869 1.24
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32S_32F)) 9.693 8.221 1.18
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32S_32S)) 9.023 8.256 1.09
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_32S_64F)) 13.276 11.821 1.12
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC1, DEPTH_64F_64F)) 15.406 15.618 0.99
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16UC1, DEPTH_64F_64F)) 16.799 15.749 1.07
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16SC1, DEPTH_64F_64F)) 15.054 15.806 0.95
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC1, DEPTH_32F_32F)) 10.055 7.999 1.26
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC1, DEPTH_32F_64F)) 13.506 11.253 1.20
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC1, DEPTH_64F_64F)) 14.952 15.021 1.00
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (64FC1, DEPTH_64F_64F)) 13.761 14.002 0.98
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32F_32F)) 22.677 17.330 1.31
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32F_64F)) 26.283 23.237 1.13
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32S_32F)) 20.126 17.118 1.18
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32S_32S)) 19.337 17.041 1.13
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_32S_64F)) 24.973 23.004 1.09
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC2, DEPTH_64F_64F)) 29.959 29.585 1.01
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16UC2, DEPTH_64F_64F)) 33.598 29.599 1.14
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16SC2, DEPTH_64F_64F)) 46.213 29.741 1.55
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC2, DEPTH_32F_32F)) 33.077 17.556 1.88
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC2, DEPTH_32F_64F)) 33.960 22.991 1.48
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC2, DEPTH_64F_64F)) 41.792 28.803 1.45
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (64FC2, DEPTH_64F_64F)) 34.660 28.532 1.21
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32F_32F)) 52.989 27.659 1.92
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32F_64F)) 62.418 37.515 1.66
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32S_32F)) 50.902 27.310 1.86
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32S_32S)) 47.301 27.019 1.75
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_32S_64F)) 61.982 37.140 1.67
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (8UC4, DEPTH_64F_64F)) 79.403 49.041 1.62
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16UC4, DEPTH_64F_64F)) 86.550 49.180 1.76
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (16SC4, DEPTH_64F_64F)) 85.715 49.468 1.73
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC4, DEPTH_32F_32F)) 63.932 28.019 2.28
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC4, DEPTH_32F_64F)) 68.180 36.858 1.85
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (32FC4, DEPTH_64F_64F)) 83.063 46.483 1.79
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1280x720, (64FC4, DEPTH_64F_64F)) 91.990 44.545 2.07
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32F_32F)) 25.503 18.609 1.37
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32F_64F)) 29.544 26.635 1.11
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32S_32F)) 22.581 18.514 1.22
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32S_32S)) 20.860 18.547 1.12
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_32S_64F)) 26.046 26.373 0.99
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC1, DEPTH_64F_64F)) 34.831 34.997 1.00
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16UC1, DEPTH_64F_64F)) 36.428 35.214 1.03
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16SC1, DEPTH_64F_64F)) 32.435 35.314 0.92
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC1, DEPTH_32F_32F)) 22.548 18.845 1.20
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC1, DEPTH_32F_64F)) 28.589 25.790 1.11
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC1, DEPTH_64F_64F)) 32.625 33.791 0.97
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (64FC1, DEPTH_64F_64F)) 30.158 31.889 0.95
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32F_32F)) 53.374 38.938 1.37
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32F_64F)) 73.892 52.747 1.40
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32S_32F)) 47.392 38.572 1.23
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32S_32S)) 45.638 38.225 1.19
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_32S_64F)) 69.966 52.156 1.34
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC2, DEPTH_64F_64F)) 68.560 66.963 1.02
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16UC2, DEPTH_64F_64F)) 71.487 65.420 1.09
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16SC2, DEPTH_64F_64F)) 68.127 65.718 1.04
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC2, DEPTH_32F_32F)) 72.967 39.987 1.82
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC2, DEPTH_32F_64F)) 63.933 51.408 1.24
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC2, DEPTH_64F_64F)) 73.334 63.354 1.16
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (64FC2, DEPTH_64F_64F)) 80.983 60.778 1.33
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32F_32F)) 116.981 59.908 1.95
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32F_64F)) 155.085 83.974 1.85
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32S_32F)) 109.567 58.525 1.87
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32S_32S)) 105.457 57.124 1.85
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_32S_64F)) 157.325 82.485 1.91
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (8UC4, DEPTH_64F_64F)) 265.776 111.577 2.38
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16UC4, DEPTH_64F_64F)) 585.218 110.583 5.29
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (16SC4, DEPTH_64F_64F)) 585.418 111.302 5.26
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC4, DEPTH_32F_32F)) 126.456 60.415 2.09
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC4, DEPTH_32F_64F)) 169.278 81.460 2.08
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (32FC4, DEPTH_64F_64F)) 281.256 104.732 2.69
integral_sqsum_full::Size_MatType_OutMatDepthArray::(1920x1080, (64FC4, DEPTH_64F_64F)) 620.885 99.953 6.21
```

The vectorized implementation shows progressively better acceleration for larger image sizes and higher channel counts, achieving up to a 6.21× speedup for 64FC4 (1920×1080) inputs with the `DEPTH_64F_64F` configuration.

This is my first time proposing a patch to the OpenCV project 🥹; if there is anything that can be improved, please tell me.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
  - The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
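For readers unfamiliar with the in-register prefix-sum trick referenced above, here is a small, self-contained C++ sketch (plain scalar code, no RVV intrinsics) of the doubling-offset pattern that the patch expresses with `vslideup`/`vadd`. Everything here is illustrative only and not part of the patch: the `lanes` vector stands in for one vector register, and `cn` is the interleaved channel count, which is why the first shift starts at `cn` rather than 1.

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Emulate an in-register, per-channel inclusive prefix sum over `vl` lanes:
// repeatedly add a copy of the lanes shifted up by `offset`, doubling `offset`
// each round, which is the vslideup + vadd pattern used in the patch.
static void prefix_sum_lanes(std::vector<int>& lanes, std::size_t cn)
{
    const std::size_t vl = lanes.size();
    for (std::size_t offset = cn; offset < vl; offset <<= 1)
    {
        std::vector<int> shifted(vl, 0);          // vslideup fills the low lanes with zeros
        for (std::size_t i = offset; i < vl; i++)
            shifted[i] = lanes[i - offset];
        for (std::size_t i = 0; i < vl; i++)      // vadd
            lanes[i] += shifted[i];
    }
}

int main()
{
    // Two interleaved channels (cn = 2): after the call, lane i holds the
    // per-channel running sum of all lanes up to i, i.e. one chunk of a row sum.
    std::vector<int> lanes = {1, 10, 2, 20, 3, 30, 4, 40};
    prefix_sum_lanes(lanes, 2);
    for (int v : lanes) std::printf("%d ", v);    // prints: 1 10 3 30 6 60 10 100
    std::printf("\n");
    return 0;
}
```

Each pass adds a copy of the register shifted up by `offset` lanes and then doubles `offset`, so a register of `vl` elements needs only about log2(vl/cn) slide/add pairs instead of a lane-by-lane carry chain.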
This commit is contained in:
parent 11e46cda86
commit f20facc60a
3rdparty/hal_rvv/hal_rvv.hpp (vendored, 1 line changed)
```diff
@@ -57,6 +57,7 @@
 #include "hal_rvv_1p0/thresh.hpp" // imgproc
 #include "hal_rvv_1p0/histogram.hpp" // imgproc
 #include "hal_rvv_1p0/resize.hpp" // imgproc
+#include "hal_rvv_1p0/integral.hpp" // imgproc
 #endif

 #endif
```
3rdparty/hal_rvv/hal_rvv_1p0/integral.hpp (vendored, new file, 173 lines)
```cpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.

#ifndef OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED
#define OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED

#include <riscv_vector.h>
#include "types.hpp"

namespace cv { namespace cv_hal_rvv {

#undef cv_hal_integral
#define cv_hal_integral cv::cv_hal_rvv::integral

template <typename vec_t>
inline typename vec_t::VecType repeat_last_n(typename vec_t::VecType vs, int n, size_t vl) {
    auto v_last = vec_t::vslidedown(vs, vl - n, vl);
    if (n == 1) return vec_t::vmv(vec_t::vmv_x(v_last), vl);
    for (size_t offset = n; offset < vl; offset <<= 1) {
        v_last = vec_t::vslideup(v_last, v_last, offset, vl);
    }
    return v_last;
}

template <typename data_vec_t, typename acc_vec_t, bool sqsum = false>
inline int integral_inner(const uchar* src_data, size_t src_step,
                          uchar* sum_data, size_t sum_step,
                          int width, int height, int cn) {
    using data_t = typename data_vec_t::ElemType;
    using acc_t = typename acc_vec_t::ElemType;

    for (int y = 0; y < height; y++) {
        const data_t* src = reinterpret_cast<const data_t*>(src_data + src_step * y);
        acc_t* prev = reinterpret_cast<acc_t*>(sum_data + sum_step * y);
        acc_t* curr = reinterpret_cast<acc_t*>(sum_data + sum_step * (y + 1));
        memset(curr, 0, cn * sizeof(acc_t));

        size_t vl = acc_vec_t::setvlmax();
        auto sum = acc_vec_t::vmv(0, vl);
        for (size_t x = 0; x < static_cast<size_t>(width); x += vl) {
            vl = acc_vec_t::setvl(width - x);
            __builtin_prefetch(&src[x + vl], 0);
            __builtin_prefetch(&prev[x + cn], 0);

            auto v_src = data_vec_t::vload(&src[x], vl);
            auto acc = acc_vec_t::cast(v_src, vl);

            if (sqsum) { // Squared Sum
                acc = acc_vec_t::vmul(acc, acc, vl);
            }

            auto v_zero = acc_vec_t::vmv(0, vl);
            for (size_t offset = cn; offset < vl; offset <<= 1) {
                auto v_shift = acc_vec_t::vslideup(v_zero, acc, offset, vl);
                acc = acc_vec_t::vadd(acc, v_shift, vl);
            }
            auto last_n = repeat_last_n<acc_vec_t>(acc, cn, vl);

            auto v_prev = acc_vec_t::vload(&prev[x + cn], vl);
            acc = acc_vec_t::vadd(acc, v_prev, vl);
            acc = acc_vec_t::vadd(acc, sum, vl);
            sum = acc_vec_t::vadd(sum, last_n, vl);

            acc_vec_t::vstore(&curr[x + cn], acc, vl);
        }
    }

    return CV_HAL_ERROR_OK;
}

template <typename data_vec_t, typename acc_vec_t, typename sq_acc_vec_t>
inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, size_t sum_step, uchar* sqsum_data, size_t sqsum_step, int width, int height, int cn) {
    memset(sum_data, 0, (sum_step) * sizeof(uchar));

    int result = CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (sqsum_data == nullptr) {
        result = integral_inner<data_vec_t, acc_vec_t, false>(src_data, src_step, sum_data, sum_step, width, height, cn);
    } else {
        result = integral_inner<data_vec_t, acc_vec_t, false>(src_data, src_step, sum_data, sum_step, width, height, cn);
        memset(sqsum_data, 0, (sqsum_step) * sizeof(uchar));
        if (result != CV_HAL_ERROR_OK) return result;
        result = integral_inner<data_vec_t, sq_acc_vec_t, true>(src_data, src_step, sqsum_data, sqsum_step, width, height, cn);
    }
    return result;
}

/**
   @brief Calculate integral image
   @param depth Depth of source image
   @param sdepth Depth of sum image
   @param sqdepth Depth of square sum image
   @param src_data Source image data
   @param src_step Source image step
   @param sum_data Sum image data
   @param sum_step Sum image step
   @param sqsum_data Square sum image data
   @param sqsum_step Square sum image step
   @param tilted_data Tilted sum image data
   @param tilted_step Tilted sum image step
   @param width Source image width
   @param height Source image height
   @param cn Number of channels
   @note Following combinations of image depths are used:
   Source | Sum    | Square sum
   -------|--------|-----------
   CV_8U  | CV_32S | CV_64F
   CV_8U  | CV_32S | CV_32F
   CV_8U  | CV_32S | CV_32S
   CV_8U  | CV_32F | CV_64F
   CV_8U  | CV_32F | CV_32F
   CV_8U  | CV_64F | CV_64F
   CV_16U | CV_64F | CV_64F
   CV_16S | CV_64F | CV_64F
   CV_32F | CV_32F | CV_64F
   CV_32F | CV_32F | CV_32F
   CV_32F | CV_64F | CV_64F
   CV_64F | CV_64F | CV_64F
*/
inline int integral(int depth, int sdepth, int sqdepth,
                    const uchar* src_data, size_t src_step,
                    uchar* sum_data, size_t sum_step,
                    uchar* sqsum_data, size_t sqsum_step,
                    uchar* tilted_data, [[maybe_unused]] size_t tilted_step,
                    int width, int height, int cn) {
    // tilted sum and cn == 3 cases are not supported
    if (tilted_data || cn == 3) {
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    }

    // Skip images that are too small
    if (!(width >> 8 || height >> 8)) {
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    }

    int result = CV_HAL_ERROR_NOT_IMPLEMENTED;

    width *= cn;

    if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
        result = integral<RVV<uint8_t, LMUL_1>, RVV<int32_t, LMUL_4>, RVV<double, LMUL_8>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32F )
        result = integral<RVV<uint8_t, LMUL_1>, RVV<int32_t, LMUL_4>, RVV<float, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S )
        result = integral<RVV<uint8_t, LMUL_1>, RVV<int32_t, LMUL_4>, RVV<int32_t, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F )
        result = integral<RVV<uint8_t, LMUL_1>, RVV<float, LMUL_4>, RVV<double, LMUL_8>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_32F )
        result = integral<RVV<uint8_t, LMUL_1>, RVV<float, LMUL_4>, RVV<float, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F )
        result = integral<RVV<uint8_t, LMUL_1>, RVV<double, LMUL_8>, RVV<double, LMUL_8>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F )
        result = integral<RVV<uint16_t, LMUL_1>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F )
        result = integral<RVV<int16_t, LMUL_1>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F )
        result = integral<RVV<float, LMUL_2>, RVV<float, LMUL_2>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F )
        result = integral<RVV<float, LMUL_4>, RVV<float, LMUL_4>, RVV<float, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_32F && sdepth == CV_64F && sqdepth == CV_64F )
        result = integral<RVV<float, LMUL_2>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F ) {
        result = integral<RVV<double, LMUL_4>, RVV<double, LMUL_4>, RVV<double, LMUL_4>>(src_data, src_step, sum_data, sum_step, sqsum_data, sqsum_step, width, height, cn);
    }

    return result;
}

}}

#endif
```
3rdparty/hal_rvv/hal_rvv_1p0/types.hpp (vendored, 281 lines changed)
|
|
@ -153,6 +153,12 @@ static inline VecType vmv(ElemType a, size_t vl) {
|
|||
static inline VecType vmv_s(ElemType a, size_t vl) { \
|
||||
return __riscv_v##IS_F##mv_s_##X_OR_F##_##TYPE##LMUL(a, vl); \
|
||||
} \
|
||||
static inline VecType vslideup(VecType vs2, VecType vs1, size_t n, size_t vl) { \
|
||||
return __riscv_vslideup_vx_##TYPE##LMUL(vs2, vs1, n, vl); \
|
||||
} \
|
||||
static inline VecType vslidedown(VecType vs, size_t n, size_t vl) { \
|
||||
return __riscv_vslidedown_vx_##TYPE##LMUL(vs, n, vl); \
|
||||
} \
|
||||
HAL_RVV_SIZE_RELATED_CUSTOM(EEW, TYPE, LMUL)
|
||||
|
||||
#define HAL_RVV_SIZE_UNRELATED(S_OR_F, X_OR_F, IS_U, IS_F, IS_O) \
|
||||
|
|
@ -380,7 +386,7 @@ template <> struct RVV_ToFloatHelper<8> {using type = double;};
|
|||
template <> \
|
||||
inline ONE::VecType ONE::cast(TWO::VecType v, size_t vl) { return __riscv_vncvt_x(v, vl); } \
|
||||
template <> \
|
||||
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vwcvt_x(v, vl); }
|
||||
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vsext_vf2(v, vl); }
|
||||
|
||||
HAL_RVV_CVT(RVV_I8M4, RVV_I16M8)
|
||||
HAL_RVV_CVT(RVV_I8M2, RVV_I16M4)
|
||||
|
|
@ -406,7 +412,7 @@ HAL_RVV_CVT(RVV_I32MF2, RVV_I64M1)
|
|||
template <> \
|
||||
inline ONE::VecType ONE::cast(TWO::VecType v, size_t vl) { return __riscv_vncvt_x(v, vl); } \
|
||||
template <> \
|
||||
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vwcvtu_x(v, vl); }
|
||||
inline TWO::VecType TWO::cast(ONE::VecType v, size_t vl) { return __riscv_vzext_vf2(v, vl); }
|
||||
|
||||
HAL_RVV_CVT(RVV_U8M4, RVV_U16M8)
|
||||
HAL_RVV_CVT(RVV_U8M2, RVV_U16M4)
|
||||
|
|
@ -592,6 +598,277 @@ HAL_RVV_CVT( uint8_t, int8_t, u8, i8, LMUL_f8, mf8)
|
|||
|
||||
#undef HAL_RVV_CVT
|
||||
|
||||
#define HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_TYPE, LMUL) \
|
||||
template <> \
|
||||
inline RVV<A, LMUL_TYPE>::VecType RVV<A, LMUL_TYPE>::cast(RVV<B, LMUL_TYPE>::VecType v, [[maybe_unused]] size_t vl) { \
|
||||
return __riscv_vreinterpret_##A_TYPE##LMUL(v); \
|
||||
} \
|
||||
template <> \
|
||||
inline RVV<B, LMUL_TYPE>::VecType RVV<B, LMUL_TYPE>::cast(RVV<A, LMUL_TYPE>::VecType v, [[maybe_unused]] size_t vl) { \
|
||||
return __riscv_vreinterpret_##B_TYPE##LMUL(v); \
|
||||
}
|
||||
|
||||
#define HAL_RVV_CVT2(A, B, A_TYPE, B_TYPE) \
|
||||
HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_1, m1) \
|
||||
HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_2, m2) \
|
||||
HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_4, m4) \
|
||||
HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_8, m8)
|
||||
|
||||
HAL_RVV_CVT2( uint8_t, int8_t, u8, i8)
|
||||
HAL_RVV_CVT2(uint16_t, int16_t, u16, i16)
|
||||
HAL_RVV_CVT2(uint32_t, int32_t, u32, i32)
|
||||
HAL_RVV_CVT2(uint64_t, int64_t, u64, i64)
|
||||
|
||||
#undef HAL_RVV_CVT2
|
||||
#undef HAL_RVV_CVT
|
||||
|
||||
#define HAL_RVV_CVT(FROM, INTERMEDIATE, TO) \
|
||||
template <> \
|
||||
inline TO::VecType TO::cast(FROM::VecType v, size_t vl) { \
|
||||
return TO::cast(INTERMEDIATE::cast(v, vl), vl); \
|
||||
} \
|
||||
template <> \
|
||||
inline FROM::VecType FROM::cast(TO::VecType v, size_t vl) { \
|
||||
return FROM::cast(INTERMEDIATE::cast(v, vl), vl); \
|
||||
}
|
||||
|
||||
// Integer and Float conversions
|
||||
HAL_RVV_CVT(RVV_I8M1, RVV_I32M4, RVV_F32M4)
|
||||
HAL_RVV_CVT(RVV_I8M2, RVV_I32M8, RVV_F32M8)
|
||||
HAL_RVV_CVT(RVV_I8M1, RVV_I64M8, RVV_F64M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_I16M1, RVV_I32M2, RVV_F32M2)
|
||||
HAL_RVV_CVT(RVV_I16M2, RVV_I32M4, RVV_F32M4)
|
||||
HAL_RVV_CVT(RVV_I16M4, RVV_I32M8, RVV_F32M8)
|
||||
HAL_RVV_CVT(RVV_I16M1, RVV_I64M4, RVV_F64M4)
|
||||
HAL_RVV_CVT(RVV_I16M2, RVV_I64M8, RVV_F64M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_I32M1, RVV_I64M2, RVV_F64M2)
|
||||
HAL_RVV_CVT(RVV_I32M2, RVV_I64M4, RVV_F64M4)
|
||||
HAL_RVV_CVT(RVV_I32M4, RVV_I64M8, RVV_F64M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_U8M1, RVV_U32M4, RVV_F32M4)
|
||||
HAL_RVV_CVT(RVV_U8M2, RVV_U32M8, RVV_F32M8)
|
||||
HAL_RVV_CVT(RVV_U8M1, RVV_U64M8, RVV_F64M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_U16M1, RVV_U32M2, RVV_F32M2)
|
||||
HAL_RVV_CVT(RVV_U16M2, RVV_U32M4, RVV_F32M4)
|
||||
HAL_RVV_CVT(RVV_U16M4, RVV_U32M8, RVV_F32M8)
|
||||
HAL_RVV_CVT(RVV_U16M1, RVV_U64M4, RVV_F64M4)
|
||||
HAL_RVV_CVT(RVV_U16M2, RVV_U64M8, RVV_F64M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_U32M1, RVV_U64M2, RVV_F64M2)
|
||||
HAL_RVV_CVT(RVV_U32M2, RVV_U64M4, RVV_F64M4)
|
||||
HAL_RVV_CVT(RVV_U32M4, RVV_U64M8, RVV_F64M8)
|
||||
|
||||
// Signed and Unsigned conversions
|
||||
HAL_RVV_CVT(RVV_U8M1, RVV_U16M2, RVV_I16M2)
|
||||
HAL_RVV_CVT(RVV_U8M2, RVV_U16M4, RVV_I16M4)
|
||||
HAL_RVV_CVT(RVV_U8M4, RVV_U16M8, RVV_I16M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_U8M1, RVV_U32M4, RVV_I32M4)
|
||||
HAL_RVV_CVT(RVV_U8M2, RVV_U32M8, RVV_I32M8)
|
||||
|
||||
HAL_RVV_CVT(RVV_U8M1, RVV_U64M8, RVV_I64M8)
|
||||
|
||||
#undef HAL_RVV_CVT
|
||||
|
||||
// ---------------------------- Define Register Group Operations -------------------------------
|
||||
|
||||
#if defined(__clang__) && __clang_major__ <= 17
|
||||
#define HAL_RVV_GROUP(ONE, TWO, TYPE, ONE_LMUL, TWO_LMUL) \
|
||||
template <size_t idx> \
|
||||
inline ONE::VecType vget(TWO::VecType v) { \
|
||||
return __riscv_vget_v_##TYPE##TWO_LMUL##_##TYPE##ONE_LMUL(v, idx); \
|
||||
} \
|
||||
template <size_t idx> \
|
||||
inline void vset(TWO::VecType v, ONE::VecType val) { \
|
||||
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, idx, val); \
|
||||
} \
|
||||
inline TWO::VecType vcreate(ONE::VecType v0, ONE::VecType v1) { \
|
||||
TWO::VecType v{}; \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, 0, v0); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, 1, v1); \
|
||||
return v; \
|
||||
}
|
||||
#else
|
||||
#define HAL_RVV_GROUP(ONE, TWO, TYPE, ONE_LMUL, TWO_LMUL) \
|
||||
template <size_t idx> \
|
||||
inline ONE::VecType vget(TWO::VecType v) { \
|
||||
return __riscv_vget_v_##TYPE##TWO_LMUL##_##TYPE##ONE_LMUL(v, idx); \
|
||||
} \
|
||||
template <size_t idx> \
|
||||
inline void vset(TWO::VecType v, ONE::VecType val) { \
|
||||
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v, idx, val); \
|
||||
} \
|
||||
inline TWO::VecType vcreate(ONE::VecType v0, ONE::VecType v1) { \
|
||||
return __riscv_vcreate_v_##TYPE##ONE_LMUL##_##TYPE##TWO_LMUL(v0, v1); \
|
||||
}
|
||||
#endif
|
||||
|
||||
HAL_RVV_GROUP(RVV_I8M1, RVV_I8M2, i8, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_I8M2, RVV_I8M4, i8, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_I8M4, RVV_I8M8, i8, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I16M1, RVV_I16M2, i16, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_I16M2, RVV_I16M4, i16, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_I16M4, RVV_I16M8, i16, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I32M1, RVV_I32M2, i32, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_I32M2, RVV_I32M4, i32, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_I32M4, RVV_I32M8, i32, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I64M1, RVV_I64M2, i64, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_I64M2, RVV_I64M4, i64, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_I64M4, RVV_I64M8, i64, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U8M1, RVV_U8M2, u8, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_U8M2, RVV_U8M4, u8, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_U8M4, RVV_U8M8, u8, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U16M1, RVV_U16M2, u16, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_U16M2, RVV_U16M4, u16, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_U16M4, RVV_U16M8, u16, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U32M1, RVV_U32M2, u32, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_U32M2, RVV_U32M4, u32, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_U32M4, RVV_U32M8, u32, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U64M1, RVV_U64M2, u64, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_U64M2, RVV_U64M4, u64, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_U64M4, RVV_U64M8, u64, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_F32M1, RVV_F32M2, f32, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_F32M2, RVV_F32M4, f32, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_F32M4, RVV_F32M8, f32, m4, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_F64M1, RVV_F64M2, f64, m1, m2)
|
||||
HAL_RVV_GROUP(RVV_F64M2, RVV_F64M4, f64, m2, m4)
|
||||
HAL_RVV_GROUP(RVV_F64M4, RVV_F64M8, f64, m4, m8)
|
||||
|
||||
#undef HAL_RVV_GROUP
|
||||
|
||||
#if defined(__clang__) && __clang_major__ <= 17
|
||||
#define HAL_RVV_GROUP(ONE, FOUR, TYPE, ONE_LMUL, FOUR_LMUL) \
|
||||
template <size_t idx> \
|
||||
inline ONE::VecType vget(FOUR::VecType v) { \
|
||||
return __riscv_vget_v_##TYPE##FOUR_LMUL##_##TYPE##ONE_LMUL(v, idx); \
|
||||
} \
|
||||
template <size_t idx> \
|
||||
inline void vset(FOUR::VecType v, ONE::VecType val) { \
|
||||
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, idx, val); \
|
||||
} \
|
||||
inline FOUR::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3) { \
|
||||
FOUR::VecType v{}; \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 0, v0); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 1, v1); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 2, v2); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, 3, v3); \
|
||||
return v; \
|
||||
}
|
||||
#else
|
||||
#define HAL_RVV_GROUP(ONE, FOUR, TYPE, ONE_LMUL, FOUR_LMUL) \
|
||||
template <size_t idx> \
|
||||
inline ONE::VecType vget(FOUR::VecType v) { \
|
||||
return __riscv_vget_v_##TYPE##FOUR_LMUL##_##TYPE##ONE_LMUL(v, idx); \
|
||||
} \
|
||||
template <size_t idx> \
|
||||
inline void vset(FOUR::VecType v, ONE::VecType val) { \
|
||||
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v, idx, val); \
|
||||
} \
|
||||
inline FOUR::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3) { \
|
||||
return __riscv_vcreate_v_##TYPE##ONE_LMUL##_##TYPE##FOUR_LMUL(v0, v1, v2, v3); \
|
||||
}
|
||||
#endif
|
||||
|
||||
HAL_RVV_GROUP(RVV_I8M1, RVV_I8M4, i8, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_I8M2, RVV_I8M8, i8, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U8M1, RVV_U8M4, u8, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_U8M2, RVV_U8M8, u8, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I16M1, RVV_I16M4, i16, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_I16M2, RVV_I16M8, i16, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U16M1, RVV_U16M4, u16, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_U16M2, RVV_U16M8, u16, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I32M1, RVV_I32M4, i32, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_I32M2, RVV_I32M8, i32, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U32M1, RVV_U32M4, u32, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_U32M2, RVV_U32M8, u32, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I64M1, RVV_I64M4, i64, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_I64M2, RVV_I64M8, i64, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_U64M1, RVV_U64M4, u64, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_U64M2, RVV_U64M8, u64, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_F32M1, RVV_F32M4, f32, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_F32M2, RVV_F32M8, f32, m2, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_F64M1, RVV_F64M4, f64, m1, m4)
|
||||
HAL_RVV_GROUP(RVV_F64M2, RVV_F64M8, f64, m2, m8)
|
||||
|
||||
#undef HAL_RVV_GROUP
|
||||
|
||||
#if defined(__clang__) && __clang_major__ <= 17
|
||||
#define HAL_RVV_GROUP(ONE, EIGHT, TYPE, ONE_LMUL, EIGHT_LMUL) \
|
||||
template <size_t idx> \
|
||||
inline ONE::VecType vget(EIGHT::VecType v) { \
|
||||
return __riscv_vget_v_##TYPE##EIGHT_LMUL##_##TYPE##ONE_LMUL(v, idx); \
|
||||
} \
|
||||
template <size_t idx> \
|
||||
inline void vset(EIGHT::VecType v, ONE::VecType val) { \
|
||||
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, idx, val); \
|
||||
} \
|
||||
inline EIGHT::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3, \
|
||||
ONE::VecType v4, ONE::VecType v5, ONE::VecType v6, ONE::VecType v7) { \
|
||||
EIGHT::VecType v{}; \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 0, v0); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 1, v1); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 2, v2); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 3, v3); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 4, v4); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 5, v5); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 6, v6); \
|
||||
v = __riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, 7, v7); \
|
||||
return v; \
|
||||
}
|
||||
#else
|
||||
#define HAL_RVV_GROUP(ONE, EIGHT, TYPE, ONE_LMUL, EIGHT_LMUL) \
|
||||
template <size_t idx> \
|
||||
inline ONE::VecType vget(EIGHT::VecType v) { \
|
||||
return __riscv_vget_v_##TYPE##EIGHT_LMUL##_##TYPE##ONE_LMUL(v, idx); \
|
||||
} \
|
||||
template <size_t idx> \
|
||||
inline void vset(EIGHT::VecType v, ONE::VecType val) { \
|
||||
__riscv_vset_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v, idx, val); \
|
||||
} \
|
||||
inline EIGHT::VecType vcreate(ONE::VecType v0, ONE::VecType v1, ONE::VecType v2, ONE::VecType v3, \
|
||||
ONE::VecType v4, ONE::VecType v5, ONE::VecType v6, ONE::VecType v7) { \
|
||||
return __riscv_vcreate_v_##TYPE##ONE_LMUL##_##TYPE##EIGHT_LMUL(v0, v1, v2, v3, v4, v5, v6, v7); \
|
||||
}
|
||||
#endif
|
||||
|
||||
HAL_RVV_GROUP(RVV_I8M1, RVV_I8M8, i8, m1, m8)
|
||||
HAL_RVV_GROUP(RVV_U8M1, RVV_U8M8, u8, m1, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I16M1, RVV_I16M8, i16, m1, m8)
|
||||
HAL_RVV_GROUP(RVV_U16M1, RVV_U16M8, u16, m1, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I32M1, RVV_I32M8, i32, m1, m8)
|
||||
HAL_RVV_GROUP(RVV_U32M1, RVV_U32M8, u32, m1, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_I64M1, RVV_I64M8, i64, m1, m8)
|
||||
HAL_RVV_GROUP(RVV_U64M1, RVV_U64M8, u64, m1, m8)
|
||||
|
||||
HAL_RVV_GROUP(RVV_F32M1, RVV_F32M8, f32, m1, m8)
|
||||
HAL_RVV_GROUP(RVV_F64M1, RVV_F64M8, f64, m1, m8)
|
||||
|
||||
#undef HAL_RVV_GROUP
|
||||
|
||||
}} // namespace cv::cv_hal_rvv
|
||||
|
||||
#endif //OPENCV_HAL_RVV_TYPES_HPP_INCLUDED
|
||||
|
|
|
```diff
@@ -20,7 +20,7 @@ static int extraOutputDepths[6][2] = {{CV_32S, CV_32S}, {CV_32S, CV_32F}, {CV_32
 typedef tuple<Size, MatType, MatDepth> Size_MatType_OutMatDepth_t;
 typedef perf::TestBaseWithParam<Size_MatType_OutMatDepth_t> Size_MatType_OutMatDepth;

-typedef tuple<Size, MatType, IntegralOutputDepths> Size_MatType_OutMatDepthArray_t;
+typedef tuple<Size, std::tuple<MatType, IntegralOutputDepths>> Size_MatType_OutMatDepthArray_t;
 typedef perf::TestBaseWithParam<Size_MatType_OutMatDepthArray_t> Size_MatType_OutMatDepthArray;

 PERF_TEST_P(Size_MatType_OutMatDepth, integral,
@@ -83,19 +83,42 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum,
     SANITY_CHECK(sqsum, 1e-6);
 }

+static std::vector<std::tuple<MatType, IntegralOutputDepths>> GetFullSqsumDepthPairs() {
+    static int extraDepths[12][2] = {
+        {CV_8U, DEPTH_32S_64F},
+        {CV_8U, DEPTH_32S_32F},
+        {CV_8U, DEPTH_32S_32S},
+        {CV_8U, DEPTH_32F_64F},
+        {CV_8U, DEPTH_32F_32F},
+        {CV_8U, DEPTH_64F_64F},
+        {CV_16U, DEPTH_64F_64F},
+        {CV_16S, DEPTH_64F_64F},
+        {CV_32F, DEPTH_32F_64F},
+        {CV_32F, DEPTH_32F_32F},
+        {CV_32F, DEPTH_64F_64F},
+        {CV_64F, DEPTH_64F_64F}
+    };
+    std::vector<std::tuple<MatType, IntegralOutputDepths>> valid_pairs;
+    for (size_t i = 0; i < 12; i++) {
+        for (int cn = 1; cn <= 4; cn++) {
+            valid_pairs.emplace_back(CV_MAKETYPE(extraDepths[i][0], cn), extraDepths[i][1]);
+        }
+    }
+    return valid_pairs;
+}
+
 PERF_TEST_P(Size_MatType_OutMatDepthArray, DISABLED_integral_sqsum_full,
     testing::Combine(
         testing::Values(TYPICAL_MAT_SIZES),
-        testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4),
-        testing::Values(DEPTH_32S_32S, DEPTH_32S_32F, DEPTH_32S_64F, DEPTH_32F_32F, DEPTH_32F_64F, DEPTH_64F_64F)
+        testing::ValuesIn(GetFullSqsumDepthPairs())
     )
 )
 {
     Size sz = get<0>(GetParam());
-    int matType = get<1>(GetParam());
-    int *outputDepths = (int *)extraOutputDepths[get<2>(GetParam())];
-    int sdepth = outputDepths[0];
-    int sqdepth = outputDepths[1];
+    auto depths = get<1>(GetParam());
+    int matType = get<0>(depths);
+    int sdepth = extraOutputDepths[get<1>(depths)][0];
+    int sqdepth = extraOutputDepths[get<1>(depths)][1];

     Mat src(sz, matType);
     Mat sum(sz, sdepth);
```
```diff
@@ -486,7 +486,8 @@ cvIntegral( const CvArr* image, CvArr* sumImage,
         ptilted = &tilted;
     }
     cv::integral( src, sum, psqsum ? cv::_OutputArray(*psqsum) : cv::_OutputArray(),
-                  ptilted ? cv::_OutputArray(*ptilted) : cv::_OutputArray(), sum.depth() );
+                  ptilted ? cv::_OutputArray(*ptilted) : cv::_OutputArray(), sum.depth(),
+                  psqsum ? psqsum->depth() : -1 );

     CV_Assert( sum.data == sum0.data && sqsum.data == sqsum0.data && tilted.data == tilted0.data );
 }
```
```diff
@@ -1684,19 +1684,34 @@ void CV_IntegralTest::get_test_array_types_and_sizes( int test_case_idx,
                                                       vector<vector<Size> >& sizes, vector<vector<int> >& types )
 {
     RNG& rng = ts->get_rng();
-    int depth = cvtest::randInt(rng) % 2, sum_depth;
     int cn = cvtest::randInt(rng) % 4 + 1;
     cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
     Size sum_size;

-    depth = depth == 0 ? CV_8U : CV_32F;
-    int b = (cvtest::randInt(rng) & 1) != 0;
-    sum_depth = depth == CV_8U && b ? CV_32S : b ? CV_32F : CV_64F;
+    const int depths[12][3] = {
+        {CV_8U, CV_32S, CV_64F},
+        {CV_8U, CV_32S, CV_32F},
+        {CV_8U, CV_32S, CV_32S},
+        {CV_8U, CV_32F, CV_64F},
+        {CV_8U, CV_32F, CV_32F},
+        {CV_8U, CV_64F, CV_64F},
+        {CV_16U, CV_64F, CV_64F},
+        {CV_16S, CV_64F, CV_64F},
+        {CV_32F, CV_32F, CV_64F},
+        {CV_32F, CV_32F, CV_32F},
+        {CV_32F, CV_64F, CV_64F},
+        {CV_64F, CV_64F, CV_64F},
+    };

-    types[INPUT][0] = CV_MAKETYPE(depth,cn);
+    int random_choice = cvtest::randInt(rng) % 12;
+    int depth = depths[random_choice][0];
+    int sum_depth = depths[random_choice][1];
+    int sqsum_depth = depths[random_choice][2];
+
+    types[INPUT][0] = CV_MAKETYPE(depth, cn);
     types[OUTPUT][0] = types[REF_OUTPUT][0] =
     types[OUTPUT][2] = types[REF_OUTPUT][2] = CV_MAKETYPE(sum_depth, cn);
-    types[OUTPUT][1] = types[REF_OUTPUT][1] = CV_MAKETYPE(CV_64F, cn);
+    types[OUTPUT][1] = types[REF_OUTPUT][1] = CV_MAKETYPE(sqsum_depth, cn);

     sum_size.width = sizes[INPUT][0].width + 1;
     sum_size.height = sizes[INPUT][0].height + 1;
@@ -1738,7 +1753,7 @@ void CV_IntegralTest::run_func()

 static void test_integral( const Mat& img, Mat* sum, Mat* sqsum, Mat* tilted )
 {
-    CV_Assert( img.depth() == CV_32F );
+    CV_Assert( img.depth() == CV_64F );

     sum->create(img.rows+1, img.cols+1, CV_64F);
     if( sqsum )
@@ -1746,7 +1761,7 @@ static void test_integral( const Mat& img, Mat* sum, Mat* sqsum, Mat* tilted )
     if( tilted )
         tilted->create(img.rows+1, img.cols+1, CV_64F);

-    const float* data = img.ptr<float>();
+    const double* data = img.ptr<double>();
     double* sdata = sum->ptr<double>();
     double* sqdata = sqsum ? sqsum->ptr<double>() : 0;
     double* tdata = tilted ? tilted->ptr<double>() : 0;
@@ -1788,7 +1803,7 @@ static void test_integral( const Mat& img, Mat* sum, Mat* sqsum, Mat* tilted )
     else
     {
         ts += tdata[x-tstep-1];
-        if( data > img.ptr<float>() )
+        if( data > img.ptr<double>() )
         {
             ts += data[x-step-1];
             if( x < size.width )
@@ -1824,7 +1839,7 @@ void CV_IntegralTest::prepare_to_validation( int /*test_case_idx*/ )
 {
     if( cn > 1 )
         cvtest::extract(src, plane, i);
-    plane.convertTo(srcf, CV_32F);
+    plane.convertTo(srcf, CV_64F);

     test_integral( srcf, &psum, sqsum0 ? &psqsum : 0, tsum0 ? &ptsum : 0 );
     psum.convertTo(psum2, sum0->depth());
```