From 2a8d4b8e43f6e499c5553edd26056caed284d5a6 Mon Sep 17 00:00:00 2001 From: GenshinImpactStarts <147074368+GenshinImpactStarts@users.noreply.github.com> Date: Thu, 13 Mar 2025 20:56:56 +0800 Subject: [PATCH] Merge pull request #27000 from GenshinImpactStarts:cart_to_polar [HAL RVV] reuse atan | impl cart_to_polar | add perf test #27000 Implement through the existing `cv_hal_cartToPolar32f` and `cv_hal_cartToPolar64f` interfaces. Add `cartToPolar` performance tests. cv_hal_rvv::fast_atan is modified to make it more reusable because it's needed in cartToPolar. **UPDATE**: Universal Intrinsics (UI) enabled. Since RVV vector types can't be stored in a struct, the UI implementation of `v_atan_f32` is modified. Both `fastAtan` and `cartToPolar` are affected, so the test results for `atan` are also appended. I have tested the modified UI on RVV and AVX2 and no regressions appear. Perf test done on MUSE-PI. AVX2 test done on Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz. ```sh $ opencv_test_core --gtest_filter="*CartToPolar*:*Core_CartPolar_reverse*:*Phase*" $ opencv_perf_core --gtest_filter="*CartToPolar*:*phase*" --perf_min_samples=300 --perf_force_samples=300 ``` Test results comparing the enabled UI and the HAL: ``` Name of Test ui rvv rvv vs ui (x-factor) CartToPolar::CartToPolarFixture::(127x61, 32FC1) 0.106 0.059 1.80 CartToPolar::CartToPolarFixture::(127x61, 64FC1) 0.155 0.070 2.20 CartToPolar::CartToPolarFixture::(640x480, 32FC1) 4.188 2.317 1.81 CartToPolar::CartToPolarFixture::(640x480, 64FC1) 6.593 2.889 2.28 CartToPolar::CartToPolarFixture::(1280x720, 32FC1) 12.600 7.057 1.79 CartToPolar::CartToPolarFixture::(1280x720, 64FC1) 19.860 8.797 2.26 CartToPolar::CartToPolarFixture::(1920x1080, 32FC1) 28.295 15.809 1.79 CartToPolar::CartToPolarFixture::(1920x1080, 64FC1) 44.573 19.398 2.30 phase32f::VectorLength::128 0.002 0.002 1.20 phase32f::VectorLength::1000 0.008 0.006 1.32 phase32f::VectorLength::131072 1.061 0.731 1.45 phase32f::VectorLength::524288 3.997 2.976 1.34 phase32f::VectorLength::1048576 
8.001 5.959 1.34 phase64f::VectorLength::128 0.002 0.002 1.33 phase64f::VectorLength::1000 0.012 0.008 1.58 phase64f::VectorLength::131072 1.648 0.931 1.77 phase64f::VectorLength::524288 6.836 3.837 1.78 phase64f::VectorLength::1048576 14.060 7.540 1.86 ``` Test result before and after enabling UI on RVV: ``` Name of Test perf perf perf ui ui ui orig pr pr vs perf ui orig (x-factor) CartToPolar::CartToPolarFixture::(127x61, 32FC1) 0.141 0.106 1.33 CartToPolar::CartToPolarFixture::(127x61, 64FC1) 0.187 0.155 1.20 CartToPolar::CartToPolarFixture::(640x480, 32FC1) 5.990 4.188 1.43 CartToPolar::CartToPolarFixture::(640x480, 64FC1) 8.370 6.593 1.27 CartToPolar::CartToPolarFixture::(1280x720, 32FC1) 18.214 12.600 1.45 CartToPolar::CartToPolarFixture::(1280x720, 64FC1) 25.365 19.860 1.28 CartToPolar::CartToPolarFixture::(1920x1080, 32FC1) 40.437 28.295 1.43 CartToPolar::CartToPolarFixture::(1920x1080, 64FC1) 56.699 44.573 1.27 phase32f::VectorLength::128 0.003 0.002 1.54 phase32f::VectorLength::1000 0.016 0.008 1.90 phase32f::VectorLength::131072 2.048 1.061 1.93 phase32f::VectorLength::524288 8.219 3.997 2.06 phase32f::VectorLength::1048576 16.426 8.001 2.05 phase64f::VectorLength::128 0.003 0.002 1.44 phase64f::VectorLength::1000 0.020 0.012 1.60 phase64f::VectorLength::131072 2.621 1.648 1.59 phase64f::VectorLength::524288 10.780 6.836 1.58 phase64f::VectorLength::1048576 22.723 14.060 1.62 ``` Test result before and after modifying UI on AVX2: ``` Name of Test perf perf perf avx2 avx2 avx2 orig pr pr vs perf avx2 orig (x-factor) CartToPolar::CartToPolarFixture::(127x61, 32FC1) 0.006 0.005 1.14 CartToPolar::CartToPolarFixture::(127x61, 64FC1) 0.010 0.009 1.08 CartToPolar::CartToPolarFixture::(640x480, 32FC1) 0.273 0.264 1.03 CartToPolar::CartToPolarFixture::(640x480, 64FC1) 0.511 0.487 1.05 CartToPolar::CartToPolarFixture::(1280x720, 32FC1) 0.760 0.723 1.05 CartToPolar::CartToPolarFixture::(1280x720, 64FC1) 2.009 1.937 1.04 CartToPolar::CartToPolarFixture::(1920x1080, 
32FC1) 1.996 1.923 1.04 CartToPolar::CartToPolarFixture::(1920x1080, 64FC1) 5.721 5.509 1.04 phase32f::VectorLength::128 0.000 0.000 0.98 phase32f::VectorLength::1000 0.001 0.001 0.97 phase32f::VectorLength::131072 0.105 0.111 0.95 phase32f::VectorLength::524288 0.402 0.402 1.00 phase32f::VectorLength::1048576 0.775 0.767 1.01 phase64f::VectorLength::128 0.000 0.000 1.00 phase64f::VectorLength::1000 0.001 0.001 1.01 phase64f::VectorLength::131072 0.163 0.162 1.01 phase64f::VectorLength::524288 0.669 0.653 1.02 phase64f::VectorLength::1048576 1.660 1.634 1.02 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake --- 3rdparty/hal_rvv/hal_rvv.hpp | 1 + 3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp | 122 +++++++++--------- .../hal_rvv/hal_rvv_1p0/cart_to_polar.hpp | 48 +++++++ modules/core/perf/perf_math.cpp | 22 ++++ modules/core/src/mathfuncs_core.simd.hpp | 76 +++++------ 5 files changed, 158 insertions(+), 111 deletions(-) create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp index 83b1ea272c..f2f9aa68d4 100644 --- a/3rdparty/hal_rvv/hal_rvv.hpp +++ b/3rdparty/hal_rvv/hal_rvv.hpp @@ -31,6 +31,7 @@ #include "hal_rvv_1p0/atan.hpp" // core #include "hal_rvv_1p0/split.hpp" // core #include "hal_rvv_1p0/magnitude.hpp" // core +#include "hal_rvv_1p0/cart_to_polar.hpp" // core #include "hal_rvv_1p0/flip.hpp" // core #include "hal_rvv_1p0/lut.hpp" // core #include "hal_rvv_1p0/exp.hpp" // core diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp index 2134d98a6e..2e4f9c2a67 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp @@ -13,67 +13,76 @@ #include -namespace cv::cv_hal_rvv { +namespace cv { namespace cv_hal_rvv { namespace detail { // ref: mathfuncs_core.simd.hpp static constexpr float pi = CV_PI; -static constexpr float atan2_p1 = 0.9997878412794807F * (180 / pi); -static constexpr float atan2_p3 = -0.3258083974640975F * (180 / pi); -static constexpr float atan2_p5 = 0.1555786518463281F * (180 / pi); -static constexpr float atan2_p7 = -0.04432655554792128F * (180 / pi); -__attribute__((always_inline)) inline vfloat32m4_t -rvv_atan_f32(vfloat32m4_t vy, vfloat32m4_t vx, size_t vl, float p7, - vfloat32m4_t vp5, vfloat32m4_t vp3, vfloat32m4_t vp1, - float angle_90_deg) { +struct AtanParams +{ + float p1, p3, p5, p7, angle_90; +}; + +static constexpr AtanParams atan_params_rad { + 0.9997878412794807F, + -0.3258083974640975F, + 
0.1555786518463281F, + -0.04432655554792128F, + 90.F * (pi / 180.F)}; +static constexpr AtanParams atan_params_deg { + atan_params_rad.p1 * (180 / pi), + atan_params_rad.p3 * (180 / pi), + atan_params_rad.p5 * (180 / pi), + atan_params_rad.p7 * (180 / pi), + 90.F}; + +template +__attribute__((always_inline)) inline VEC_T + rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params) +{ const auto ax = __riscv_vfabs(vx, vl); const auto ay = __riscv_vfabs(vy, vl); - const auto c = __riscv_vfdiv( - __riscv_vfmin(ax, ay, vl), - __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), vl); + // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. + const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), + __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), + vl); const auto c2 = __riscv_vfmul(c, c, vl); - auto a = __riscv_vfmadd(c2, p7, vp5, vl); - a = __riscv_vfmadd(a, c2, vp3, vl); - a = __riscv_vfmadd(a, c2, vp1, vl); + // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional + // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased + // from 5.952ms to 5.805ms on Muse Pi) + // Additionally, when registers are nearly fully utilized (though not yet exhausted), the + // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in + // cv::cv_hal_rvv::fast_atan_64). + // Saving registers can also make this function more reusable in other contexts. + // Therefore, vfmadd is not used here. 
+ auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); a = __riscv_vfmul(a, c, vl); - const auto mask = __riscv_vmflt(ax, ay, vl); - a = __riscv_vfrsub_mu(mask, a, a, angle_90_deg, vl); - - a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, angle_90_deg * 2, - vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, angle_90_deg * 4, - vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); return a; } -} // namespace detail +} // namespace detail -inline int fast_atan_32(const float *y, const float *x, float *dst, size_t n, - bool angle_in_deg) { - const float scale = angle_in_deg ? 1.f : CV_PI / 180.f; - const float p1 = detail::atan2_p1 * scale; - const float p3 = detail::atan2_p3 * scale; - const float p5 = detail::atan2_p5 * scale; - const float p7 = detail::atan2_p7 * scale; - const float angle_90_deg = 90.F * scale; +inline int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) +{ + auto atan_params = angle_in_deg ? 
detail::atan_params_deg : detail::atan_params_rad; - static size_t vlmax = __riscv_vsetvlmax_e32m4(); - auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax); - auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax); - auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax); - - for (size_t vl{}; n > 0; n -= vl) { + for (size_t vl = 0; n > 0; n -= vl) + { vl = __riscv_vsetvl_e32m4(n); auto vy = __riscv_vle32_v_f32m4(y, vl); auto vx = __riscv_vle32_v_f32m4(x, vl); - auto a = - detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg); + auto a = detail::rvv_atan(vy, vx, vl, atan_params); __riscv_vse32(dst, a, vl); @@ -85,37 +94,22 @@ inline int fast_atan_32(const float *y, const float *x, float *dst, size_t n, return CV_HAL_ERROR_OK; } -inline int fast_atan_64(const double *y, const double *x, double *dst, size_t n, - bool angle_in_deg) { +inline int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) +{ // this also uses float32 version, ref: mathfuncs_core.simd.hpp - const float scale = angle_in_deg ? 1.f : CV_PI / 180.f; - const float p1 = detail::atan2_p1 * scale; - const float p3 = detail::atan2_p3 * scale; - const float p5 = detail::atan2_p5 * scale; - const float p7 = detail::atan2_p7 * scale; - const float angle_90_deg = 90.F * scale; + auto atan_params = angle_in_deg ? 
detail::atan_params_deg : detail::atan_params_rad; - static size_t vlmax = __riscv_vsetvlmax_e32m4(); - auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax); - auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax); - auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax); - - for (size_t vl{}; n > 0; n -= vl) { + for (size_t vl = 0; n > 0; n -= vl) + { vl = __riscv_vsetvl_e64m8(n); - auto wy = __riscv_vle64_v_f64m8(y, vl); - auto wx = __riscv_vle64_v_f64m8(x, vl); + auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); + auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); - auto vy = __riscv_vfncvt_f_f_w_f32m4(wy, vl); - auto vx = __riscv_vfncvt_f_f_w_f32m4(wx, vl); + auto a = detail::rvv_atan(vy, vx, vl, atan_params); - auto a = - detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg); - - auto wa = __riscv_vfwcvt_f_f_v_f64m8(a, vl); - - __riscv_vse64(dst, wa, vl); + __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); x += vl; y += vl; @@ -125,4 +119,4 @@ inline int fast_atan_64(const double *y, const double *x, double *dst, size_t n, return CV_HAL_ERROR_OK; } -} // namespace cv::cv_hal_rvv +}} // namespace cv::cv_hal_rvv diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp new file mode 100644 index 0000000000..676133b668 --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp @@ -0,0 +1,48 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
+ +#ifndef OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +#define OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED + +#include + +#include "hal_rvv_1p0/atan.hpp" +#include "hal_rvv_1p0/sqrt.hpp" +#include "hal_rvv_1p0/types.hpp" + +namespace cv { namespace cv_hal_rvv { + +#undef cv_hal_cartToPolar32f +#define cv_hal_cartToPolar32f cv::cv_hal_rvv::cartToPolar +#undef cv_hal_cartToPolar64f +#define cv_hal_cartToPolar64f cv::cv_hal_rvv::cartToPolar + +template +inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool angleInDegrees) +{ + using CalType = RVV_SameLen; + auto atan_params = angleInDegrees ? detail::atan_params_deg : detail::atan_params_rad; + size_t vl; + for (; len > 0; len -= (int)vl, x += vl, y += vl, mag += vl, angle += vl) + { + vl = RVV_T::setvl(len); + + auto vx = CalType::cast(RVV_T::vload(x, vl), vl); + auto vy = CalType::cast(RVV_T::vload(y, vl), vl); + + auto vmag = detail::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + RVV_T::vstore(mag, RVV_T::cast(vmag, vl), vl); + + auto vangle = detail::rvv_atan(vy, vx, vl, atan_params); + RVV_T::vstore(angle, RVV_T::cast(vangle, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +}} // namespace cv::cv_hal_rvv + +#endif // OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED diff --git a/modules/core/perf/perf_math.cpp b/modules/core/perf/perf_math.cpp index c06fda44da..398a3ad651 100644 --- a/modules/core/perf/perf_math.cpp +++ b/modules/core/perf/perf_math.cpp @@ -57,6 +57,28 @@ PERF_TEST_P(MagnitudeFixture, Magnitude, SANITY_CHECK_NOTHING(); } +///////////// Cart to Polar ///////////// + +typedef Size_MatType CartToPolarFixture; + +PERF_TEST_P(CartToPolarFixture, CartToPolar, + testing::Combine(testing::Values(TYPICAL_MAT_SIZES), testing::Values(CV_32F, CV_64F))) +{ + cv::Size size = std::get<0>(GetParam()); + int type = std::get<1>(GetParam()); + + cv::Mat x(size, type); + cv::Mat y(size, type); + cv::Mat magnitude(size, type); + cv::Mat angle(size, type); + + declare.in(x, y, 
WARMUP_RNG).out(magnitude, angle); + + TEST_CYCLE() cv::cartToPolar(x, y, magnitude, angle); + + SANITY_CHECK_NOTHING(); +} + // generates random vectors, performs Gram-Schmidt orthogonalization on them Mat randomOrtho(int rows, int ftype, RNG& rng) { diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index 0d9d9272e6..e9d57a5c4d 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -73,48 +73,30 @@ static inline float atan_f32(float y, float x) } #endif -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) -struct v_atan_f32 +v_float32 v_atan_f32(const v_float32& y, const v_float32& x) { - explicit v_atan_f32(const float& scale) - { - eps = vx_setall_f32((float)DBL_EPSILON); - z = vx_setzero_f32(); - p7 = vx_setall_f32(atan2_p7); - p5 = vx_setall_f32(atan2_p5); - p3 = vx_setall_f32(atan2_p3); - p1 = vx_setall_f32(atan2_p1); - val90 = vx_setall_f32(90.f); - val180 = vx_setall_f32(180.f); - val360 = vx_setall_f32(360.f); - s = vx_setall_f32(scale); - } + v_float32 eps = vx_setall_f32((float)DBL_EPSILON); + v_float32 z = vx_setzero_f32(); + v_float32 p7 = vx_setall_f32(atan2_p7); + v_float32 p5 = vx_setall_f32(atan2_p5); + v_float32 p3 = vx_setall_f32(atan2_p3); + v_float32 p1 = vx_setall_f32(atan2_p1); + v_float32 val90 = vx_setall_f32(90.f); + v_float32 val180 = vx_setall_f32(180.f); + v_float32 val360 = vx_setall_f32(360.f); - v_float32 compute(const v_float32& y, const v_float32& x) - { - v_float32 ax = v_abs(x); - v_float32 ay = v_abs(y); - v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps)); - v_float32 cc = v_mul(c, c); - v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c); - a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a)); - a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a); - a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a); - return v_mul(a, this->s); - } - - v_float32 eps; - v_float32 z; 
- v_float32 p7; - v_float32 p5; - v_float32 p3; - v_float32 p1; - v_float32 val90; - v_float32 val180; - v_float32 val360; - v_float32 s; -}; + v_float32 ax = v_abs(x); + v_float32 ay = v_abs(y); + v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), eps)); + v_float32 cc = v_mul(c, c); + v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1), c); + a = v_select(v_ge(ax, ay), a, v_sub(val90, a)); + a = v_select(v_lt(x, z), v_sub(val180, a), a); + a = v_select(v_lt(y, z), v_sub(val360, a), a); + return a; +} #endif @@ -124,9 +106,9 @@ static void cartToPolar32f_(const float *X, const float *Y, float *mag, float *a { float scale = angleInDegrees ? 1.f : (float)(CV_PI/180); int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const int VECSZ = VTraits::vlanes(); - v_atan_f32 v(scale); + v_float32 s = vx_setall_f32(scale); for( ; i < len; i += VECSZ*2 ) { @@ -148,8 +130,8 @@ static void cartToPolar32f_(const float *X, const float *Y, float *mag, float *a v_float32 m0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0))); v_float32 m1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1))); - v_float32 r0 = v.compute(y0, x0); - v_float32 r1 = v.compute(y1, x1); + v_float32 r0 = v_mul(v_atan_f32(y0, x0), s); + v_float32 r1 = v_mul(v_atan_f32(y1, x1), s); v_store(mag + i, m0); v_store(mag + i + VECSZ, m1); @@ -200,9 +182,9 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, { float scale = angleInDegrees ? 
1.f : (float)(CV_PI/180); int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const int VECSZ = VTraits::vlanes(); - v_atan_f32 v(scale); + v_float32 s = vx_setall_f32(scale); for( ; i < len; i += VECSZ*2 ) { @@ -221,8 +203,8 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, v_float32 y1 = vx_load(Y + i + VECSZ); v_float32 x1 = vx_load(X + i + VECSZ); - v_float32 r0 = v.compute(y0, x0); - v_float32 r1 = v.compute(y1, x1); + v_float32 r0 = v_mul(v_atan_f32(y0, x0), s); + v_float32 r1 = v_mul(v_atan_f32(y1, x1), s); v_store(angle + i, r0); v_store(angle + i + VECSZ, r1);