From 2a8d4b8e43f6e499c5553edd26056caed284d5a6 Mon Sep 17 00:00:00 2001
From: GenshinImpactStarts
 <147074368+GenshinImpactStarts@users.noreply.github.com>
Date: Thu, 13 Mar 2025 20:56:56 +0800
Subject: [PATCH] Merge pull request #27000 from
 GenshinImpactStarts:cart_to_polar

[HAL RVV] reuse atan | impl cart_to_polar | add perf test #27000

Implement through the existing `cv_hal_cartToPolar32f` and `cv_hal_cartToPolar64f` interfaces.

Add `cartToPolar` performance tests.

`cv_hal_rvv::fast_atan` is refactored so that its core computation (`detail::rvv_atan`) can be reused by `cartToPolar`.

**UPDATE**: Universal Intrinsics (UI) enabled. Since RVV vector types can't be stored in a struct, the UI implementation of `v_atan_f32` is changed from a struct to a function. Both `fastAtan` and `cartToPolar` are affected, so the test results for `atan` are also appended. I have tested the modified UI on RVV and AVX2 and no regressions appear.

Perf test done on MUSE-PI. AVX2 test done on Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz.

```sh
$ opencv_test_core --gtest_filter="*CartToPolar*:*Core_CartPolar_reverse*:*Phase*"
$ opencv_perf_core --gtest_filter="*CartToPolar*:*phase*" --perf_min_samples=300 --perf_force_samples=300
```

Test results comparing the UI path (`ui`) with the RVV HAL (`rvv`):
```
                   Name of Test                       ui    rvv      rvv
                                                                      vs
                                                                      ui
                                                                  (x-factor)
CartToPolar::CartToPolarFixture::(127x61, 32FC1)    0.106  0.059     1.80
CartToPolar::CartToPolarFixture::(127x61, 64FC1)    0.155  0.070     2.20
CartToPolar::CartToPolarFixture::(640x480, 32FC1)   4.188  2.317     1.81
CartToPolar::CartToPolarFixture::(640x480, 64FC1)   6.593  2.889     2.28
CartToPolar::CartToPolarFixture::(1280x720, 32FC1)  12.600 7.057     1.79
CartToPolar::CartToPolarFixture::(1280x720, 64FC1)  19.860 8.797     2.26
CartToPolar::CartToPolarFixture::(1920x1080, 32FC1) 28.295 15.809    1.79
CartToPolar::CartToPolarFixture::(1920x1080, 64FC1) 44.573 19.398    2.30
phase32f::VectorLength::128                         0.002  0.002     1.20
phase32f::VectorLength::1000                        0.008  0.006     1.32
phase32f::VectorLength::131072                      1.061  0.731     1.45
phase32f::VectorLength::524288                      3.997  2.976     1.34
phase32f::VectorLength::1048576                     8.001  5.959     1.34
phase64f::VectorLength::128                         0.002  0.002     1.33
phase64f::VectorLength::1000                        0.012  0.008     1.58
phase64f::VectorLength::131072                      1.648  0.931     1.77
phase64f::VectorLength::524288                      6.836  3.837     1.78
phase64f::VectorLength::1048576                     14.060 7.540     1.86
```

Test results before (`orig`) and after (`pr`) enabling UI on RVV:
```
                   Name of Test                      perf   perf     perf
                                                      ui     ui       ui
                                                     orig    pr       pr
                                                                      vs
                                                                     perf
                                                                      ui
                                                                     orig
                                                                  (x-factor)
CartToPolar::CartToPolarFixture::(127x61, 32FC1)    0.141  0.106     1.33
CartToPolar::CartToPolarFixture::(127x61, 64FC1)    0.187  0.155     1.20
CartToPolar::CartToPolarFixture::(640x480, 32FC1)   5.990  4.188     1.43
CartToPolar::CartToPolarFixture::(640x480, 64FC1)   8.370  6.593     1.27
CartToPolar::CartToPolarFixture::(1280x720, 32FC1)  18.214 12.600    1.45
CartToPolar::CartToPolarFixture::(1280x720, 64FC1)  25.365 19.860    1.28
CartToPolar::CartToPolarFixture::(1920x1080, 32FC1) 40.437 28.295    1.43
CartToPolar::CartToPolarFixture::(1920x1080, 64FC1) 56.699 44.573    1.27
phase32f::VectorLength::128                         0.003  0.002     1.54
phase32f::VectorLength::1000                        0.016  0.008     1.90
phase32f::VectorLength::131072                      2.048  1.061     1.93
phase32f::VectorLength::524288                      8.219  3.997     2.06
phase32f::VectorLength::1048576                     16.426 8.001     2.05
phase64f::VectorLength::128                         0.003  0.002     1.44
phase64f::VectorLength::1000                        0.020  0.012     1.60
phase64f::VectorLength::131072                      2.621  1.648     1.59
phase64f::VectorLength::524288                      10.780 6.836     1.58
phase64f::VectorLength::1048576                     22.723 14.060    1.62
```

Test results before (`orig`) and after (`pr`) modifying UI on AVX2:
```
                   Name of Test                     perf  perf     perf
                                                    avx2  avx2     avx2
                                                    orig   pr       pr
                                                                    vs
                                                                   perf
                                                                   avx2
                                                                   orig
                                                                (x-factor)
CartToPolar::CartToPolarFixture::(127x61, 32FC1)    0.006 0.005    1.14
CartToPolar::CartToPolarFixture::(127x61, 64FC1)    0.010 0.009    1.08
CartToPolar::CartToPolarFixture::(640x480, 32FC1)   0.273 0.264    1.03
CartToPolar::CartToPolarFixture::(640x480, 64FC1)   0.511 0.487    1.05
CartToPolar::CartToPolarFixture::(1280x720, 32FC1)  0.760 0.723    1.05
CartToPolar::CartToPolarFixture::(1280x720, 64FC1)  2.009 1.937    1.04
CartToPolar::CartToPolarFixture::(1920x1080, 32FC1) 1.996 1.923    1.04
CartToPolar::CartToPolarFixture::(1920x1080, 64FC1) 5.721 5.509    1.04
phase32f::VectorLength::128                         0.000 0.000    0.98
phase32f::VectorLength::1000                        0.001 0.001    0.97
phase32f::VectorLength::131072                      0.105 0.111    0.95
phase32f::VectorLength::524288                      0.402 0.402    1.00
phase32f::VectorLength::1048576                     0.775 0.767    1.01
phase64f::VectorLength::128                         0.000 0.000    1.00
phase64f::VectorLength::1000                        0.001 0.001    1.01
phase64f::VectorLength::131072                      0.163 0.162    1.01
phase64f::VectorLength::524288                      0.669 0.653    1.02
phase64f::VectorLength::1048576                     1.660 1.634    1.02
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 3rdparty/hal_rvv/hal_rvv.hpp                  |   1 +
 3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp         | 122 +++++++++---------
 .../hal_rvv/hal_rvv_1p0/cart_to_polar.hpp     |  48 +++++++
 modules/core/perf/perf_math.cpp               |  22 ++++
 modules/core/src/mathfuncs_core.simd.hpp      |  76 +++++------
 5 files changed, 158 insertions(+), 111 deletions(-)
 create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp

diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp
index 83b1ea272c..f2f9aa68d4 100644
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@@ -31,6 +31,7 @@
 #include "hal_rvv_1p0/atan.hpp" // core
 #include "hal_rvv_1p0/split.hpp" // core
 #include "hal_rvv_1p0/magnitude.hpp" // core
+#include "hal_rvv_1p0/cart_to_polar.hpp" // core
 #include "hal_rvv_1p0/flip.hpp" // core
 #include "hal_rvv_1p0/lut.hpp" // core
 #include "hal_rvv_1p0/exp.hpp" // core
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
index 2134d98a6e..2e4f9c2a67 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
@@ -13,67 +13,76 @@
 
 #include <cfloat>
 
-namespace cv::cv_hal_rvv {
+namespace cv { namespace cv_hal_rvv {
 
 namespace detail {
 // ref: mathfuncs_core.simd.hpp
 static constexpr float pi = CV_PI;
-static constexpr float atan2_p1 = 0.9997878412794807F * (180 / pi);
-static constexpr float atan2_p3 = -0.3258083974640975F * (180 / pi);
-static constexpr float atan2_p5 = 0.1555786518463281F * (180 / pi);
-static constexpr float atan2_p7 = -0.04432655554792128F * (180 / pi);
 
-__attribute__((always_inline)) inline vfloat32m4_t
-rvv_atan_f32(vfloat32m4_t vy, vfloat32m4_t vx, size_t vl, float p7,
-             vfloat32m4_t vp5, vfloat32m4_t vp3, vfloat32m4_t vp1,
-             float angle_90_deg) {
+struct AtanParams
+{
+    float p1, p3, p5, p7, angle_90;
+};
+
+static constexpr AtanParams atan_params_rad {
+    0.9997878412794807F,
+    -0.3258083974640975F,
+    0.1555786518463281F,
+    -0.04432655554792128F,
+    90.F * (pi / 180.F)};
+static constexpr AtanParams atan_params_deg {
+    atan_params_rad.p1 * (180 / pi),
+    atan_params_rad.p3 * (180 / pi),
+    atan_params_rad.p5 * (180 / pi),
+    atan_params_rad.p7 * (180 / pi),
+    90.F};
+
+template <typename VEC_T>
+__attribute__((always_inline)) inline VEC_T
+    rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params)
+{
     const auto ax = __riscv_vfabs(vx, vl);
     const auto ay = __riscv_vfabs(vy, vl);
-    const auto c = __riscv_vfdiv(
-        __riscv_vfmin(ax, ay, vl),
-        __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), vl);
+    // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar.
+    const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl),
+                                 __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl),
+                                 vl);
     const auto c2 = __riscv_vfmul(c, c, vl);
 
-    auto a = __riscv_vfmadd(c2, p7, vp5, vl);
-    a = __riscv_vfmadd(a, c2, vp3, vl);
-    a = __riscv_vfmadd(a, c2, vp1, vl);
+    // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional
+    // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased
+    // from 5.952ms to 5.805ms on Muse Pi)
+    // Additionally, when registers are nearly fully utilized (though not yet exhausted), the
+    // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in
+    // cv::cv_hal_rvv::fast_atan_64).
+    // Saving registers can also make this function more reusable in other contexts.
+    // Therefore, vfmadd is not used here.
+    auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl);
+    a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl);
+    a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl);
     a = __riscv_vfmul(a, c, vl);
 
-    const auto mask = __riscv_vmflt(ax, ay, vl);
-    a = __riscv_vfrsub_mu(mask, a, a, angle_90_deg, vl);
-
-    a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, angle_90_deg * 2,
-                          vl);
-    a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, angle_90_deg * 4,
-                          vl);
+    a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl);
+    a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl);
+    a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl);
 
     return a;
 }
 
-} // namespace detail
+}  // namespace detail
 
-inline int fast_atan_32(const float *y, const float *x, float *dst, size_t n,
-                        bool angle_in_deg) {
-    const float scale = angle_in_deg ? 1.f : CV_PI / 180.f;
-    const float p1 = detail::atan2_p1 * scale;
-    const float p3 = detail::atan2_p3 * scale;
-    const float p5 = detail::atan2_p5 * scale;
-    const float p7 = detail::atan2_p7 * scale;
-    const float angle_90_deg = 90.F * scale;
+inline int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg)
+{
+    auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad;
 
-    static size_t vlmax = __riscv_vsetvlmax_e32m4();
-    auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax);
-    auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax);
-    auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax);
-
-    for (size_t vl{}; n > 0; n -= vl) {
+    for (size_t vl = 0; n > 0; n -= vl)
+    {
         vl = __riscv_vsetvl_e32m4(n);
 
         auto vy = __riscv_vle32_v_f32m4(y, vl);
         auto vx = __riscv_vle32_v_f32m4(x, vl);
 
-        auto a =
-            detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg);
+        auto a = detail::rvv_atan(vy, vx, vl, atan_params);
 
         __riscv_vse32(dst, a, vl);
 
@@ -85,37 +94,22 @@ inline int fast_atan_32(const float *y, const float *x, float *dst, size_t n,
     return CV_HAL_ERROR_OK;
 }
 
-inline int fast_atan_64(const double *y, const double *x, double *dst, size_t n,
-                        bool angle_in_deg) {
+inline int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg)
+{
     // this also uses float32 version, ref: mathfuncs_core.simd.hpp
 
-    const float scale = angle_in_deg ? 1.f : CV_PI / 180.f;
-    const float p1 = detail::atan2_p1 * scale;
-    const float p3 = detail::atan2_p3 * scale;
-    const float p5 = detail::atan2_p5 * scale;
-    const float p7 = detail::atan2_p7 * scale;
-    const float angle_90_deg = 90.F * scale;
+    auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad;
 
-    static size_t vlmax = __riscv_vsetvlmax_e32m4();
-    auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax);
-    auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax);
-    auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax);
-
-    for (size_t vl{}; n > 0; n -= vl) {
+    for (size_t vl = 0; n > 0; n -= vl)
+    {
         vl = __riscv_vsetvl_e64m8(n);
 
-        auto wy = __riscv_vle64_v_f64m8(y, vl);
-        auto wx = __riscv_vle64_v_f64m8(x, vl);
+        auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl);
+        auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl);
 
-        auto vy = __riscv_vfncvt_f_f_w_f32m4(wy, vl);
-        auto vx = __riscv_vfncvt_f_f_w_f32m4(wx, vl);
+        auto a = detail::rvv_atan(vy, vx, vl, atan_params);
 
-        auto a =
-            detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg);
-
-        auto wa = __riscv_vfwcvt_f_f_v_f64m8(a, vl);
-
-        __riscv_vse64(dst, wa, vl);
+        __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl);
 
         x += vl;
         y += vl;
@@ -125,4 +119,4 @@ inline int fast_atan_64(const double *y, const double *x, double *dst, size_t n,
     return CV_HAL_ERROR_OK;
 }
 
-} // namespace cv::cv_hal_rvv
+}}  // namespace cv::cv_hal_rvv
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp
new file mode 100644
index 0000000000..676133b668
--- /dev/null
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/cart_to_polar.hpp
@@ -0,0 +1,48 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
+
+#ifndef OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED
+#define OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+#include "hal_rvv_1p0/atan.hpp"
+#include "hal_rvv_1p0/sqrt.hpp"
+#include "hal_rvv_1p0/types.hpp"
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_cartToPolar32f
+#define cv_hal_cartToPolar32f cv::cv_hal_rvv::cartToPolar<cv::cv_hal_rvv::RVV_F32M4>
+#undef cv_hal_cartToPolar64f
+#define cv_hal_cartToPolar64f cv::cv_hal_rvv::cartToPolar<cv::cv_hal_rvv::RVV_F64M8>
+
+template <typename RVV_T, typename T = typename RVV_T::ElemType>
+inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool angleInDegrees)
+{
+    using CalType = RVV_SameLen<float, RVV_T>;
+    auto atan_params = angleInDegrees ? detail::atan_params_deg : detail::atan_params_rad;
+    size_t vl;
+    for (; len > 0; len -= (int)vl, x += vl, y += vl, mag += vl, angle += vl)
+    {
+        vl = RVV_T::setvl(len);
+
+        auto vx = CalType::cast(RVV_T::vload(x, vl), vl);
+        auto vy = CalType::cast(RVV_T::vload(y, vl), vl);
+
+        auto vmag = detail::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl);
+        RVV_T::vstore(mag, RVV_T::cast(vmag, vl), vl);
+
+        auto vangle = detail::rvv_atan(vy, vx, vl, atan_params);
+        RVV_T::vstore(angle, RVV_T::cast(vangle, vl), vl);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+}}  // namespace cv::cv_hal_rvv
+
+#endif  // OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED
diff --git a/modules/core/perf/perf_math.cpp b/modules/core/perf/perf_math.cpp
index c06fda44da..398a3ad651 100644
--- a/modules/core/perf/perf_math.cpp
+++ b/modules/core/perf/perf_math.cpp
@@ -57,6 +57,28 @@ PERF_TEST_P(MagnitudeFixture, Magnitude,
     SANITY_CHECK_NOTHING();
 }
 
+///////////// Cart to Polar /////////////
+
+typedef Size_MatType CartToPolarFixture;
+
+PERF_TEST_P(CartToPolarFixture, CartToPolar,
+    testing::Combine(testing::Values(TYPICAL_MAT_SIZES), testing::Values(CV_32F, CV_64F)))
+{
+    cv::Size size = std::get<0>(GetParam());
+    int type = std::get<1>(GetParam());
+
+    cv::Mat x(size, type);
+    cv::Mat y(size, type);
+    cv::Mat magnitude(size, type);
+    cv::Mat angle(size, type);
+
+    declare.in(x, y, WARMUP_RNG).out(magnitude, angle);
+
+    TEST_CYCLE() cv::cartToPolar(x, y, magnitude, angle);
+
+    SANITY_CHECK_NOTHING();
+}
+
 // generates random vectors, performs Gram-Schmidt orthogonalization on them
 Mat randomOrtho(int rows, int ftype, RNG& rng)
 {
diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp
index 0d9d9272e6..e9d57a5c4d 100644
--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -73,48 +73,30 @@ static inline float atan_f32(float y, float x)
 }
 #endif
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
-struct v_atan_f32
+v_float32 v_atan_f32(const v_float32& y, const v_float32& x)
 {
-    explicit v_atan_f32(const float& scale)
-    {
-        eps = vx_setall_f32((float)DBL_EPSILON);
-        z = vx_setzero_f32();
-        p7 = vx_setall_f32(atan2_p7);
-        p5 = vx_setall_f32(atan2_p5);
-        p3 = vx_setall_f32(atan2_p3);
-        p1 = vx_setall_f32(atan2_p1);
-        val90 = vx_setall_f32(90.f);
-        val180 = vx_setall_f32(180.f);
-        val360 = vx_setall_f32(360.f);
-        s = vx_setall_f32(scale);
-    }
+    v_float32 eps = vx_setall_f32((float)DBL_EPSILON);
+    v_float32 z = vx_setzero_f32();
+    v_float32 p7 = vx_setall_f32(atan2_p7);
+    v_float32 p5 = vx_setall_f32(atan2_p5);
+    v_float32 p3 = vx_setall_f32(atan2_p3);
+    v_float32 p1 = vx_setall_f32(atan2_p1);
+    v_float32 val90 = vx_setall_f32(90.f);
+    v_float32 val180 = vx_setall_f32(180.f);
+    v_float32 val360 = vx_setall_f32(360.f);
 
-    v_float32 compute(const v_float32& y, const v_float32& x)
-    {
-        v_float32 ax = v_abs(x);
-        v_float32 ay = v_abs(y);
-        v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps));
-        v_float32 cc = v_mul(c, c);
-        v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c);
-        a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a));
-        a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a);
-        a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a);
-        return v_mul(a, this->s);
-    }
-
-    v_float32 eps;
-    v_float32 z;
-    v_float32 p7;
-    v_float32 p5;
-    v_float32 p3;
-    v_float32 p1;
-    v_float32 val90;
-    v_float32 val180;
-    v_float32 val360;
-    v_float32 s;
-};
+    v_float32 ax = v_abs(x);
+    v_float32 ay = v_abs(y);
+    v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), eps));
+    v_float32 cc = v_mul(c, c);
+    v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1), c);
+    a = v_select(v_ge(ax, ay), a, v_sub(val90, a));
+    a = v_select(v_lt(x, z), v_sub(val180, a), a);
+    a = v_select(v_lt(y, z), v_sub(val360, a), a);
+    return a;
+}
 
 #endif
 
@@ -124,9 +106,9 @@ static void cartToPolar32f_(const float *X, const float *Y, float *mag, float *a
 {
     float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
-    v_atan_f32 v(scale);
+    v_float32 s = vx_setall_f32(scale);
 
     for( ; i < len; i += VECSZ*2 )
     {
@@ -148,8 +130,8 @@ static void cartToPolar32f_(const float *X, const float *Y, float *mag, float *a
         v_float32 m0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
         v_float32 m1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
 
-        v_float32 r0 = v.compute(y0, x0);
-        v_float32 r1 = v.compute(y1, x1);
+        v_float32 r0 = v_mul(v_atan_f32(y0, x0), s);
+        v_float32 r1 = v_mul(v_atan_f32(y1, x1), s);
 
         v_store(mag + i, m0);
         v_store(mag + i + VECSZ, m1);
@@ -200,9 +182,9 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len,
 {
     float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
-    v_atan_f32 v(scale);
+    v_float32 s = vx_setall_f32(scale);
 
     for( ; i < len; i += VECSZ*2 )
     {
@@ -221,8 +203,8 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len,
         v_float32 y1 = vx_load(Y + i + VECSZ);
         v_float32 x1 = vx_load(X + i + VECSZ);
 
-        v_float32 r0 = v.compute(y0, x0);
-        v_float32 r1 = v.compute(y1, x1);
+        v_float32 r0 = v_mul(v_atan_f32(y0, x0), s);
+        v_float32 r1 = v_mul(v_atan_f32(y1, x1), s);
 
         v_store(angle + i, r0);
         v_store(angle + i + VECSZ, r1);