Merge pull request #26868 from FantasqueX/bayer2gray-simd-2

Use universal intrinsics in bayer2gray
2025-12-06 00:19:46 +01:00 · 2025-03-11 09:55:09 +03:00 · 2025-03-11 09:55:09 +03:00 · 4bb57ceb73
commit 4bb57ceb73
parent 2fbb310265 40843d06ab
1 changed files with 74 additions and 57 deletions
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@ -123,13 +123,13 @@ public:
    }
 };

-#if CV_SIMD128
 class SIMDBayerInterpolator_8u
 {
 public:
-    int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
-                   int width, int bcoeff, int gcoeff, int rcoeff) const
+    static int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
+                   int width, int bcoeff, int gcoeff, int rcoeff)
    {
+#if CV_SIMD
 #if CV_NEON
        uint16x8_t masklo = vdupq_n_u16(255);
        const uchar* bayer_end = bayer + width;
@ -177,102 +177,108 @@ public:
            vst1_u8(dst + 8, p.val[1]);
        }
 #else
-        v_uint16x8 v255 = v_setall_u16(255);
-        v_int16x8 v_descale = v_setall_s16(static_cast<short>(1 << 14));
-        v_int16x8 dummy;
-        v_int16x8 cxrb;
-        v_int16x8 cxg2;
-        v_zip(v_setall_s16(static_cast<short>(rcoeff)),
-              v_setall_s16(static_cast<short>(bcoeff)),
+        v_uint16 v255 = vx_setall_u16(255);
+        v_int16 v_descale = vx_setall_s16(static_cast<short>(1 << 14));
+        v_int16 dummy;
+        v_int16 cxrb;
+        v_int16 cxg2;
+        v_zip(vx_setall_s16(static_cast<short>(rcoeff)),
+              vx_setall_s16(static_cast<short>(bcoeff)),
              cxrb,
              dummy);
-        v_zip(v_setall_s16(static_cast<short>(gcoeff)),
-              v_setall_s16(static_cast<short>(2)),
+        v_zip(vx_setall_s16(static_cast<short>(gcoeff)),
+              vx_setall_s16(static_cast<short>(2)),
              cxg2,
              dummy);

        const uchar* bayer_end = bayer + width;

-        for (; bayer < bayer_end - 14; bayer += 14, dst += 14)
+        const int step = VTraits<v_uint8>::vlanes() - 2;
+        for (; bayer < bayer_end - step; bayer += step, dst += step)
        {
-            v_uint16x8 first_line = v_reinterpret_as_u16(v_load(bayer));
-            v_uint16x8 second_line = v_reinterpret_as_u16(v_load(bayer + bayer_step));
-            v_uint16x8 third_line = v_reinterpret_as_u16(v_load(bayer + bayer_step * 2));
+            v_uint16 first_line = v_reinterpret_as_u16(vx_load(bayer));
+            v_uint16 second_line = v_reinterpret_as_u16(vx_load(bayer + bayer_step));
+            v_uint16 third_line = v_reinterpret_as_u16(vx_load(bayer + bayer_step * 2));

            // bayer[0]
-            v_uint16x8 first_line0 = v_and(first_line, v255);
+            v_uint16 first_line0 = v_and(first_line, v255);
            // bayer[bayer_step*2]
-            v_uint16x8 third_line0 = v_and(third_line, v255);
+            v_uint16 third_line0 = v_and(third_line, v255);
            // bayer[0] + bayer[bayer_step*2]
-            v_uint16x8 first_third_line0 = v_add(first_line0, third_line0);
+            v_uint16 first_third_line0 = v_add(first_line0, third_line0);
            // bayer[2] + bayer[bayer_step*2+2]
-            v_uint16x8 first_third_line2 = v_rotate_right<1>(first_third_line0);
+            v_uint16 first_third_line2 = v_rotate_right<1>(first_third_line0);
            // bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2]
-            v_int16x8 r0 = v_reinterpret_as_s16(v_add(first_third_line0, first_third_line2));
+            v_int16 r0 = v_reinterpret_as_s16(v_add(first_third_line0, first_third_line2));
            // (bayer[2] + bayer[bayer_step*2+2]) * 2
-            v_int16x8 r1 = v_reinterpret_as_s16(v_shl<1>(first_third_line2));
+            v_int16 r1 = v_reinterpret_as_s16(v_shl<1>(first_third_line2));

            // bayer[bayer_step+1]
-            v_uint16x8 second_line1 = v_shr<8>(second_line);
+            v_uint16 second_line1 = v_shr<8>(second_line);
            // bayer[bayer_step+1] * 4
-            v_int16x8 b0 = v_reinterpret_as_s16(v_shl<2>(second_line1));
+            v_int16 b0 = v_reinterpret_as_s16(v_shl<2>(second_line1));
            // bayer[bayer_step+3]
-            v_uint16x8 second_line3 = v_rotate_right<1>(second_line1);
+            v_uint16 second_line3 = v_rotate_right<1>(second_line1);
            // bayer[bayer_step+1] + bayer[bayer_step+3]
-            v_uint16x8 second_line13 = v_add(second_line1, second_line3);
+            v_uint16 second_line13 = v_add(second_line1, second_line3);
            // (bayer[bayer_step+1] + bayer[bayer_step+3]) * 2
-            v_int16x8 b1 = v_reinterpret_as_s16(v_shl(second_line13, 1));
+            v_int16 b1 = v_reinterpret_as_s16(v_shl(second_line13, 1));

            // bayer[1]
-            v_uint16x8 first_line1 = v_shr<8>(first_line);
+            v_uint16 first_line1 = v_shr<8>(first_line);
            // bayer[bayer_step]
-            v_uint16x8 second_line0 = v_and(second_line, v255);
+            v_uint16 second_line0 = v_and(second_line, v255);
            // bayer[bayer_step+2]
-            v_uint16x8 second_line2 = v_rotate_right<1>(second_line0);
+            v_uint16 second_line2 = v_rotate_right<1>(second_line0);
            // bayer[bayer_step] + bayer[bayer_step+2]
-            v_uint16x8 second_line02 = v_add(second_line0, second_line2);
+            v_uint16 second_line02 = v_add(second_line0, second_line2);
            // bayer[bayer_step*2+1]
-            v_uint16x8 third_line1 = v_shr<8>(third_line);
+            v_uint16 third_line1 = v_shr<8>(third_line);
            // bayer[1] + bayer[bayer_step*2+1]
-            v_uint16x8 first_third_line1 = v_add(first_line1, third_line1);
+            v_uint16 first_third_line1 = v_add(first_line1, third_line1);
            // bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1]
-            v_int16x8 g0 = v_reinterpret_as_s16(v_add(first_third_line1, second_line02));
+            v_int16 g0 = v_reinterpret_as_s16(v_add(first_third_line1, second_line02));
            // bayer[bayer_step+2] * 4
-            v_int16x8 g1 = v_reinterpret_as_s16(v_shl<2>(second_line2));
+            v_int16 g1 = v_reinterpret_as_s16(v_shl<2>(second_line2));

-            v_int16x8 rb0;
-            v_int16x8 rb1;
-            v_int16x8 rb2;
-            v_int16x8 rb3;
+            v_int16 rb0;
+            v_int16 rb1;
+            v_int16 rb2;
+            v_int16 rb3;
            v_zip(r0, b0, rb0, rb1);
            v_zip(r1, b1, rb2, rb3);

-            v_int16x8 gd0;
-            v_int16x8 gd1;
-            v_int16x8 gd2;
-            v_int16x8 gd3;
+            v_int16 gd0;
+            v_int16 gd1;
+            v_int16 gd2;
+            v_int16 gd3;
            v_zip(g0, v_descale, gd0, gd1);
            v_zip(g1, v_descale, gd2, gd3);

-            v_int32x4 gray_even0 = v_shr<16>(v_add(v_dotprod(rb0, cxrb), v_dotprod(gd0, cxg2)));
-            v_int32x4 gray_even1 = v_shr<16>(v_add(v_dotprod(rb1, cxrb), v_dotprod(gd1, cxg2)));
-            v_int32x4 gray_odd0 = v_shr<16>(v_add(v_dotprod(rb2, cxrb), v_dotprod(gd2, cxg2)));
-            v_int32x4 gray_odd1 = v_shr<16>(v_add(v_dotprod(rb3, cxrb), v_dotprod(gd3, cxg2)));
+            v_int32 gray_even0 = v_shr<16>(v_add(v_dotprod(rb0, cxrb), v_dotprod(gd0, cxg2)));
+            v_int32 gray_even1 = v_shr<16>(v_add(v_dotprod(rb1, cxrb), v_dotprod(gd1, cxg2)));
+            v_int32 gray_odd0 = v_shr<16>(v_add(v_dotprod(rb2, cxrb), v_dotprod(gd2, cxg2)));
+            v_int32 gray_odd1 = v_shr<16>(v_add(v_dotprod(rb3, cxrb), v_dotprod(gd3, cxg2)));

-            v_int16x8 gray_even = v_pack(gray_even0, gray_even1);
-            v_int16x8 gray_odd = v_pack(gray_odd0, gray_odd1);
+            v_int16 gray_even = v_pack(gray_even0, gray_even1);
+            v_int16 gray_odd = v_pack(gray_odd0, gray_odd1);

-            v_int16x8 gray_d0;
-            v_int16x8 gray_d1;
+            v_int16 gray_d0;
+            v_int16 gray_d1;
            v_zip(gray_even, gray_odd, gray_d0, gray_d1);

-            v_uint8x16 gray = v_pack(v_reinterpret_as_u16(gray_d0), v_reinterpret_as_u16(gray_d1));
+            v_uint8 gray = v_pack(v_reinterpret_as_u16(gray_d0), v_reinterpret_as_u16(gray_d1));

            v_store(dst, gray);
        }
 #endif

        return static_cast<int>(bayer - (bayer_end - width));
+#else
+        CV_UNUSED(bayer); CV_UNUSED(bayer_step); CV_UNUSED(dst);
+        CV_UNUSED(width); CV_UNUSED(bcoeff); CV_UNUSED(gcoeff); CV_UNUSED(rcoeff);
+        return 0;
+#endif
    }

    int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
@ -282,7 +288,7 @@ public:
         G R G R | G R G R | G R G R | G R G R
         B G B G | B G B G | B G B G | B G B G
         */
-
+#if CV_SIMD128
 #if CV_NEON
        uint16x8_t masklo = vdupq_n_u16(255);
        uint8x16x3_t pix;
@ -405,6 +411,10 @@ public:
 #endif

        return (int)(bayer - (bayer_end - width));
+#else
+        CV_UNUSED(bayer); CV_UNUSED(bayer_step); CV_UNUSED(dst); CV_UNUSED(width); CV_UNUSED(blue);
+        return 0;
+#endif
    }

    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue, const uchar alpha) const
@ -415,6 +425,7 @@ public:
         B G B G | B G B G | B G B G | B G B G
         */

+#if CV_SIMD128
 #if CV_NEON
        uint16x8_t masklo = vdupq_n_u16(255);
        uint8x16x4_t pix;
@ -536,10 +547,16 @@ public:
 #endif

        return (int)(bayer - (bayer_end - width));
+#else
+        CV_UNUSED(bayer); CV_UNUSED(bayer_step); CV_UNUSED(dst);
+        CV_UNUSED(width); CV_UNUSED(blue); CV_UNUSED(alpha);
+        return 0;
+#endif
    }

    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
    {
+#if CV_SIMD128
        const uchar* bayer_end = bayer + width;
        v_uint16x8 masklow = v_setall_u16(0x00ff);
        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
@ -642,12 +659,12 @@ public:
        }

        return int(bayer - (bayer_end - width));
+#else
+        CV_UNUSED(bayer); CV_UNUSED(bayer_step); CV_UNUSED(dst); CV_UNUSED(width); CV_UNUSED(blue);
+        return 0;
+#endif
    }
 };
-#else
-typedef SIMDBayerStubInterpolator_<uchar> SIMDBayerInterpolator_8u;
-#endif
-

 template<typename T, class SIMDInterpolator>
 class Bayer2Gray_Invoker :