Add large scalar kernel for fastconv

2025-12-06 00:19:46 +01:00 · 2025-08-22 10:53:30 +08:00 · 2025-08-22 10:53:30 +08:00 · bfdd7d5a10
commit bfdd7d5a10
parent d9556920dc
1 changed files with 88 additions and 0 deletions
--- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
@ -2046,6 +2046,85 @@ static inline void convBlock4x4(int np, const float* a, const float* b, float* c
 }
 #endif

+#if defined(__EMSCRIPTEN__)
+template<int RowBase, int NR = 24>
+static inline void conv2xNR(
+    int np,
+    const float* a,
+    const float* b,
+    float* c, int ldc,
+    bool init_c)
+{
+    static_assert(NR % 8 == 0, "NR must be a multiple of 8");
+    float* d0 = c + (RowBase + 0) * ldc;
+    float* d1 = c + (RowBase + 1) * ldc;
+
+    // p = 0
+    {
+        const float* bp = b;
+        const float a0 = a[RowBase + 0];
+        const float a1 = a[RowBase + 1];
+
+        if (init_c)
+        {
+            for (int j = 0; j < NR; j += 8)
+            {
+                const float b0=bp[j+0], b1=bp[j+1], b2=bp[j+2], b3=bp[j+3];
+                const float b4=bp[j+4], b5=bp[j+5], b6=bp[j+6], b7=bp[j+7];
+                d0[j+0] = b0*a0; d0[j+1] = b1*a0; d0[j+2] = b2*a0; d0[j+3] = b3*a0;
+                d0[j+4] = b4*a0; d0[j+5] = b5*a0; d0[j+6] = b6*a0; d0[j+7] = b7*a0;
+                d1[j+0] = b0*a1; d1[j+1] = b1*a1; d1[j+2] = b2*a1; d1[j+3] = b3*a1;
+                d1[j+4] = b4*a1; d1[j+5] = b5*a1; d1[j+6] = b6*a1; d1[j+7] = b7*a1;
+            }
+        } else
+        {
+            for (int j = 0; j < NR; j += 8)
+            {
+                const float b0=bp[j+0], b1=bp[j+1], b2=bp[j+2], b3=bp[j+3];
+                const float b4=bp[j+4], b5=bp[j+5], b6=bp[j+6], b7=bp[j+7];
+                d0[j+0] += b0*a0; d0[j+1] += b1*a0; d0[j+2] += b2*a0; d0[j+3] += b3*a0;
+                d0[j+4] += b4*a0; d0[j+5] += b5*a0; d0[j+6] += b6*a0; d0[j+7] += b7*a0;
+                d1[j+0] += b0*a1; d1[j+1] += b1*a1; d1[j+2] += b2*a1; d1[j+3] += b3*a1;
+                d1[j+4] += b4*a1; d1[j+5] += b5*a1; d1[j+6] += b6*a1; d1[j+7] += b7*a1;
+            }
+        }
+    }
+
+    // p = 1..np-1
+    for (int p = 1; p < np; ++p)
+    {
+        const float* bp = b + p * NR;
+        const int aoff = p * 4 + RowBase;
+        const float a0 = a[aoff + 0];
+        const float a1 = a[aoff + 1];
+
+        for (int j = 0; j < NR; j += 8)
+        {
+            const float b0=bp[j+0], b1=bp[j+1], b2=bp[j+2], b3=bp[j+3];
+            const float b4=bp[j+4], b5=bp[j+5], b6=bp[j+6], b7=bp[j+7];
+            d0[j+0] += b0*a0; d0[j+1] += b1*a0; d0[j+2] += b2*a0; d0[j+3] += b3*a0;
+            d0[j+4] += b4*a0; d0[j+5] += b5*a0; d0[j+6] += b6*a0; d0[j+7] += b7*a0;
+            d1[j+0] += b0*a1; d1[j+1] += b1*a1; d1[j+2] += b2*a1; d1[j+3] += b3*a1;
+            d1[j+4] += b4*a1; d1[j+5] += b5*a1; d1[j+6] += b6*a1; d1[j+7] += b7*a1;
+        }
+    }
+}
+
+// MR == 4, outLen == 24, scalar (no SIMD128)
+static inline void convBlockNoSIMD4x24(
+    int np,
+    const float*  a,
+    const float*  b,
+    float*  c, int ldc,
+    bool init_c,
+    int convNR)
+{
+    CV_Assert(np > 0 && convNR == 24);
+    conv2xNR<0, 24>(np, a, b, c, ldc, init_c); // rows 0 & 1
+    conv2xNR<2, 24>(np, a, b, c, ldc, init_c); // rows 2 & 3
+}
+#endif
+
 static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                            const int convMR, const int convNR)
 {
@ -2103,8 +2182,17 @@ void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bo
        return;
    }
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
+#elif defined(__EMSCRIPTEN__)
+    CV_Assert(convMR == 4);
+    if (outLen == 24 && convNR == 24)
+    {
+        convBlockNoSIMD4x24(np, a, b, c, ldc, init_c, convNR);
+        return;
+    }
+    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
 #else
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
+    return;
 #endif
 }