Merge pull request #26927 from asmorkalov:as/squeeze_windows

Squeeze several Windows pipelines into one with jobs
Merge pull request #26911 from asmorkalov:as/openvx_hal_imgproc
2025-12-06 00:19:46 +01:00 · 2025-02-17 15:08:43 +03:00 · 2025-02-17 13:57:17 +03:00 · 2025-02-17 13:54:31 +03:00 · 2025-02-17 13:51:48 +03:00 · 2025-02-17 12:38:00 +05:30
650 changed files with 41680 additions and 20170 deletions
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@ -6,24 +6,21 @@ on:
      - 4.x

 jobs:
+
+  Linux:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-Linux.yaml@main
+    with:
+      workflow_branch: main
+
  Ubuntu2004-ARM64:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-ARM64.yaml@main

  Ubuntu2004-ARM64-Debug:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-ARM64-Debug.yaml@main

-  Ubuntu2004-x64:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20.yaml@main
-
  Ubuntu2004-x64-OpenVINO:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20-OpenVINO.yaml@main

-  Ubuntu2204-x64:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U22.yaml@main
-
-  Ubuntu2404-x64:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U24.yaml@main
-
  Ubuntu2004-x64-CUDA:
    if: "${{ contains(github.event.pull_request.labels.*.name, 'category: dnn') }} || ${{ contains(github.event.pull_request.labels.*.name, 'category: dnn (onnx)') }}"
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20-Cuda.yaml@main
@ -31,9 +28,6 @@ jobs:
  Windows10-x64:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main

-  Windows10-ARM64:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-ARM64.yaml@main
-
  Windows10-x64-Vulkan:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-Vulkan.yaml@main

--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
@ -42,17 +42,9 @@ endif()

 if(WITH_NEON)
    target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
-    if(NOT DEFINED CAROTENE_NEON_ARCH )
-    elseif(CAROTENE_NEON_ARCH EQUAL 8)
-	    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
-    elseif(CAROTENE_NEON_ARCH EQUAL 7)
-	    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
-    else()
-	    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
-    endif()
 endif()

- if(MINGW) 
+ if(MINGW)
    target_compile_definitions(carotene_objs PRIVATE "-D_USE_MATH_DEFINES=1")
 endif()

--- a/3rdparty/carotene/hal/tegra_hal.hpp
+++ b/3rdparty/carotene/hal/tegra_hal.hpp
@ -119,7 +119,7 @@ private: \
 #define TEGRA_BINARYOP(type, op, src1, sz1, src2, sz2, dst, sz, w, h) \
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
-    parallel_for_(Range(0, h), \
+    parallel_for_(cv::Range(0, h), \
    TegraGenOp_##op##_Invoker<const type, type>(src1, sz1, src2, sz2, dst, sz, w, h), \
    (w * h) / static_cast<double>(1<<16)), \
    CV_HAL_ERROR_OK \
@ -154,7 +154,7 @@ TegraUnaryOp_Invoker(bitwiseNot, bitwiseNot)
 #define TEGRA_UNARYOP(type, op, src1, sz1, dst, sz, w, h) \
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
-    parallel_for_(Range(0, h), \
+    parallel_for_(cv::Range(0, h), \
    TegraGenOp_##op##_Invoker<const type, type>(src1, sz1, dst, sz, w, h), \
    (w * h) / static_cast<double>(1<<16)), \
    CV_HAL_ERROR_OK \
@ -254,32 +254,32 @@ TegraGenOp_Invoker(cmpLE, cmpGE, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step),
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
        ((op) == cv::CMP_EQ) ? \
-        parallel_for_(Range(0, h), \
+        parallel_for_(cv::Range(0, h), \
        TegraGenOp_cmpEQ_Invoker<const type, CAROTENE_NS::u8>(src1, sz1, src2, sz2, dst, sz, w, h), \
        (w * h) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        ((op) == cv::CMP_NE) ? \
-        parallel_for_(Range(0, h), \
+        parallel_for_(cv::Range(0, h), \
        TegraGenOp_cmpNE_Invoker<const type, CAROTENE_NS::u8>(src1, sz1, src2, sz2, dst, sz, w, h), \
        (w * h) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        ((op) == cv::CMP_GT) ? \
-        parallel_for_(Range(0, h), \
+        parallel_for_(cv::Range(0, h), \
        TegraGenOp_cmpGT_Invoker<const type, CAROTENE_NS::u8>(src1, sz1, src2, sz2, dst, sz, w, h), \
        (w * h) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        ((op) == cv::CMP_GE) ? \
-        parallel_for_(Range(0, h), \
+        parallel_for_(cv::Range(0, h), \
        TegraGenOp_cmpGE_Invoker<const type, CAROTENE_NS::u8>(src1, sz1, src2, sz2, dst, sz, w, h), \
        (w * h) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        ((op) == cv::CMP_LT) ? \
-        parallel_for_(Range(0, h), \
+        parallel_for_(cv::Range(0, h), \
        TegraGenOp_cmpLT_Invoker<const type, CAROTENE_NS::u8>(src1, sz1, src2, sz2, dst, sz, w, h), \
        (w * h) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        ((op) == cv::CMP_LE) ? \
-        parallel_for_(Range(0, h), \
+        parallel_for_(cv::Range(0, h), \
        TegraGenOp_cmpLE_Invoker<const type, CAROTENE_NS::u8>(src1, sz1, src2, sz2, dst, sz, w, h), \
        (w * h) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
@ -310,7 +310,7 @@ TegraGenOp_Invoker(cmpLE, cmpGE, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step),
 #define TEGRA_BINARYOPSCALE(type, op, src1, sz1, src2, sz2, dst, sz, w, h, scales) \
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
-    parallel_for_(Range(0, h), \
+    parallel_for_(cv::Range(0, h), \
    TegraGenOp_##op##_Invoker<const type, type>(src1, sz1, src2, sz2, dst, sz, w, h, scales), \
    (w * h) / static_cast<double>(1<<16)), \
    CV_HAL_ERROR_OK \
@ -332,7 +332,7 @@ TegraBinaryOpScale_Invoker(divf, div, 1, scale)
 #define TEGRA_UNARYOPSCALE(type, op, src1, sz1, dst, sz, w, h, scales) \
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
-    parallel_for_(Range(0, h), \
+    parallel_for_(cv::Range(0, h), \
    TegraGenOp_##op##_Invoker<const type, type>(src1, sz1, dst, sz, w, h, scales), \
    (w * h) / static_cast<double>(1<<16)), \
    CV_HAL_ERROR_OK \
@ -928,17 +928,17 @@ TegraRowOp_Invoker(split4, split4, 1, 4, 0, RANGE_DATA(ST, src1_data, 4*sizeof(S
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
        cn == 2 ? \
-        parallel_for_(Range(0, len), \
+        parallel_for_(cv::Range(0, len), \
        TegraRowOp_split2_Invoker<const type, type>(src, dst[0], dst[1]), \
        (len) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        cn == 3 ? \
-        parallel_for_(Range(0, len), \
+        parallel_for_(cv::Range(0, len), \
        TegraRowOp_split3_Invoker<const type, type>(src, dst[0], dst[1], dst[2]), \
        (len) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        cn == 4 ? \
-        parallel_for_(Range(0, len), \
+        parallel_for_(cv::Range(0, len), \
        TegraRowOp_split4_Invoker<const type, type>(src, dst[0], dst[1], dst[2], dst[3]), \
        (len) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
@ -990,17 +990,17 @@ TegraRowOp_Invoker(combine4, combine4, 4, 1, 0, RANGE_DATA(ST, src1_data, sizeof
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
        cn == 2 ? \
-        parallel_for_(Range(0, len), \
+        parallel_for_(cv::Range(0, len), \
        TegraRowOp_combine2_Invoker<const type, type>(src[0], src[1], dst), \
        (len) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        cn == 3 ? \
-        parallel_for_(Range(0, len), \
+        parallel_for_(cv::Range(0, len), \
        TegraRowOp_combine3_Invoker<const type, type>(src[0], src[1], src[2], dst), \
        (len) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
        cn == 4 ? \
-        parallel_for_(Range(0, len), \
+        parallel_for_(cv::Range(0, len), \
        TegraRowOp_combine4_Invoker<const type, type>(src[0], src[1], src[2], src[3], dst), \
        (len) / static_cast<double>(1<<16)), \
        CV_HAL_ERROR_OK : \
@ -1033,7 +1033,7 @@ TegraRowOp_Invoker(phase, phase, 2, 1, 1, RANGE_DATA(ST, src1_data, sizeof(CAROT
 #define TEGRA_FASTATAN(y, x, dst, len, angleInDegrees) \
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
-    parallel_for_(Range(0, len), \
+    parallel_for_(cv::Range(0, len), \
    TegraRowOp_phase_Invoker<const CAROTENE_NS::f32, CAROTENE_NS::f32>(x, y, dst, angleInDegrees ? 1.0f : M_PI/180), \
    (len) / static_cast<double>(1<<16)), \
    CV_HAL_ERROR_OK \
@ -1049,7 +1049,7 @@ TegraRowOp_Invoker(magnitude, magnitude, 2, 1, 0, RANGE_DATA(ST, src1_data, size
 #define TEGRA_MAGNITUDE(x, y, dst, len) \
 ( \
    CAROTENE_NS::isSupportedConfiguration() ? \
-    parallel_for_(Range(0, len), \
+    parallel_for_(cv::Range(0, len), \
    TegraRowOp_magnitude_Invoker<const CAROTENE_NS::f32, CAROTENE_NS::f32>(x, y, dst), \
    (len) / static_cast<double>(1<<16)), \
    CV_HAL_ERROR_OK \
@ -1563,17 +1563,17 @@ TegraCvtColor_Invoker(rgbx2bgrx, rgbx2bgrx, src_data + static_cast<size_t>(range
        scn == 3 ? \
            dcn == 3 ? \
                swapBlue ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgb2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)), \
                    CV_HAL_ERROR_OK : \
                    CV_HAL_ERROR_NOT_IMPLEMENTED : \
            dcn == 4 ? \
                (swapBlue ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgb2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgb2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) ), \
                CV_HAL_ERROR_OK : \
@ -1581,16 +1581,16 @@ TegraCvtColor_Invoker(rgbx2bgrx, rgbx2bgrx, src_data + static_cast<size_t>(range
        scn == 4 ? \
            dcn == 3 ? \
                (swapBlue ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgbx2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgbx2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) ), \
                CV_HAL_ERROR_OK : \
            dcn == 4 ? \
                swapBlue ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgbx2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)), \
                    CV_HAL_ERROR_OK : \
@ -1613,19 +1613,19 @@ TegraCvtColor_Invoker(rgbx2rgb565, rgbx2rgb565, src_data + static_cast<size_t>(r
    greenBits == 6 && CAROTENE_NS::isSupportedConfiguration() ? \
        scn == 3 ? \
            (swapBlue ? \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgb2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) : \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgb2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
        scn == 4 ? \
            (swapBlue ? \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgbx2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) : \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgbx2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
@ -1646,19 +1646,19 @@ TegraCvtColor_Invoker(bgrx2gray, bgrx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_
    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
        scn == 3 ? \
            (swapBlue ? \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgb2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) : \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_bgr2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
        scn == 4 ? \
            (swapBlue ? \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgbx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) : \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_bgrx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
@ -1674,12 +1674,12 @@ TegraCvtColor_Invoker(gray2rgbx, gray2rgbx, src_data + static_cast<size_t>(range
 ( \
    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
        dcn == 3 ? \
-            parallel_for_(Range(0, height), \
+            parallel_for_(cv::Range(0, height), \
            TegraCvtColor_gray2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
            (width * height) / static_cast<double>(1<<16)), \
            CV_HAL_ERROR_OK : \
        dcn == 4 ? \
-            parallel_for_(Range(0, height), \
+            parallel_for_(cv::Range(0, height), \
            TegraCvtColor_gray2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
            (width * height) / static_cast<double>(1<<16)), \
            CV_HAL_ERROR_OK : \
@ -1700,19 +1700,19 @@ TegraCvtColor_Invoker(bgrx2ycrcb, bgrx2ycrcb, src_data + static_cast<size_t>(ran
    isCbCr && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
        scn == 3 ? \
            (swapBlue ? \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgb2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) : \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_bgr2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
        scn == 4 ? \
            (swapBlue ? \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_rgbx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) : \
-                parallel_for_(Range(0, height), \
+                parallel_for_(cv::Range(0, height), \
                TegraCvtColor_bgrx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
@ -1742,34 +1742,34 @@ TegraCvtColor_Invoker(bgrx2hsvf, bgrx2hsv, src_data + static_cast<size_t>(range.
        scn == 3 ? \
            (swapBlue ? \
                isFullRange ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgb2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgb2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
                isFullRange ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_bgr2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_bgr2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
        scn == 4 ? \
            (swapBlue ? \
                isFullRange ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgbx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_rgbx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
                isFullRange ? \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_bgrx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) : \
-                    parallel_for_(Range(0, height), \
+                    parallel_for_(cv::Range(0, height), \
                    TegraCvtColor_bgrx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
                    (width * height) / static_cast<double>(1<<16)) ), \
            CV_HAL_ERROR_OK : \
@ -1857,7 +1857,7 @@ TegraCvtColor_Invoker(bgrx2hsvf, bgrx2hsv, src_data + static_cast<size_t>(range.
 #endif

 // The optimized branch was developed for old armv7 processors and leads to perf degradation on armv8
-#if defined(DCAROTENE_NEON_ARCH) && (DCAROTENE_NEON_ARCH == 7)
+#if defined(__ARM_ARCH) && (__ARM_ARCH == 7)
 inline CAROTENE_NS::BORDER_MODE borderCV2Carotene(int borderType)
 {
    switch(borderType)
@ -1928,8 +1928,54 @@ inline int TEGRA_GaussianBlurBinomial(const uchar* src_data, size_t src_step, uc
 #undef cv_hal_gaussianBlurBinomial
 #define cv_hal_gaussianBlurBinomial TEGRA_GaussianBlurBinomial

-#endif // DCAROTENE_NEON_ARCH=7
+#endif // __ARM_ARCH=7

 #endif // OPENCV_IMGPROC_HAL_INTERFACE_H

+// The optimized branch was developed for old armv7 processors, so this HAL hook is only compiled when __ARM_ARCH == 7
+#if defined(__ARM_ARCH) && (__ARM_ARCH == 7)
+inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step, // HAL wrapper: forwards its arguments to CAROTENE_NS::pyrLKOptFlowLevel
+                       const short* prev_deriv_data, size_t prev_deriv_step,
+                       const uchar* next_data, size_t next_step,
+                       int width, int height, int cn,
+                       const float *prev_points, float *next_points, size_t point_count,
+                       uchar *status, float *err,
+                       const int win_width, const int win_height,
+                       int termination_count, double termination_epsilon,
+                       bool get_min_eigen_vals,
+                       float min_eigen_vals_threshold)
+{
+    if (!CAROTENE_NS::isSupportedConfiguration()) // carotene cannot run here: report "not implemented" instead of calling it
+        return CV_HAL_ERROR_NOT_IMPLEMENTED; // presumably makes the caller fall back to the generic implementation - confirm against the HAL dispatch code
+
+    CAROTENE_NS::pyrLKOptFlowLevel(CAROTENE_NS::Size2D(width, height), cn, // width/height are repacked into a carotene Size2D
+        prev_data, prev_data_step, prev_deriv_data, prev_deriv_step,
+        next_data, next_step,
+        point_count, prev_points, next_points, // note: carotene takes the point count BEFORE the point arrays, unlike this wrapper's parameter order
+        status, err, CAROTENE_NS::Size2D(win_width, win_height), // window dimensions are likewise passed as a Size2D
+        termination_count, termination_epsilon,
+        get_min_eigen_vals, min_eigen_vals_threshold);
+    return CV_HAL_ERROR_OK; // the carotene call's result is not inspected; success is reported unconditionally once it returns
+}
+
+#undef cv_hal_LKOpticalFlowLevel // drop whatever cv_hal_LKOpticalFlowLevel was defined as before this header
+#define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel // and route the hook to the wrapper above
+#endif // __ARM_ARCH=7
+
+#if 0 // Disabled: OpenCV provides a faster parallel implementation
+inline int TEGRA_ScharrDeriv(const uchar* src_data, size_t src_step, // HAL wrapper: forwards its arguments to CAROTENE_NS::ScharrDeriv (compiled out by the #if 0 above)
+                      short* dst_data, size_t dst_step,
+                      int width, int height, int cn)
+{
+    if (!CAROTENE_NS::isSupportedConfiguration()) // carotene cannot run here: report "not implemented" instead of calling it
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    CAROTENE_NS::ScharrDeriv(CAROTENE_NS::Size2D(width, height), cn, src_data, src_step, dst_data, dst_step); // width/height are repacked into a carotene Size2D
+    return CV_HAL_ERROR_OK; // success is reported unconditionally once the carotene call returns
+}
+
+#undef cv_hal_ScharrDeriv // would replace any earlier cv_hal_ScharrDeriv definition...
+#define cv_hal_ScharrDeriv TEGRA_ScharrDeriv // ...with the wrapper above, if this section were enabled
+#endif
+
 #endif
--- a/3rdparty/carotene/include/carotene/functions.hpp
+++ b/3rdparty/carotene/include/carotene/functions.hpp
@ -2485,7 +2485,7 @@ namespace CAROTENE_NS {
                           u8 *status, f32 *err,
                           const Size2D &winSize,
                           u32 terminationCount, f64 terminationEpsilon,
-                           u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
+                           bool getMinEigenVals,
                           f32 minEigThreshold);
 }

--- a/3rdparty/carotene/src/common.hpp
+++ b/3rdparty/carotene/src/common.hpp
@ -58,17 +58,6 @@

 namespace CAROTENE_NS { namespace internal {

-#ifndef CAROTENE_NEON_ARCH
-#    if defined(__aarch64__) || defined(__aarch32__)
-#        define CAROTENE_NEON_ARCH 8
-#    else
-#        define CAROTENE_NEON_ARCH 7
-#    endif
-#endif
-#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
-#    error("ARMv7 doen't support A32/A64 Neon instructions")
-#endif
-
 inline void prefetch(const void *ptr, size_t offset = 32*10)
 {
 #if defined __GNUC__
--- a/3rdparty/carotene/src/opticalflow.cpp
+++ b/3rdparty/carotene/src/opticalflow.cpp
@ -58,7 +58,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
                       u8 *status, f32 *err,
                       const Size2D &winSize,
                       u32 terminationCount, f64 terminationEpsilon,
-                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
+                       bool getMinEigenVals,
                       f32 minEigThreshold)
 {
    internal::assertSupportedConfiguration();
@ -74,32 +74,11 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,

    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
    {
-        f32 levscale = (1./(1 << level));
        u32 ptref = ptidx << 1;
-        f32 prevPtX = prevPts[ptref+0]*levscale;
-        f32 prevPtY = prevPts[ptref+1]*levscale;
-        f32 nextPtX;
-        f32 nextPtY;
-        if( level == maxLevel )
-        {
-            if( useInitialFlow )
-            {
-                nextPtX = nextPts[ptref+0]*levscale;
-                nextPtY = nextPts[ptref+1]*levscale;
-            }
-            else
-            {
-                nextPtX = prevPtX;
-                nextPtY = prevPtY;
-            }
-        }
-        else
-        {
-            nextPtX = nextPts[ptref+0]*2.f;
-            nextPtY = nextPts[ptref+1]*2.f;
-        }
-        nextPts[ptref+0] = nextPtX;
-        nextPts[ptref+1] = nextPtY;
+        f32 prevPtX = prevPts[ptref+0];
+        f32 prevPtY = prevPts[ptref+1];
+        f32 nextPtX = nextPts[ptref+0];
+        f32 nextPtY = nextPts[ptref+1];

        s32 iprevPtX, iprevPtY;
        s32 inextPtX, inextPtY;
@ -111,13 +90,10 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
        {
-            if( level == 0 )
-            {
-                if( status )
-                    status[ptidx] = false;
-                if( err )
-                    err[ptidx] = 0;
-            }
+            if( status )
+                status[ptidx] = false;
+            if( err )
+                err[ptidx] = 0;
            continue;
        }

@ -333,7 +309,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
-            if( level == 0 && status )
+            if( status )
                status[ptidx] = false;
            continue;
        }
@ -353,7 +329,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
               inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
            {
-                if( level == 0 && status )
+                if( status )
                    status[ptidx] = false;
                break;
            }
@ -469,8 +445,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
            prevDeltaX = deltaX;
            prevDeltaY = deltaY;
        }
-
-        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
+        if( status && status[ptidx] && err && !getMinEigenVals )
        {
            f32 nextPointX = nextPts[ptref+0] - halfWinX;
            f32 nextPointY = nextPts[ptref+1] - halfWinY;
@ -526,9 +501,6 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
    (void)winSize;
    (void)terminationCount;
    (void)terminationEpsilon;
-    (void)level;
-    (void)maxLevel;
-    (void)useInitialFlow;
    (void)getMinEigenVals;
    (void)minEigThreshold;
    (void)ptCount;
@ -536,4 +508,3 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
 }

 }//CAROTENE_NS
-
--- a/3rdparty/carotene/src/vround_helper.hpp
+++ b/3rdparty/carotene/src/vround_helper.hpp
@ -57,7 +57,7 @@ namespace CAROTENE_NS { namespace internal {

 inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
 {
-#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
    return vcvtnq_u32_f32(val);
 #else
    const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
@ -67,7 +67,7 @@ inline uint32x4_t vroundq_u32_f32(const float32x4_t val)

 inline uint32x2_t vround_u32_f32(const float32x2_t val)
 {
-#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
    return vcvtn_u32_f32(val);
 #else
    const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
@ -77,7 +77,7 @@ inline uint32x2_t vround_u32_f32(const float32x2_t val)

 inline int32x4_t vroundq_s32_f32(const float32x4_t val)
 {
-#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
    return vcvtnq_s32_f32(val);
 #else
    const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
@ -87,7 +87,7 @@ inline int32x4_t vroundq_s32_f32(const float32x4_t val)

 inline int32x2_t vround_s32_f32(const float32x2_t val)
 {
-#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
    return vcvtn_s32_f32(val);
 #else
    const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
--- a/3rdparty/fastcv/CMakeLists.txt
+++ b/3rdparty/fastcv/CMakeLists.txt
@ -0,0 +1,32 @@
+if(HAVE_FASTCV) # Define the fastcv_hal static library only when HAVE_FASTCV is set
+  set(FASTCV_HAL_VERSION 0.0.1 CACHE INTERNAL "") # the FASTCV_HAL_* values are stored in the CMake cache, so they stay visible outside this directory scope
+  set(FASTCV_HAL_LIBRARIES "fastcv_hal" CACHE INTERNAL "") # name of the target created by add_library() below
+  set(FASTCV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include" CACHE INTERNAL "")
+  set(FASTCV_HAL_HEADERS
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_core.hpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_imgproc.hpp"
+    CACHE INTERNAL "")
+
+  file(GLOB FASTCV_HAL_FILES    "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") # every .cpp directly under src/ is compiled into the library
+
+  add_library(fastcv_hal STATIC ${FASTCV_HAL_FILES})
+
+  target_include_directories(fastcv_hal PRIVATE
+    ${CMAKE_SOURCE_DIR}/modules/core/include # NOTE(review): CMAKE_SOURCE_DIR is the top-level source tree; if OpenCV is ever built as a subproject this will not be the OpenCV root - confirm
+    ${CMAKE_SOURCE_DIR}/modules/imgproc/include
+    ${FASTCV_HAL_INCLUDE_DIRS} ${FastCV_INCLUDE_PATH}) # FastCV_INCLUDE_PATH is not set in this file; presumably provided by the FastCV detection step - verify
+
+  target_link_libraries(fastcv_hal PUBLIC ${FASTCV_LIBRARY}) # PUBLIC: targets linking fastcv_hal also link FASTCV_LIBRARY
+
+  set_target_properties(fastcv_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
+
+  if(NOT BUILD_SHARED_LIBS) # the static archive is installed only when BUILD_SHARED_LIBS is off
+    ocv_install_target(fastcv_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+  endif()
+
+  if(ENABLE_SOLUTION_FOLDERS) # sets the FOLDER target property to "3rdparty"
+    set_target_properties(fastcv_hal PROPERTIES FOLDER "3rdparty")
+  endif()
+else()
+  message(STATUS "FastCV is not available, disabling related HAL") # informational only: no target or cache variables are created in this branch
+endif(HAVE_FASTCV)
--- a/3rdparty/fastcv/fastcv.cmake
+++ b/3rdparty/fastcv/fastcv.cmake
@ -0,0 +1,44 @@
+function(download_fastcv root_dir) # Fetches and unpacks a prebuilt FastCV package into root_dir; sets the HAVE_FASTCV cache variable on success
+
+  # Commit SHA in the opencv_3rdparty repo (used to build the download URL below)
+  set(FASTCV_COMMIT "f4413cc2ab7233fdfc383a4cded402c072677fb0")
+
+  # Define actual FastCV versions: package file name and hash are chosen per target platform
+  if(ANDROID)
+    if(AARCH64)
+      message(STATUS "Download FastCV for Android aarch64")
+      set(FCV_PACKAGE_NAME  "fastcv_android_aarch64_2024_12_11.tgz")
+      set(FCV_PACKAGE_HASH  "9dac41e86597305f846212dae31a4a88")
+    else()
+      message(STATUS "Download FastCV for Android armv7")
+      set(FCV_PACKAGE_NAME  "fastcv_android_arm32_2024_12_11.tgz")
+      set(FCV_PACKAGE_HASH  "fe2d30334180b17e3031eee92aac43b6")
+    endif()
+  elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
+    if(AARCH64)
+      set(FCV_PACKAGE_NAME  "fastcv_linux_aarch64_2025_02_12.tgz")
+      set(FCV_PACKAGE_HASH  "33ac2a59cf3e7d6402eee2e010de1202")
+    else()
+      message("FastCV: fastcv lib for 32-bit Linux is not supported for now!") # NOTE(review): FCV_PACKAGE_NAME/FCV_PACKAGE_HASH stay unset here (and on platforms matching neither outer branch), yet ocv_download below is still called - confirm it tolerates empty FILENAME/HASH or return early
+    endif()
+  endif(ANDROID)
+
+  # Download Package
+
+  set(OPENCV_FASTCV_URL "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FASTCV_COMMIT}/fastcv/") # directory URL pinned to FASTCV_COMMIT; presumably combined with FILENAME because RELATIVE_URL is passed - verify against ocv_download
+
+  ocv_download( FILENAME        ${FCV_PACKAGE_NAME}
+                HASH            ${FCV_PACKAGE_HASH}
+                URL             ${OPENCV_FASTCV_URL}
+                DESTINATION_DIR ${root_dir}
+                ID              FASTCV
+                STATUS          res
+                UNPACK
+                RELATIVE_URL)
+  if(res) # res is the variable named by the STATUS argument above
+    set(HAVE_FASTCV TRUE CACHE BOOL "FastCV status") # stored in the cache, so the result is visible outside this function's scope
+  else()
+    message(WARNING "FastCV: package download failed!") # HAVE_FASTCV is left untouched on failure
+  endif()
+
+endfunction()
--- a/3rdparty/fastcv/include/fastcv_hal_core.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_core.hpp
@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
+#define OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
+
+#include <opencv2/core/base.hpp>
+
+// Redirect the OpenCV core HAL hooks (cv_hal_*) to the FastCV-backed functions
+// declared in this header. Every hook is #undef'ed before being redefined so that
+// any earlier definition of the same name is replaced instead of clashing.
+// Note that cv_hal_rotate90 maps to fastcv_hal_rotate (the names differ).
+#undef  cv_hal_lut
+#define cv_hal_lut                  fastcv_hal_lut
+#undef  cv_hal_normHammingDiff8u
+#define cv_hal_normHammingDiff8u    fastcv_hal_normHammingDiff8u
+#undef  cv_hal_mul8u16u
+#define cv_hal_mul8u16u             fastcv_hal_mul8u16u
+#undef  cv_hal_sub8u32f
+#define cv_hal_sub8u32f             fastcv_hal_sub8u32f
+#undef  cv_hal_transpose2d
+#define cv_hal_transpose2d          fastcv_hal_transpose2d
+#undef  cv_hal_meanStdDev
+#define cv_hal_meanStdDev           fastcv_hal_meanStdDev
+#undef  cv_hal_flip
+#define cv_hal_flip                 fastcv_hal_flip
+#undef  cv_hal_rotate90
+#define cv_hal_rotate90             fastcv_hal_rotate
+#undef  cv_hal_addWeighted8u
+#define cv_hal_addWeighted8u        fastcv_hal_addWeighted8u
+#undef  cv_hal_mul8u
+#define cv_hal_mul8u                fastcv_hal_mul8u
+#undef  cv_hal_mul16s
+#define cv_hal_mul16s               fastcv_hal_mul16s
+#undef  cv_hal_mul32f
+#define cv_hal_mul32f               fastcv_hal_mul32f
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief look-up table transform of an array.
+/// @param src_data Source image data
+/// @param src_step Source image step
+/// @param src_type Source image type
+/// @param lut_data Pointer to lookup table
+/// @param lut_channel_size Size of each channel in bytes
+/// @param lut_channels Number of channels in lookup table
+/// @param dst_data Destination data
+/// @param dst_step Destination step
+/// @param width Width of images
+/// @param height Height of images
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_lut(
+    const uchar*    src_data,
+    size_t          src_step,
+    size_t          src_type,
+    const uchar*    lut_data,
+    size_t          lut_channel_size,
+    size_t          lut_channels,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Hamming distance between two vectors
+/// @param a pointer to first vector data
+/// @param b pointer to second vector data
+/// @param n length of vectors
+/// @param cellSize how many bits of the vectors will be added and treated as a single bit, can be 1 (standard Hamming distance), 2 or 4
+/// @param result pointer to result output
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Element-wise multiplication of two 8-bit unsigned arrays with a 16-bit unsigned result
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+/// @param scale     scale factor; the implementation in fastcv_hal_core.cpp handles only 1.0
+///                  and reports "not implemented" for any other value
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_mul8u16u(
+    const uchar    * src1_data,
+    size_t           src1_step,
+    const uchar    * src2_data,
+    size_t           src2_step,
+    ushort         * dst_data,
+    size_t           dst_step,
+    int              width,
+    int              height,
+    double           scale);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Difference of two 8-bit unsigned arrays written as 32-bit floating point values
+///        (implemented in fastcv_hal_core.cpp through fcvImageDiffu8f32_v2)
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_sub8u32f(
+    const uchar     *src1_data,
+    size_t           src1_step,
+    const uchar     *src2_data,
+    size_t           src2_step,
+    float           *dst_data,
+    size_t           dst_step,
+    int              width,
+    int              height);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Transposes a 2D array
+/// @param src_data     source image data
+/// @param src_step     source image step
+/// @param dst_data     destination image data; must differ from src_data, the implementation in
+///                     fastcv_hal_core.cpp reports "not implemented" for in-place calls
+/// @param dst_step     destination image step
+/// @param src_width    source image width
+/// @param src_height   source image height
+/// @param element_size size of one element in bytes; the implementation handles 1, 2 and 4 and
+///                     reports "not implemented" for any other value
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_transpose2d(
+    const uchar*     src_data,
+    size_t           src_step,
+    uchar*           dst_data,
+    size_t           dst_step,
+    int              src_width,
+    int              src_height,
+    int              element_size);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Mean and standard deviation of an image
+/// @param src_data   source image data
+/// @param src_step   source image step
+/// @param width      source image width
+/// @param height     source image height
+/// @param src_type   source image type; the implementation in fastcv_hal_core.cpp handles CV_8UC1 only
+/// @param mean_val   output for the mean, may be null when only the deviation is wanted
+/// @param stddev_val output for the standard deviation, may be null when only the mean is wanted
+/// @param mask       optional mask; the implementation reports "not implemented" when it is non-null
+/// @param mask_step  mask step (unused by the implementation)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_meanStdDev(
+    const uchar     * src_data,
+    size_t            src_step,
+    int               width,
+    int               height,
+    int               src_type,
+    double          * mean_val,
+    double          * stddev_val,
+    uchar           * mask,
+    size_t            mask_step);
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Flips a 2D array around vertical, horizontal, or both axes
+/// @param src_type source and destination image type
+/// @param src_data source image data
+/// @param src_step source image step
+/// @param src_width source and destination image width
+/// @param src_height source and destination image height
+/// @param dst_data destination image data
+/// @param dst_step destination image step
+/// @param flip_mode 0 flips around x-axis, 1 around y-axis, -1 both
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_flip(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             flip_mode);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Rotates a 2D array in multiples of 90 degrees.
+/// @param src_type source and destination image type
+/// @param src_data source image data
+/// @param src_step source image step
+/// @param src_width source image width
+///   If angle has value [180] it is also destination image width
+///   If angle has values [90, 270] it is also destination image height
+/// @param src_height source and destination image height (destination image width for angles [90, 270])
+///   If angle has value [180] it is also destination image height
+///   If angle has values [90, 270] it is also destination image width
+/// @param dst_data destination image data
+/// @param dst_step destination image step
+/// @param angle clockwise angle for rotation in degrees from set [90, 180, 270]
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_rotate(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             angle);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief weighted sum of two arrays using formula: dst[i] = a * src1[i] + b * src2[i] + c
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+/// @param scalars   numbers a, b, and c
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_addWeighted8u(
+    const uchar*    src1_data,
+    size_t          src1_step,
+    const uchar*    src2_data,
+    size_t          src2_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    const double    scalars[3]);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_mul8u hook (8-bit unsigned element-wise multiply).
+///        NOTE(review): the implementation is not visible from this header; the parameter notes below
+///        follow the naming used by the other declarations in this file - confirm against the source.
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+/// @param scale     scale factor (presumably applied to the product - TODO confirm)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_mul8u(
+    const uchar     *src1_data,
+    size_t          src1_step,
+    const uchar     *src2_data,
+    size_t          src2_step,
+    uchar           *dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          scale);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_mul16s hook (16-bit signed element-wise multiply).
+///        NOTE(review): the implementation is not visible from this header; the parameter notes below
+///        follow the naming used by the other declarations in this file - confirm against the source.
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+/// @param scale     scale factor (presumably applied to the product - TODO confirm)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_mul16s(
+    const short     *src1_data,
+    size_t          src1_step,
+    const short     *src2_data,
+    size_t          src2_step,
+    short           *dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          scale);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_mul32f hook (32-bit float element-wise multiply).
+///        NOTE(review): the implementation is not visible from this header; the parameter notes below
+///        follow the naming used by the other declarations in this file - confirm against the source.
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+/// @param scale     scale factor (presumably applied to the product - TODO confirm)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_mul32f(
+    const float    *src1_data,
+    size_t          src1_step,
+    const float    *src2_data,
+    size_t          src2_step,
+    float          *dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          scale);
+
+#endif
--- a/3rdparty/fastcv/include/fastcv_hal_imgproc.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_imgproc.hpp
@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
+#define OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
+
+#include <opencv2/core/base.hpp>
+
+// Redirect the OpenCV imgproc HAL hooks (cv_hal_*) to the FastCV-backed functions
+// declared in this header. Every hook is #undef'ed before being redefined so that
+// any earlier definition of the same name is replaced instead of clashing.
+#undef  cv_hal_medianBlur
+#define cv_hal_medianBlur           fastcv_hal_medianBlur
+#undef  cv_hal_sobel
+#define cv_hal_sobel                fastcv_hal_sobel
+#undef  cv_hal_boxFilter
+#define cv_hal_boxFilter            fastcv_hal_boxFilter
+#undef  cv_hal_adaptiveThreshold
+#define cv_hal_adaptiveThreshold    fastcv_hal_adaptiveThreshold
+#undef  cv_hal_gaussianBlurBinomial
+#define cv_hal_gaussianBlurBinomial fastcv_hal_gaussianBlurBinomial
+#undef  cv_hal_warpPerspective
+#define cv_hal_warpPerspective      fastcv_hal_warpPerspective
+#undef  cv_hal_pyrdown
+#define cv_hal_pyrdown              fastcv_hal_pyrdown
+#undef  cv_hal_cvtBGRtoHSV
+#define cv_hal_cvtBGRtoHSV          fastcv_hal_cvtBGRtoHSV
+#undef  cv_hal_cvtBGRtoYUVApprox
+#define cv_hal_cvtBGRtoYUVApprox    fastcv_hal_cvtBGRtoYUVApprox
+#undef  cv_hal_canny
+#define cv_hal_canny                fastcv_hal_canny
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Calculate medianBlur filter
+/// @param src_data Source image data
+/// @param src_step Source image step
+/// @param dst_data Destination image data
+/// @param dst_step Destination image step
+/// @param width    Source image width
+/// @param height   Source image height
+/// @param depth    Depth of source and destination image
+/// @param cn       Number of channels
+/// @param ksize    Size of kernel
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_medianBlur(
+    const uchar*    src_data,
+    size_t          src_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             depth,
+    int             cn,
+    int             ksize);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Computes Sobel derivatives
+///
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param width            Source image width
+/// @param height           Source image height
+/// @param src_depth        Depth of source image
+/// @param dst_depth        Depth of destination image
+/// @param cn               Number of channels
+/// @param margin_left      Left margins for source image
+/// @param margin_top       Top margins for source image
+/// @param margin_right     Right margins for source image
+/// @param margin_bottom    Bottom margins for source image
+/// @param dx               orders of the derivative x
+/// @param dy               orders of the derivative y
+/// @param ksize            Size of kernel
+/// @param scale            Scale factor for the computed derivative values
+/// @param delta            Delta value that is added to the results prior to storing them in dst
+/// @param border_type      Border type
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_sobel(
+    const uchar*    src_data,
+    size_t          src_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             src_depth,
+    int             dst_depth,
+    int             cn,
+    int             margin_left,
+    int             margin_top,
+    int             margin_right,
+    int             margin_bottom,
+    int             dx,
+    int             dy,
+    int             ksize,
+    double          scale,
+    double          delta,
+    int             border_type);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_boxFilter hook.
+///        NOTE(review): the implementation is not visible from this header; the parameter notes below
+///        mirror the documented siblings in this file (e.g. fastcv_hal_sobel) - confirm against the source.
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param width            Source image width
+/// @param height           Source image height
+/// @param src_depth        Depth of source image
+/// @param dst_depth        Depth of destination image
+/// @param cn               Number of channels
+/// @param margin_left      Left margins for source image
+/// @param margin_top       Top margins for source image
+/// @param margin_right     Right margins for source image
+/// @param margin_bottom    Bottom margins for source image
+/// @param ksize_width      Width of kernel
+/// @param ksize_height     Height of kernel
+/// @param anchor_x         Anchor point x coordinate
+/// @param anchor_y         Anchor point y coordinate
+/// @param normalize        Normalization flag (presumably whether the kernel is normalized by its area - TODO confirm)
+/// @param border_type      Border type
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int fastcv_hal_boxFilter(
+    const uchar*     src_data,
+    size_t           src_step,
+    uchar*           dst_data,
+    size_t           dst_step,
+    int              width,
+    int              height,
+    int              src_depth,
+    int              dst_depth,
+    int              cn,
+    int              margin_left,
+    int              margin_top,
+    int              margin_right,
+    int              margin_bottom,
+    size_t           ksize_width,
+    size_t           ksize_height,
+    int              anchor_x,
+    int              anchor_y,
+    bool             normalize,
+    int              border_type);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_adaptiveThreshold hook.
+///        NOTE(review): the implementation is not visible from this header; parameter meaning is
+///        assumed to follow the OpenCV HAL hook it replaces - confirm against the source.
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param width            Source image width
+/// @param height           Source image height
+/// @param maxValue         Value assigned to pixels that satisfy the threshold condition (assumed)
+/// @param adaptiveMethod   Adaptive thresholding algorithm selector
+/// @param thresholdType    Thresholding type selector
+/// @param blockSize        Size of the pixel neighbourhood used to compute the threshold (assumed)
+/// @param C                Constant used by the threshold computation (assumed to be subtracted from the local mean)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_adaptiveThreshold(
+    const uchar*    src_data,
+    size_t          src_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          maxValue,
+    int             adaptiveMethod,
+    int             thresholdType,
+    int             blockSize,
+    double          C);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Blurs an image using a Gaussian filter.
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param width            Source image width
+/// @param height           Source image height
+/// @param depth            Depth of source and destination image
+/// @param cn               Number of channels
+/// @param margin_left      Left margins for source image
+/// @param margin_top       Top margins for source image
+/// @param margin_right     Right margins for source image
+/// @param margin_bottom    Bottom margins for source image
+/// @param ksize            Kernel size
+/// @param border_type      Border type
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_gaussianBlurBinomial(
+    const uchar*    src_data,
+    size_t          src_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             depth,
+    int             cn,
+    size_t          margin_left,
+    size_t          margin_top,
+    size_t          margin_right,
+    size_t          margin_bottom,
+    size_t          ksize,
+    int             border_type);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Applies a perspective transformation to an image.
+///
+/// @param src_type         Source and destination image type
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param src_width        Source image width
+/// @param src_height       Source image height
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param dst_width        Destination image width
+/// @param dst_height       Destination image height
+/// @param M                3x3 matrix with transform coefficients
+/// @param interpolation    Interpolation mode (CV_HAL_INTER_NEAREST, ...)
+/// @param border_type      Border processing mode (CV_HAL_BORDER_REFLECT, ...)
+/// @param border_value     Values to use for CV_HAL_BORDER_CONSTANT mode
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_warpPerspective(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             dst_width,
+    int             dst_height,
+    const double    M[9],
+    int             interpolation,
+    int             border_type,
+    const double    border_value[4]);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_pyrdown hook.
+///        NOTE(review): the implementation is not visible from this header; the parameter notes below
+///        mirror the documented siblings in this file - confirm against the source.
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param src_width        Source image width
+/// @param src_height       Source image height
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param dst_width        Destination image width
+/// @param dst_height       Destination image height
+/// @param depth            Depth of source and destination image
+/// @param cn               Number of channels
+/// @param border_type      Border type
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_pyrdown(
+    const uchar*     src_data,
+    size_t           src_step,
+    int              src_width,
+    int              src_height,
+    uchar*           dst_data,
+    size_t           dst_step,
+    int              dst_width,
+    int              dst_height,
+    int              depth,
+    int              cn,
+    int              border_type);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_cvtBGRtoHSV hook.
+///        NOTE(review): the implementation is not visible from this header; the flag descriptions are
+///        assumed to follow the OpenCV HAL hook it replaces - confirm against the source.
+/// @param src_data     Source image data
+/// @param src_step     Source image step
+/// @param dst_data     Destination image data
+/// @param dst_step     Destination image step
+/// @param width        Source image width
+/// @param height       Source image height
+/// @param depth        Depth of source and destination image
+/// @param scn          Number of channels in the source image
+/// @param swapBlue     Channel order flag (presumably swaps the blue and red channels - TODO confirm)
+/// @param isFullRange  Hue range selector (TODO confirm exact ranges)
+/// @param isHSV        Output colour space selector (presumably HSV when true - TODO confirm)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_cvtBGRtoHSV(
+    const uchar    * src_data,
+    size_t          src_step,
+    uchar          * dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             depth,
+    int             scn,
+    bool            swapBlue,
+    bool            isFullRange,
+    bool            isHSV);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief FastCV-backed replacement for the cv_hal_cvtBGRtoYUVApprox hook.
+///        NOTE(review): the implementation is not visible from this header; the flag descriptions are
+///        assumed to follow the OpenCV HAL hook it replaces - confirm against the source.
+/// @param src_data     Source image data
+/// @param src_step     Source image step
+/// @param dst_data     Destination image data
+/// @param dst_step     Destination image step
+/// @param width        Source image width
+/// @param height       Source image height
+/// @param depth        Depth of source and destination image
+/// @param scn          Number of channels in the source image
+/// @param swapBlue     Channel order flag (presumably swaps the blue and red channels - TODO confirm)
+/// @param isCbCr       Output variant selector (presumably YCbCr/YCrCb versus YUV - TODO confirm)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_cvtBGRtoYUVApprox(
+    const uchar    * src_data,
+    size_t          src_step,
+    uchar          * dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             depth,
+    int             scn,
+    bool            swapBlue,
+    bool            isCbCr);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Canny edge detector
+/// @param src_data Source image data
+/// @param src_step Source image step
+/// @param dst_data Destination image data
+/// @param dst_step Destination image step
+/// @param width Source image width
+/// @param height Source image height
+/// @param cn Number of channels
+/// @param lowThreshold low threshold value
+/// @param highThreshold high threshold value
+/// @param ksize Kernel size for Sobel operator.
+/// @param L2gradient Flag, indicating use of L2 or L1 norm.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_canny(
+    const uchar* 	src_data,
+    size_t          src_step,
+    uchar* 			dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             cn,
+    double          lowThreshold,
+    double          highThreshold,
+    int             ksize,
+    bool            L2gradient);
+
+#endif
--- a/3rdparty/fastcv/include/fastcv_hal_utils.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_utils.hpp
@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
+#define OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
+
+#include "fastcv.h"
+#include <opencv2/core/utils/logger.hpp>
+
+// Early-exit guard for HAL entry points: makes the enclosing function return
+// CV_HAL_ERROR_UNKNOWN when FastCvContext::getContext() reports that FastCV was
+// not initialized. Expands to a braced block containing a return statement, so
+// it can only be used inside a function whose return type accepts that code.
+#define INITIALIZATION_CHECK                                        \
+{                                                                   \
+    if (!FastCvContext::getContext().isInitialized)                 \
+    {                                                               \
+        return CV_HAL_ERROR_UNKNOWN;                                \
+    }                                                               \
+}
+
+// Converts a FastCV status into a HAL return code and returns it from the
+// enclosing function:
+//   - FASTCV_SUCCESS                              -> CV_HAL_ERROR_OK
+//   - FASTCV_EBADPARAM / FASTCV_EUNALIGNPARAM /
+//     FASTCV_EUNSUPPORTED / FASTCV_EHWQDSP /
+//     FASTCV_EHWGPU                               -> CV_HAL_ERROR_NOT_IMPLEMENTED
+//   - anything else                               -> CV_HAL_ERROR_UNKNOWN (logged as an error)
+// `func` is only stringified for the debug log message. `status` is expanded
+// several times, so pass a plain variable rather than an expression with side effects.
+#define CV_HAL_RETURN(status, func)                                         \
+{                                                                           \
+    if( status == FASTCV_SUCCESS )                                          \
+    {                                                                       \
+        CV_LOG_DEBUG(NULL, "FastCV HAL for "<<#func<<" run successfully!"); \
+        return CV_HAL_ERROR_OK;                                             \
+    }                                                                       \
+    else if(status == FASTCV_EBADPARAM || status == FASTCV_EUNALIGNPARAM || \
+            status == FASTCV_EUNSUPPORTED || status == FASTCV_EHWQDSP ||    \
+            status == FASTCV_EHWGPU)                                        \
+    {                                                                       \
+        CV_LOG_DEBUG(NULL, "FastCV status:"<<getFastCVErrorString(status)   \
+            <<", Switching to default OpenCV solution!");                   \
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;                                \
+    }                                                                       \
+    else                                                                    \
+    {                                                                       \
+        CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status));   \
+        return CV_HAL_ERROR_UNKNOWN;                                        \
+    }                                                                       \
+}
+
+// Logs `reason` at debug level and makes the enclosing function return
+// CV_HAL_ERROR_NOT_IMPLEMENTED. `reason` is streamed into the log message, so it
+// may be any value the logging stream accepts (string literal, cv::format result, ...).
+#define CV_HAL_RETURN_NOT_IMPLEMENTED(reason)                           \
+{                                                                       \
+    CV_LOG_DEBUG(NULL,"Switching to default OpenCV\nInfo: "<<reason);   \
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;                                \
+}
+
+// Number of bits the kernel size is shifted by inside a value built with FCV_MAKETYPE.
+#define FCV_KernelSize_SHIFT 3
+// Packs a kernel size and a depth into one integer: ksize goes above the low
+// FCV_KernelSize_SHIFT bits, depth is added on top. Arguments are parenthesised so
+// that expressions such as `a|b` or `x+1` expand with the intended precedence.
+#define FCV_MAKETYPE(ksize,depth) (((ksize)<<FCV_KernelSize_SHIFT) + (depth))
+// True when val1 and val2 differ by less than FLT_EPSILON. Arguments are
+// parenthesised so that compound expressions are subtracted as a whole.
+#define FCV_CMP_EQ(val1,val2) (fabs((val1) - (val2)) < FLT_EPSILON)
+
+// String helpers declared here; their definitions are not part of this header.
+// Returns a printable description of a FastCV status code (used by CV_HAL_RETURN for logging).
+const char* getFastCVErrorString(int status);
+// Presumably returns a printable name for a border mode - confirm against the definition.
+const char* borderToString(int border);
+// Presumably returns a printable name for an interpolation mode - confirm against the definition.
+const char* interpolationToString(int interpolation);
+
+// Process-wide FastCV state. The single instance is created on the first call to
+// getContext(); its constructor switches FastCV to FASTCV_OP_CPU_PERFORMANCE and
+// records in isInitialized whether that call succeeded.
+struct FastCvContext
+{
+    // Returns the shared instance, constructing it on first use
+    // (function-local static, so construction happens only once).
+    static FastCvContext& getContext()
+    {
+        static FastCvContext instance;
+        return instance;
+    }
+
+    FastCvContext()
+    {
+        // fcvSetOperationMode() returning 0 is treated as success.
+        isInitialized = (fcvSetOperationMode(FASTCV_OP_CPU_PERFORMANCE) == 0);
+
+        if (isInitialized)
+        {
+            CV_LOG_INFO(NULL, "FastCV Operation Mode Switched");
+        }
+        else
+        {
+            CV_LOG_WARNING(NULL, "Failed to switch FastCV operation mode");
+        }
+    }
+
+    // True when the operation mode was set successfully in the constructor.
+    bool isInitialized;
+};
+
+#endif
--- a/3rdparty/fastcv/src/fastcv_hal_core.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_core.cpp
@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "fastcv_hal_core.hpp"
+#include "fastcv_hal_utils.hpp"
+#include <opencv2/core/core.hpp>
+#include <opencv2/core/base.hpp>
+
+
+// Loop body for cv::parallel_for_ that runs fcvTableLookupu8 on one image row at a
+// time. A failing row is only reported through the error log; processing continues
+// with the next row.
+class ParallelTableLookup : public cv::ParallelLoopBody
+{
+public:
+
+    ParallelTableLookup(const uchar* src_data_, int width_, size_t src_step_, const uchar* lut_data_, uchar* dst_data_, size_t dst_step_)
+        : cv::ParallelLoopBody(),
+          src_data(src_data_),
+          width(width_),
+          src_step(src_step_),
+          lut_data(lut_data_),
+          dst_data(dst_data_),
+          dst_step(dst_step_)
+    {
+    }
+
+    // Processes rows [range.start, range.end).
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        for (int row = range.start; row < range.end; ++row)
+        {
+            // Start of the current row in the source and destination buffers.
+            uint8_t* src_row = (uint8_t*)src_data + row * src_step;
+            uint8_t* dst_row = (uint8_t*)dst_data + row * dst_step;
+
+            // Height is 1: each call handles exactly one row.
+            const fcvStatus row_status = fcvTableLookupu8(src_row, width, 1, src_step, (uint8_t*)lut_data, dst_row, dst_step);
+            if (row_status != FASTCV_SUCCESS)
+                CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(row_status));
+        }
+    }
+
+private:
+    const uchar* src_data;
+    int          width;
+    size_t       src_step;
+    const uchar* lut_data;
+    uchar*       dst_data;
+    size_t       dst_step;
+};
+
+// Look-up table transform backed by FastCV.
+//
+// Handles only CV_8UC1 sources with a single-channel LUT whose elements are one
+// byte wide, and only for images larger than 320x240 pixels; everything else is
+// handed back to OpenCV via CV_HAL_ERROR_NOT_IMPLEMENTED.
+//
+// Returns CV_HAL_ERROR_OK once the rows have been processed,
+// CV_HAL_ERROR_NOT_IMPLEMENTED for unsupported input, or CV_HAL_ERROR_UNKNOWN
+// when FastCV is not initialized.
+int fastcv_hal_lut(
+    const uchar*    src_data,
+    size_t          src_step,
+    size_t          src_type,
+    const uchar*    lut_data,
+    size_t          lut_channel_size,
+    size_t          lut_channels,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height)
+{
+    // Pixel count in 64 bits: width*height evaluated in int overflows (undefined
+    // behaviour) for very large images.
+    const int64_t pixel_count = (int64_t)width * (int64_t)height;
+    if(pixel_count <= (int64_t)(320*240))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
+
+    INITIALIZATION_CHECK;
+
+    if (src_type == CV_8UC1 && lut_channels == 1 && lut_channel_size == 1)
+    {
+        cv::parallel_for_(cv::Range(0, height),
+            ParallelTableLookup(src_data, width, src_step, lut_data, dst_data, dst_step));
+
+        // NOTE(review): per-row FastCV failures are only logged inside
+        // ParallelTableLookup; they are not propagated here, so success is reported.
+        fcvStatus status = FASTCV_SUCCESS;
+        CV_HAL_RETURN(status, hal_lut);
+    }
+    else
+    {
+        // The condition above rejects more than multi-channel input, so say so.
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Only CV_8UC1 input with a single-channel 8-bit LUT is supported");
+    }
+}
+
+// Hamming distance between two byte vectors of length n, computed with
+// fcvHammingDistanceu8 and stored in *result. Only cellSize == 1 is handled;
+// other cell sizes are handed back to OpenCV.
+int fastcv_hal_normHammingDiff8u(
+    const uchar*    a,
+    const uchar*    b,
+    int             n,
+    int             cellSize,
+    int*            result)
+{
+    if (cellSize != 1)
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("NORM_HAMMING2 cellSize:%d is not supported", cellSize));
+
+    INITIALIZATION_CHECK;
+
+    const uint32_t distance = fcvHammingDistanceu8((uint8_t*)a, (uint8_t*)b, n);
+    *result = distance;
+
+    // fcvHammingDistanceu8 yields the distance itself rather than a status,
+    // so the call is always reported as successful.
+    fcvStatus status = FASTCV_SUCCESS;
+    CV_HAL_RETURN(status, hal_normHammingDiff8u);
+}
+
+// Element-wise product of two 8-bit unsigned images stored as 16-bit unsigned,
+// computed with fcvElementMultiplyu8u16_v2. Only scale == 1.0 is handled; any
+// other scale is handed back to OpenCV.
+int fastcv_hal_mul8u16u(
+    const uchar*     src1_data,
+    size_t           src1_step,
+    const uchar*     src2_data,
+    size_t           src2_step,
+    ushort*          dst_data,
+    size_t           dst_step,
+    int              width,
+    int              height,
+    double           scale)
+{
+    if(scale != 1.0)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Scale factor not supported");
+
+    INITIALIZATION_CHECK;
+
+    // When both source steps are smaller than a row, replace all three steps
+    // with the tightly packed row size of the respective element type.
+    const size_t row_elems = (size_t)width;
+    if (src1_step < row_elems && src2_step < row_elems)
+    {
+        src1_step = row_elems * sizeof(uchar);
+        src2_step = row_elems * sizeof(uchar);
+        dst_step  = row_elems * sizeof(ushort);
+    }
+
+    fcvStatus status = fcvElementMultiplyu8u16_v2(src1_data, width, height, src1_step,
+                                                  src2_data, src2_step, dst_data, dst_step);
+
+    CV_HAL_RETURN(status,hal_multiply);
+}
+
+// Difference of two 8-bit unsigned images stored as 32-bit float, computed with
+// fcvImageDiffu8f32_v2.
+int fastcv_hal_sub8u32f(
+    const uchar*     src1_data,
+    size_t           src1_step,
+    const uchar*     src2_data,
+    size_t           src2_step,
+    float*           dst_data,
+    size_t           dst_step,
+    int              width,
+    int              height)
+{
+    INITIALIZATION_CHECK;
+
+    // When both source steps are smaller than a row, replace all three steps
+    // with the tightly packed row size of the respective element type.
+    const size_t row_elems = (size_t)width;
+    if (src1_step < row_elems && src2_step < row_elems)
+    {
+        src1_step = row_elems * sizeof(uchar);
+        src2_step = row_elems * sizeof(uchar);
+        dst_step  = row_elems * sizeof(float);
+    }
+
+    fcvStatus status = fcvImageDiffu8f32_v2(src1_data, src2_data, width, height, src1_step,
+                                            src2_step, dst_data, dst_step);
+
+    CV_HAL_RETURN(status,hal_subtract);
+
+}
+
+// Transposes a 2D array with FastCV. Elements of 1, 2 and 4 bytes are dispatched
+// to the u8, u16 and f32 transpose routines respectively; other element sizes
+// and in-place calls are handed back to OpenCV.
+int fastcv_hal_transpose2d(
+    const uchar*     src_data,
+    size_t           src_step,
+    uchar*           dst_data,
+    size_t           dst_step,
+    int              src_width,
+    int              src_height,
+    int              element_size)
+{
+    INITIALIZATION_CHECK;
+
+    if (src_data == dst_data)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("In-place not supported");
+
+    fcvStatus status = FASTCV_SUCCESS;
+
+    if (element_size == 1)
+    {
+        status = fcvTransposeu8_v2(src_data, src_width, src_height, src_step,
+                                   dst_data, dst_step);
+    }
+    else if (element_size == 2)
+    {
+        status = fcvTransposeu16_v2((const uint16_t*)src_data, src_width, src_height,
+                                    src_step, (uint16_t*)dst_data, dst_step);
+    }
+    else if (element_size == 4)
+    {
+        status = fcvTransposef32_v2((const float32_t*)src_data, src_width, src_height,
+                                    src_step, (float32_t*)dst_data, dst_step);
+    }
+    else
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED("srcType not supported");
+    }
+
+    CV_HAL_RETURN(status,hal_transpose);
+}
+
+// Mean and standard deviation of a CV_8UC1 image, computed with
+// fcvImageIntensityStats_v2 (biased variance estimator).
+//
+// mean_val / stddev_val may individually be null; at least one must be non-null.
+// Masked calls and other source types are handed back to OpenCV.
+// The outputs are written only when FastCV reports success; otherwise they are
+// left untouched and the FastCV status is mapped to a HAL error code.
+int fastcv_hal_meanStdDev(
+    const uchar*      src_data,
+    size_t            src_step,
+    int               width,
+    int               height,
+    int               src_type,
+    double*           mean_val,
+    double*           stddev_val,
+    uchar*            mask,
+    size_t            mask_step)
+{
+    INITIALIZATION_CHECK;
+
+    CV_UNUSED(mask_step);
+
+    if(src_type != CV_8UC1)
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED("src type not supported");
+    }
+    else if(mask != nullptr)
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED("mask not supported");
+    }
+    else if(mean_val == nullptr && stddev_val == nullptr)
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED("null ptr for mean and stddev");
+    }
+
+    // Zero-initialized so that nothing indeterminate can ever be read if the
+    // FastCV call fails without writing its outputs.
+    float32_t mean = 0.f, variance = 0.f;
+
+    fcvStatus status = fcvImageIntensityStats_v2(src_data, src_step, 0, 0, width, height,
+                                   &mean, &variance, FASTCV_BIASED_VARIANCE_ESTIMATOR);
+
+    // Publish results only on success: on failure the caller falls back to the
+    // default implementation (or reports the error) and must not see partial data.
+    if(status == FASTCV_SUCCESS)
+    {
+        if(mean_val != nullptr)
+            *mean_val = mean;
+        if(stddev_val != nullptr)
+            *stddev_val = std::sqrt(variance);
+    }
+
+    CV_HAL_RETURN(status,hal_meanStdDev);
+}
+
+// Flips a 2D array with FastCV.
+//
+// Handled: CV_8UC1, CV_16UC1 and CV_8UC3 images larger than 640x480 pixels, with
+// flip_mode 1 (around the y-axis) or -1 (both axes). flip_mode 0 (around the
+// x-axis), other types, smaller images and invalid modes are handed back to
+// OpenCV via CV_HAL_ERROR_NOT_IMPLEMENTED.
+int fastcv_hal_flip(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             flip_mode)
+{
+    INITIALIZATION_CHECK;
+
+    if(src_type!=CV_8UC1 && src_type!=CV_16UC1 && src_type!=CV_8UC3)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Data type is not supported, Switching to default OpenCV solution!");
+
+    // Pixel count in 64 bits: src_width*src_height evaluated in int overflows
+    // (undefined behaviour) for very large images.
+    if(((int64_t)src_width*(int64_t)src_height)<=(int64_t)(640*480))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
+
+    //Flip around X-Axis: Vertical Flip or FLIP_ROWS - deliberately left to OpenCV
+    if(flip_mode == 0)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution due to low perf!");
+
+    fcvStatus       status = FASTCV_SUCCESS;
+    fcvFlipDir      dir;
+
+    switch (flip_mode)
+    {
+        //Flip around Y-Axis: Horizontal Flip or FLIP_COLS
+        case 1:
+            dir = FASTCV_FLIP_HORIZ;
+            break;
+
+        //Flip around both X and Y-Axis or FLIP_BOTH
+        case -1:
+            dir = FASTCV_FLIP_BOTH;
+            break;
+        default:
+            CV_HAL_RETURN_NOT_IMPLEMENTED("Invalid flip_mode, Switching to default OpenCV solution!");
+    }
+
+    // src_type is one of the three supported types here (checked above).
+    // Only the RGB888 variant reports a status; the other two return nothing.
+    if(src_type==CV_8UC1)
+        fcvFlipu8(src_data, src_width, src_height, src_step, dst_data, dst_step, dir);
+    else if(src_type==CV_16UC1)
+        fcvFlipu16((uint16_t*)src_data, src_width, src_height, src_step, (uint16_t*)dst_data, dst_step, dir);
+    else // CV_8UC3
+        status = fcvFlipRGB888u8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data, dst_step, dir);
+
+    CV_HAL_RETURN(status, hal_flip);
+}
+
+// HAL hook: rotation by 90, 180 or 270 degrees through FastCV.
+//
+// Supported inputs are CV_8UC1 and CV_8UC2 images of at least 120*80 pixels.
+// Everything else is reported through CV_HAL_RETURN_NOT_IMPLEMENTED.
+// The return value is produced by CV_HAL_RETURN from the FastCV status.
+int fastcv_hal_rotate(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             angle)
+{
+    // Small images are left to the default implementation.
+    if((src_width*src_height)<(120*80))
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution for lower resolution!");
+    }
+
+    const bool oneChannel = (src_type == CV_8UC1);
+    const bool twoChannels = (src_type == CV_8UC2);
+
+    if (!oneChannel && !twoChannels)
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
+    }
+
+    INITIALIZATION_CHECK;
+
+    // Map the requested angle onto the FastCV rotation enum.
+    fcvRotateDegree     rotation;
+    if (angle == 90)
+    {
+        rotation = FASTCV_ROTATE_90;
+    }
+    else if (angle == 180)
+    {
+        rotation = FASTCV_ROTATE_180;
+    }
+    else if (angle == 270)
+    {
+        rotation = FASTCV_ROTATE_270;
+    }
+    else
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Rotation angle:%d is not supported", angle));
+    }
+
+    // Pick the FastCV routine by channel layout.
+    fcvStatus           result;
+    if (oneChannel)
+    {
+        result = fcvRotateImageu8(src_data, src_width, src_height, src_step, dst_data, dst_step, rotation);
+    }
+    else if (twoChannels)
+    {
+        result = fcvRotateImageInterleavedu8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data,
+                                                dst_step, rotation);
+    }
+    else
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
+    }
+
+    CV_HAL_RETURN(result, hal_rotate);
+}
+
+// HAL hook: weighted sum of two 8-bit images through fcvAddWeightedu8_v2.
+//
+// scalars[0], scalars[1] and scalars[2] are forwarded to FastCV in that order (the log
+// message names them Alpha, Beta, Gamma). The first two must lie in [-128, 128) and the
+// third in [-(1<<23), 1<<23); otherwise the call is reported through
+// CV_HAL_RETURN_NOT_IMPLEMENTED.
+// The work is split with cv::parallel_for_: by columns for single-row input, by rows
+// otherwise. The FastCV return values are not inspected; the reported status is always
+// FASTCV_SUCCESS once the range checks pass.
+int fastcv_hal_addWeighted8u(
+    const uchar*    src1_data,
+    size_t          src1_step,
+    const uchar*    src2_data,
+    size_t          src2_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    const double    scalars[3])
+{
+    const bool alphaRejected = (scalars[0] < -128.0f) || (scalars[0] >= 128.0f);
+    const bool betaRejected  = (scalars[1] < -128.0f) || (scalars[1] >= 128.0f);
+    const bool gammaRejected = (scalars[2] < -(1<<23)) || (scalars[2] >= (1<<23));
+
+    if( alphaRejected || betaRejected || gammaRejected )
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED(
+            cv::format("Alpha:%f,Beta:%f,Gamma:%f is not supported because it's too large or too small\n",
+            scalars[0],scalars[1],scalars[2]));
+    }
+
+    INITIALIZATION_CHECK;
+
+    fcvStatus status = FASTCV_SUCCESS;
+
+    if (height != 1)
+    {
+        // General case: every worker processes a band of complete rows.
+        cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &rows){
+            const int bandHeight = rows.end - rows.start;
+            const uint8_t *in1 = src1_data + rows.start * src1_step;
+            const uint8_t *in2 = src2_data + rows.start * src2_step;
+            uint8_t *out = dst_data + rows.start * dst_step;
+            fcvAddWeightedu8_v2(in1, bandHeight == 0 ? width : width, bandHeight, src1_step, in2, src2_step,
+                scalars[0], scalars[1], scalars[2], out, dst_step);
+            });
+    }
+    else
+    {
+        // Single row: the strides are replaced by the row width and the row is split by columns.
+        src1_step = width*sizeof(uchar);
+        src2_step = width*sizeof(uchar);
+        dst_step  = width*sizeof(uchar);
+
+        cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &cols){
+            const int sliceWidth = cols.end - cols.start;
+            const uint8_t *in1 = src1_data + cols.start;
+            const uint8_t *in2 = src2_data + cols.start;
+            uint8_t *out = dst_data + cols.start;
+            fcvAddWeightedu8_v2(in1, sliceWidth, height, src1_step, in2, src2_step,
+                scalars[0], scalars[1], scalars[2], out, dst_step);
+            });
+    }
+
+    CV_HAL_RETURN(status, hal_addWeighted8u_v2);
+}
+
+// Translates an OpenCV multiply `scale` into the shift-style scale factor passed to
+// fcvElementMultiplyu8 / fcvElementMultiplys16.
+//
+// Accepted scales are the powers of two 2^k with k in [-8, 8] (compared with
+// FCV_CMP_EQ); the resulting factor is sF = -k, e.g. scale 4.0 -> -2, scale 1/8 -> 3.
+// Returns true and sets sF on a match, false (sF untouched) for any other scale,
+// including zero, negative and NaN values.
+static bool fastcv_hal_mulScaleToShift(double scale, int8_t& sF)
+{
+    if(FCV_CMP_EQ(scale,1.0))
+    {
+        sF = 0;
+        return true;
+    }
+
+    // Only scales strictly above 1 or strictly inside (0, 1) are candidates.
+    const bool upscale = scale > 1.0;
+    if(!upscale && !(scale > 0 && scale < 1.0))
+        return false;
+
+    // Candidates are tried from 2^1 (or 2^-1) up to 2^8 (or 2^-8).
+    double pow2 = 2.0;
+    for(int k = 1; k <= 8; ++k, pow2 *= 2.0)
+    {
+        const double candidate = upscale ? pow2 : 1.0/pow2;
+        if(FCV_CMP_EQ(scale,candidate))
+        {
+            sF = static_cast<int8_t>(upscale ? -k : k);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// HAL hook: saturating element-wise multiply of two 8-bit images through FastCV.
+//
+// `scale` must be a power of two between 1/256 and 256; any other value is reported
+// through CV_HAL_RETURN_NOT_IMPLEMENTED. The work is split with cv::parallel_for_ into
+// column slices for single-row input and row bands otherwise. The FastCV return
+// values are not inspected; the reported status is always FASTCV_SUCCESS.
+int fastcv_hal_mul8u(
+    const uchar     *src1_data,
+    size_t          src1_step,
+    const uchar     *src2_data,
+    size_t          src2_step,
+    uchar           *dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          scale)
+{
+    int8_t sF = 0;
+
+    if(!fastcv_hal_mulScaleToShift(scale, sF))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
+
+    INITIALIZATION_CHECK;
+
+    int nStripes = cv::getNumThreads();
+
+    if(height == 1)
+    {
+        // Single row: split by columns; a stride of 0 is passed for every buffer.
+        cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
+                      int rangeWidth = range.end - range.start;
+                      const uchar* yS1 =  src1_data + static_cast<size_t>(range.start);
+                      const uchar* yS2 =  src2_data + static_cast<size_t>(range.start);
+                      uchar* yD = dst_data + static_cast<size_t>(range.start);
+                      fcvElementMultiplyu8(yS1, rangeWidth, 1, 0, yS2, 0, sF,
+                                            FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
+                      }, nStripes);
+    }
+    else
+    {
+        // General case: split by rows; the steps are byte strides.
+        cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
+                      int rangeHeight = range.end - range.start;
+                      const uchar* yS1 =  src1_data + static_cast<size_t>(range.start)*src1_step;
+                      const uchar* yS2 =  src2_data + static_cast<size_t>(range.start)*src2_step;
+                      uchar* yD = dst_data + static_cast<size_t>(range.start)*dst_step;
+                      fcvElementMultiplyu8(yS1, width, rangeHeight, src1_step, yS2, src2_step,
+                                            sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
+                      }, nStripes);
+    }
+
+    fcvStatus status = FASTCV_SUCCESS;
+    CV_HAL_RETURN(status, hal_mul8u);
+}
+
+// HAL hook: saturating element-wise multiply of two 16-bit signed images through FastCV.
+//
+// Same contract as fastcv_hal_mul8u: `scale` must be a power of two between 1/256 and
+// 256, otherwise the call is reported through CV_HAL_RETURN_NOT_IMPLEMENTED. The steps
+// are byte strides, so row offsets on the short pointers are divided by sizeof(short).
+int fastcv_hal_mul16s(
+    const short     *src1_data,
+    size_t          src1_step,
+    const short     *src2_data,
+    size_t          src2_step,
+    short           *dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          scale)
+{
+    int8_t sF = 0;
+
+    if(!fastcv_hal_mulScaleToShift(scale, sF))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
+
+    INITIALIZATION_CHECK;
+
+    int nStripes = cv::getNumThreads();
+
+    if(height == 1)
+    {
+        // Single row: split by columns; a stride of 0 is passed for every buffer.
+        cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
+                      int rangeWidth = range.end - range.start;
+                      const short* yS1 =  src1_data + static_cast<size_t>(range.start);
+                      const short* yS2 =  src2_data + static_cast<size_t>(range.start);
+                      short* yD = dst_data + static_cast<size_t>(range.start);
+                      fcvElementMultiplys16(yS1, rangeWidth, 1, 0, yS2, 0, sF,
+                                             FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
+                      }, nStripes);
+    }
+    else
+    {
+        // General case: split by rows.
+        cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
+                      int rangeHeight = range.end - range.start;
+                      const short* yS1 =  src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(short));
+                      const short* yS2 =  src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(short));
+                      short* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(short));
+                      fcvElementMultiplys16(yS1, width, rangeHeight, src1_step, yS2, src2_step,
+                                                sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
+                      }, nStripes);
+    }
+
+    fcvStatus status = FASTCV_SUCCESS;
+    CV_HAL_RETURN(status, hal_mul16s);
+}
+
+// HAL hook: element-wise multiply of two float images through fcvElementMultiplyf32.
+//
+// Only a scale equal to 1.0 (per FCV_CMP_EQ) is handled; any other scale is reported
+// through CV_HAL_RETURN_NOT_IMPLEMENTED. The steps are byte strides. The FastCV return
+// values are not inspected; the reported status is always FASTCV_SUCCESS.
+int fastcv_hal_mul32f(
+    const float    *src1_data,
+    size_t          src1_step,
+    const float    *src2_data,
+    size_t          src2_step,
+    float          *dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    double          scale)
+{
+    if(!FCV_CMP_EQ(scale,1.0))
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
+    }
+
+    INITIALIZATION_CHECK;
+
+    int nStripes = cv::getNumThreads();
+
+    if(height != 1)
+    {
+        // General case: each worker multiplies a band of complete rows.
+        cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &rows){
+                      const size_t firstRow = static_cast<size_t>(rows.start);
+                      const int bandHeight = rows.end - rows.start;
+                      const float* in1 = src1_data + firstRow * (src1_step/sizeof(float));
+                      const float* in2 = src2_data + firstRow * (src2_step/sizeof(float));
+                      float* out = dst_data + firstRow * (dst_step/sizeof(float));
+                      fcvElementMultiplyf32(in1, width, bandHeight, src1_step,
+                                                  in2, src2_step, out, dst_step);
+                      }, nStripes);
+    }
+    else
+    {
+        // Single row: split by columns; a stride of 0 is passed for every buffer.
+        cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &cols){
+                      const size_t firstCol = static_cast<size_t>(cols.start);
+                      const int sliceWidth = cols.end - cols.start;
+                      const float* in1 = src1_data + firstCol;
+                      const float* in2 = src2_data + firstCol;
+                      float* out = dst_data + firstCol;
+                      fcvElementMultiplyf32(in1, sliceWidth, 1, 0, in2, 0, out, 0);
+                      }, nStripes);
+    }
+
+    fcvStatus status = FASTCV_SUCCESS;
+    CV_HAL_RETURN(status, hal_mul32f);
+}
--- a/3rdparty/fastcv/src/fastcv_hal_imgproc.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_imgproc.cpp
--- a/3rdparty/fastcv/src/fastcv_hal_utils.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_utils.cpp
@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "fastcv_hal_utils.hpp"
+
+// Returns a static, human-readable description of a FastCV status code.
+// Codes that are not listed yield "Unknown FastCV Error".
+const char* getFastCVErrorString(int status)
+{
+    const char* description = "Unknown FastCV Error";
+
+    switch(status)
+    {
+        case FASTCV_SUCCESS:
+            description = "Successful";
+            break;
+        case FASTCV_EFAIL:
+            description = "General failure";
+            break;
+        case FASTCV_EUNALIGNPARAM:
+            description = "Unaligned pointer parameter";
+            break;
+        case FASTCV_EBADPARAM:
+            description = "Bad parameters";
+            break;
+        case FASTCV_EINVALSTATE:
+            description = "Called at invalid state";
+            break;
+        case FASTCV_ENORES:
+            description = "Insufficient resources, memory, thread, etc";
+            break;
+        case FASTCV_EUNSUPPORTED:
+            description = "Unsupported feature";
+            break;
+        case FASTCV_EHWQDSP:
+            description = "Hardware QDSP failed to respond";
+            break;
+        case FASTCV_EHWGPU:
+            description = "Hardware GPU failed to respond";
+            break;
+        default:
+            break;
+    }
+
+    return description;
+}
+
+// Returns the name of a border mode given as its integer value (0..5).
+// Any other value yields "Unknown border type".
+const char* borderToString(int border)
+{
+    // Indexed directly by the border value.
+    static const char* const names[] = {
+        "BORDER_CONSTANT",
+        "BORDER_REPLICATE",
+        "BORDER_REFLECT",
+        "BORDER_WRAP",
+        "BORDER_REFLECT_101",
+        "BORDER_TRANSPARENT"
+    };
+    const int count = static_cast<int>(sizeof(names) / sizeof(names[0]));
+
+    if (border < 0 || border >= count)
+        return "Unknown border type";
+
+    return names[border];
+}
+
+// Returns the name of an interpolation / warp flag given as its integer value.
+// Values 0..8, 16 and 32 are recognised; anything else (including combined flags)
+// yields "Unknown interpolation type".
+const char* interpolationToString(int interpolation)
+{
+    // Names for the contiguous values 0..8, indexed directly.
+    static const char* const lowNames[] = {
+        "INTER_NEAREST",
+        "INTER_LINEAR",
+        "INTER_CUBIC",
+        "INTER_AREA",
+        "INTER_LANCZOS4",
+        "INTER_LINEAR_EXACT",
+        "INTER_NEAREST_EXACT",
+        "INTER_MAX",
+        "WARP_FILL_OUTLIERS"
+    };
+    const int lowCount = static_cast<int>(sizeof(lowNames) / sizeof(lowNames[0]));
+
+    if (interpolation >= 0 && interpolation < lowCount)
+        return lowNames[interpolation];
+
+    if (interpolation == 16)
+        return "WARP_INVERSE_MAP";
+
+    if (interpolation == 32)
+        return "WARP_RELATIVE_MAP";
+
+    return "Unknown interpolation type";
+}
--- a/3rdparty/ffmpeg/ffmpeg.cmake
+++ b/3rdparty/ffmpeg/ffmpeg.cmake
@ -1,8 +1,8 @@
-# Binaries branch name: ffmpeg/4.x_20240522
-# Binaries were created for OpenCV: 8393885a39dac1e650bf5d0aaff84c04ad8bcdd3
-ocv_update(FFMPEG_BINARIES_COMMIT "394dca6ceb3085c979415e6385996b6570e94153")
-ocv_update(FFMPEG_FILE_HASH_BIN32 "bdfbd1efb295f3e54c07d2cb7a843bf9")
-ocv_update(FFMPEG_FILE_HASH_BIN64 "bfef029900f788480a363d6dc05c4f0e")
+# Binaries branch name: ffmpeg/4.x_20241226
+# Binaries were created for OpenCV: 09892c9d1706f40342bda0bc404580f63492d9f8
+ocv_update(FFMPEG_BINARIES_COMMIT "d63d7c154c57242bf2283be61166be2bd30ec47e")
+ocv_update(FFMPEG_FILE_HASH_BIN32 "642b94d032a8292b07550126934173f6")
+ocv_update(FFMPEG_FILE_HASH_BIN64 "a8c3560c8f20e1ae465bef81580fa92c")
 ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")

 function(download_win_ffmpeg script_var)
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@ -19,4 +19,15 @@
 #include "version/hal_rvv_071.hpp"
 #endif

-#endif
+#if defined(__riscv_v) && __riscv_v == 1000000
+#include "hal_rvv_1p0/merge.hpp" // core
+#include "hal_rvv_1p0/mean.hpp" // core
+#include "hal_rvv_1p0/norm.hpp" // core
+#include "hal_rvv_1p0/norm_diff.hpp" // core
+#include "hal_rvv_1p0/convert_scale.hpp" // core
+#include "hal_rvv_1p0/minmax.hpp" // core
+#include "hal_rvv_1p0/atan.hpp" // core
+#include "hal_rvv_1p0/split.hpp" // core
+#endif
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
@ -0,0 +1,128 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level
+// directory of this distribution and at http://opencv.org/license.html.
+#pragma once
+
+#undef cv_hal_fastAtan32f
+#define cv_hal_fastAtan32f cv::cv_hal_rvv::fast_atan_32
+
+#undef cv_hal_fastAtan64f
+#define cv_hal_fastAtan64f cv::cv_hal_rvv::fast_atan_64
+
+#include <riscv_vector.h>
+
+#include <cfloat>
+
+namespace cv::cv_hal_rvv {
+
+// Internal helpers shared by fast_atan_32 and fast_atan_64.
+namespace detail {
+// ref: mathfuncs_core.simd.hpp
+// Odd-polynomial coefficients of the atan approximation; each one is multiplied by
+// (180 / pi), so the polynomial evaluates directly in degrees.
+static constexpr float pi = CV_PI;
+static constexpr float atan2_p1 = 0.9997878412794807F * (180 / pi);
+static constexpr float atan2_p3 = -0.3258083974640975F * (180 / pi);
+static constexpr float atan2_p5 = 0.1555786518463281F * (180 / pi);
+static constexpr float atan2_p7 = -0.04432655554792128F * (180 / pi);
+
+// Computes the angle of (vx, vy) for `vl` float lanes.
+//
+// vy, vx        : input coordinates
+// p7            : highest-order coefficient, passed as a scalar
+// vp5, vp3, vp1 : remaining coefficients, passed pre-broadcast as vectors
+// angle_90_deg  : the value of a right angle in the caller's output unit
+//
+// The coefficients and angle_90_deg are expected to be in the same unit; callers in
+// this file scale all of them by the same factor.
+__attribute__((always_inline)) inline vfloat32m4_t
+rvv_atan_f32(vfloat32m4_t vy, vfloat32m4_t vx, size_t vl, float p7,
+             vfloat32m4_t vp5, vfloat32m4_t vp3, vfloat32m4_t vp1,
+             float angle_90_deg) {
+    const auto ax = __riscv_vfabs(vx, vl);
+    const auto ay = __riscv_vfabs(vy, vl);
+    // c = min(|x|, |y|) / (max(|x|, |y|) + FLT_EPSILON); the epsilon keeps the
+    // division defined when both inputs are zero.
+    const auto c = __riscv_vfdiv(
+        __riscv_vfmin(ax, ay, vl),
+        __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), vl);
+    const auto c2 = __riscv_vfmul(c, c, vl);
+
+    // Horner evaluation: a = (((p7*c2 + p5)*c2 + p3)*c2 + p1) * c
+    auto a = __riscv_vfmadd(c2, p7, vp5, vl);
+    a = __riscv_vfmadd(a, c2, vp3, vl);
+    a = __riscv_vfmadd(a, c2, vp1, vl);
+    a = __riscv_vfmul(a, c, vl);
+
+    // Lanes with |x| < |y|: a = 90deg - a (other lanes keep their value).
+    const auto mask = __riscv_vmflt(ax, ay, vl);
+    a = __riscv_vfrsub_mu(mask, a, a, angle_90_deg, vl);
+
+    // Lanes with x < 0: a = 180deg - a.
+    a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, angle_90_deg * 2,
+                          vl);
+    // Lanes with y < 0: a = 360deg - a.
+    a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, angle_90_deg * 4,
+                          vl);
+
+    return a;
+}
+
+} // namespace detail
+
+// HAL implementation of fastAtan for float arrays.
+//
+// For each of the n elements writes the angle of (x[i], y[i]) into dst[i], in degrees
+// when angle_in_deg is true and scaled by CV_PI / 180 (radians) otherwise.
+// Always returns CV_HAL_ERROR_OK.
+inline int fast_atan_32(const float *y, const float *x, float *dst, size_t n,
+                        bool angle_in_deg) {
+    // The coefficients in detail:: are in degrees; one common factor rescales them
+    // (and the right angle) to the requested unit.
+    const float scale = angle_in_deg ? 1.f : CV_PI / 180.f;
+    const float p1 = detail::atan2_p1 * scale;
+    const float p3 = detail::atan2_p3 * scale;
+    const float p5 = detail::atan2_p5 * scale;
+    const float p7 = detail::atan2_p7 * scale;
+    const float angle_90_deg = 90.F * scale;
+
+    // Plain local instead of a function-local static: querying VLMAX is cheap and this
+    // avoids a guarded static in an inline header function.
+    const size_t vlmax = __riscv_vsetvlmax_e32m4();
+    // Broadcast once, outside the loop.
+    auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax);
+    auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax);
+    auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax);
+
+    // Strip-mined loop: each pass handles the vl elements granted by vsetvl.
+    for (size_t vl{}; n > 0; n -= vl) {
+        vl = __riscv_vsetvl_e32m4(n);
+
+        auto vy = __riscv_vle32_v_f32m4(y, vl);
+        auto vx = __riscv_vle32_v_f32m4(x, vl);
+
+        auto a =
+            detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg);
+
+        __riscv_vse32(dst, a, vl);
+
+        x += vl;
+        y += vl;
+        dst += vl;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// HAL implementation of fastAtan for double arrays.
+//
+// Inputs are narrowed to float, evaluated with the same float kernel as fast_atan_32,
+// and the result is widened back to double. Output is in degrees when angle_in_deg is
+// true and scaled by CV_PI / 180 (radians) otherwise. Always returns CV_HAL_ERROR_OK.
+inline int fast_atan_64(const double *y, const double *x, double *dst, size_t n,
+                        bool angle_in_deg) {
+    // this also uses float32 version, ref: mathfuncs_core.simd.hpp
+
+    const float scale = angle_in_deg ? 1.f : CV_PI / 180.f;
+    const float p1 = detail::atan2_p1 * scale;
+    const float p3 = detail::atan2_p3 * scale;
+    const float p5 = detail::atan2_p5 * scale;
+    const float p7 = detail::atan2_p7 * scale;
+    const float angle_90_deg = 90.F * scale;
+
+    // Plain local instead of a function-local static: querying VLMAX is cheap and this
+    // avoids a guarded static in an inline header function.
+    const size_t vlmax = __riscv_vsetvlmax_e32m4();
+    // Broadcast once, outside the loop.
+    auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax);
+    auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax);
+    auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax);
+
+    // Strip-mined loop; vl is chosen for e64/m8 so the narrowed data fits e32/m4.
+    for (size_t vl{}; n > 0; n -= vl) {
+        vl = __riscv_vsetvl_e64m8(n);
+
+        auto wy = __riscv_vle64_v_f64m8(y, vl);
+        auto wx = __riscv_vle64_v_f64m8(x, vl);
+
+        // double -> float
+        auto vy = __riscv_vfncvt_f_f_w_f32m4(wy, vl);
+        auto vx = __riscv_vfncvt_f_f_w_f32m4(wx, vl);
+
+        auto a =
+            detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg);
+
+        // float -> double
+        auto wa = __riscv_vfwcvt_f_f_v_f64m8(a, vl);
+
+        __riscv_vse64(dst, wa, vl);
+
+        x += vl;
+        y += vl;
+        dst += vl;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace cv::cv_hal_rvv
--- a/3rdparty/hal_rvv/hal_rvv_1p0/convert_scale.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/convert_scale.hpp
@ -0,0 +1,120 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED
+#define OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_convertScale
+#define cv_hal_convertScale cv::cv_hal_rvv::convertScale
+
+// Computes dst = src * alpha + beta for 8-bit unsigned source and destination.
+//
+// src/dst and their steps are byte pointers and byte strides; width and height are in
+// elements. alpha and beta are narrowed to float before use. Always returns
+// CV_HAL_ERROR_OK.
+inline int convertScale_8U8U(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta)
+{
+    // beta is broadcast once; alpha stays scalar for the vector-scalar multiply-add.
+    int vlmax = __riscv_vsetvlmax_e32m8();
+    auto vec_b = __riscv_vfmv_v_f_f32m8(beta, vlmax);
+    float a = alpha;
+
+    for (int i = 0; i < height; i++)
+    {
+        const uchar* src_row = src + i * src_step;
+        uchar* dst_row = dst + i * dst_step;
+        int vl;
+        for (int j = 0; j < width; j += vl)
+        {
+            vl = __riscv_vsetvl_e8m2(width - j);
+            // u8 -> u16 -> f32
+            auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+            auto vec_src_u16 = __riscv_vzext_vf2(vec_src, vl);
+            auto vec_src_f32 = __riscv_vfwcvt_f(vec_src_u16, vl);
+            // src * a + b
+            auto vec_fma = __riscv_vfmadd(vec_src_f32, a, vec_b, vl);
+            // f32 -> u16 -> u8; the final narrowing uses a shift of 0 with RNU rounding
+            auto vec_dst_u16 = __riscv_vfncvt_xu(vec_fma, vl);
+            auto vec_dst = __riscv_vnclipu(vec_dst_u16, 0, __RISCV_VXRM_RNU, vl);
+            __riscv_vse8_v_u8m2(dst_row + j, vec_dst, vl);
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Computes dst = src * alpha + beta for 8-bit unsigned source and float destination.
+//
+// dst is passed as a byte pointer and reinterpreted as float per row; both steps are
+// byte strides. alpha and beta are narrowed to float before use. Always returns
+// CV_HAL_ERROR_OK.
+inline int convertScale_8U32F(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta)
+{
+    // beta is broadcast once; alpha stays scalar for the vector-scalar multiply-add.
+    int vlmax = __riscv_vsetvlmax_e32m8();
+    auto vec_b = __riscv_vfmv_v_f_f32m8(beta, vlmax);
+    float a = alpha;
+
+    for (int i = 0; i < height; i++)
+    {
+        const uchar* src_row = src + i * src_step;
+        float* dst_row = reinterpret_cast<float*>(dst + i * dst_step);
+        int vl;
+        for (int j = 0; j < width; j += vl)
+        {
+            vl = __riscv_vsetvl_e8m2(width - j);
+            // u8 -> u16 -> f32
+            auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+            auto vec_src_u16 = __riscv_vzext_vf2(vec_src, vl);
+            auto vec_src_f32 = __riscv_vfwcvt_f(vec_src_u16, vl);
+            // src * a + b, stored without further conversion
+            auto vec_fma = __riscv_vfmadd(vec_src_f32, a, vec_b, vl);
+            __riscv_vse32_v_f32m8(dst_row + j, vec_fma, vl);
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Computes dst = src * alpha + beta for float source and float destination.
+//
+// src and dst are passed as byte pointers and reinterpreted as float per row; both
+// steps are byte strides. alpha and beta are narrowed to float before use. Always
+// returns CV_HAL_ERROR_OK.
+inline int convertScale_32F32F(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta)
+{
+    // beta is broadcast once; alpha stays scalar for the vector-scalar multiply-add.
+    int vlmax = __riscv_vsetvlmax_e32m8();
+    auto vec_b = __riscv_vfmv_v_f_f32m8(beta, vlmax);
+    float a = alpha;
+
+    for (int i = 0; i < height; i++)
+    {
+        const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+        float* dst_row = reinterpret_cast<float*>(dst + i * dst_step);
+        int vl;
+        for (int j = 0; j < width; j += vl)
+        {
+            vl = __riscv_vsetvl_e32m8(width - j);
+            auto vec_src = __riscv_vle32_v_f32m8(src_row + j, vl);
+            // src * a + b
+            auto vec_fma = __riscv_vfmadd(vec_src, a, vec_b, vl);
+            __riscv_vse32_v_f32m8(dst_row + j, vec_fma, vl);
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// HAL entry point for convertScale: dispatches on the (source depth, destination depth)
+// pair. Handled pairs are 8U->8U, 8U->32F and 32F->32F; every other pair returns
+// CV_HAL_ERROR_NOT_IMPLEMENTED. A null dst returns CV_HAL_ERROR_OK without doing any work.
+inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height,
+                        int sdepth, int ddepth, double alpha, double beta)
+{
+    if (dst == nullptr)
+    {
+        return CV_HAL_ERROR_OK;
+    }
+
+    if (sdepth == CV_8U && ddepth == CV_8U)
+    {
+        return convertScale_8U8U(src, src_step, dst, dst_step, width, height, alpha, beta);
+    }
+
+    if (sdepth == CV_8U && ddepth == CV_32F)
+    {
+        return convertScale_8U32F(src, src_step, dst, dst_step, width, height, alpha, beta);
+    }
+
+    if (sdepth == CV_32F && ddepth == CV_32F)
+    {
+        return convertScale_32F32F(src, src_step, dst, dst_step, width, height, alpha, beta);
+    }
+
+    // Unsupported depth combination.
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/mean.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/mean.hpp
@ -0,0 +1,228 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
+#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_meanStdDev
+#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev
+
+// Per-type kernels, defined further down in this header.
+inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
+                            double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
+inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
+                            double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
+inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
+                            double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
+
+// HAL entry point for meanStdDev: forwards to the kernel matching src_type.
+// CV_8UC1, CV_8UC4 and CV_32FC1 are handled; any other type returns
+// CV_HAL_ERROR_NOT_IMPLEMENTED.
+inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
+                             int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
+    if (src_type == CV_8UC1)
+    {
+        return meanStdDev_8UC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
+    }
+    if (src_type == CV_8UC4)
+    {
+        return meanStdDev_8UC4(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
+    }
+    if (src_type == CV_32FC1)
+    {
+        return meanStdDev_32FC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
+// Mean and standard deviation of a single-channel 8-bit image.
+//
+// src_data/src_step : source pixels and byte stride
+// width/height      : image size in pixels
+// mean_val          : optional output, receives the mean
+// stddev_val        : optional output, receives sqrt(max(E[x^2] - mean^2, 0))
+// mask/mask_step    : optional 8-bit mask and its byte stride; a pixel is included
+//                     when its mask byte is non-zero
+//
+// If no pixel is selected both outputs are set to 0. Always returns CV_HAL_ERROR_OK.
+inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
+                             double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
+    int nz = 0;
+    // Per-lane 64-bit accumulators for the sum and the sum of squares.
+    int vlmax = __riscv_vsetvlmax_e64m8();
+    vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
+    vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
+    if (mask) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src_data + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int j = 0, vl;
+            for ( ; j < width; j += vl) {
+                vl = __riscv_vsetvl_e8m1(width - j);
+                auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
+                auto vmask_u8 = __riscv_vle8_v_u8m1(mask_row+j, vl);
+                auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
+                // Any non-zero mask byte selects the pixel. Comparing against 1 would
+                // silently drop every pixel of a 0/255 mask.
+                auto vmask = __riscv_vmsne_vx_u8m1_b8(vmask_u8, 0, vl);
+                vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
+                vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
+                nz += __riscv_vcpop_m_b8(vmask, vl);
+            }
+        }
+    } else {
+        for (int i = 0; i < height; i++) {
+            const uchar* src_row = src_data + i * src_step;
+            int j = 0, vl;
+            for ( ; j < width; j += vl) {
+                vl = __riscv_vsetvl_e8m1(width - j);
+                auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
+                auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
+                // Tail-undisturbed so lanes beyond vl keep their running totals.
+                vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
+                vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
+            }
+        }
+        nz = height * width;
+    }
+    if (nz == 0) {
+        if (mean_val) *mean_val = 0.0;
+        if (stddev_val) *stddev_val = 0.0;
+        return CV_HAL_ERROR_OK;
+    }
+    // Horizontal reduction of the per-lane accumulators.
+    auto zero = __riscv_vmv_s_x_u64m1(0, vlmax);
+    auto vec_red = __riscv_vmv_v_x_u64m1(0, vlmax);
+    auto vec_reddev = __riscv_vmv_v_x_u64m1(0, vlmax);
+    vec_red = __riscv_vredsum(vec_sum, zero, vlmax);
+    vec_reddev = __riscv_vredsum(vec_sqsum, zero, vlmax);
+    double sum = __riscv_vmv_x(vec_red);
+    double mean = sum / nz;
+    if (mean_val) {
+        *mean_val = mean;
+    }
+    if (stddev_val) {
+        double sqsum = __riscv_vmv_x(vec_reddev);
+        // Clamped at 0 to absorb rounding error before the square root.
+        double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
+        double stddev = std::sqrt(variance);
+        *stddev_val = stddev;
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// Per-channel mean and standard deviation of a 4-channel 8-bit image.
+//
+// src_data/src_step : interleaved source pixels and byte stride
+// width/height      : image size in pixels
+// mean_val          : optional output, 4 values
+// stddev_val        : optional output, 4 values
+// mask/mask_step    : optional 8-bit mask (one byte per pixel) and its byte stride;
+//                     a pixel is included when its mask byte is non-zero
+//
+// Returns CV_HAL_ERROR_NOT_IMPLEMENTED when the hardware vector length exceeds the
+// spill buffers used for the final reduction, CV_HAL_ERROR_OK otherwise. If no pixel
+// is selected both outputs get a single 0 written to their first element.
+inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
+                             double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
+    // Capacity of the stack buffers the accumulators are spilled into below.
+    constexpr int kMaxLanes = 256;
+
+    int nz = 0;
+    int vlmax = __riscv_vsetvlmax_e64m8();
+    // Refuse vector lengths that would overflow the spill buffers.
+    if (vlmax > kMaxLanes)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    // Per-lane 64-bit accumulators; lane i belongs to channel i % 4.
+    vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
+    vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
+    if (mask) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src_data + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int j = 0, jm = 0, vl, vlm;
+            for ( ; j < width*4; j += vl, jm += vlm) {
+                vl = __riscv_vsetvl_e8m1(width*4 - j);
+                vlm = __riscv_vsetvl_e8mf4(width - jm);
+                auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
+                auto vmask_u8mf4 = __riscv_vle8_v_u8mf4(mask_row + jm, vlm);
+                auto vmask_u32 = __riscv_vzext_vf4(vmask_u8mf4, vlm);
+                // Replicate each mask byte m into the four channel bytes: m -> mmmm
+                vmask_u32 = __riscv_vmul(vmask_u32, 0b00000001000000010000000100000001, vlm);
+                auto vmask_u8 = __riscv_vreinterpret_u8m1(vmask_u32);
+                auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
+                // Any non-zero mask byte selects the pixel. Comparing against 1 would
+                // silently drop every pixel of a 0/255 mask.
+                auto vmask = __riscv_vmsne_vx_u8m1_b8(vmask_u8, 0, vl);
+                vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
+                vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
+                nz += __riscv_vcpop_m_b8(vmask, vl);
+            }
+        }
+        // Four selected bytes were counted per selected pixel.
+        nz /= 4;
+    } else {
+        for (int i = 0; i < height; i++) {
+            const uchar* src_row = src_data + i * src_step;
+            int j = 0, vl;
+            for ( ; j < width*4; j += vl) {
+                vl = __riscv_vsetvl_e8m1(width*4 - j);
+                auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
+                auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
+                // Tail-undisturbed so lanes beyond vl keep their running totals.
+                vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
+                vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
+            }
+        }
+        nz = height * width;
+    }
+    if (nz == 0) {
+        if (mean_val) *mean_val = 0.0;
+        if (stddev_val) *stddev_val = 0.0;
+        return CV_HAL_ERROR_OK;
+    }
+    // Spill the accumulators and fold the lanes into their channel.
+    uint64_t s[kMaxLanes], sq[kMaxLanes], sum[4] = {0}, sqsum[4] = {0};
+    __riscv_vse64(s, vec_sum, vlmax);
+    __riscv_vse64(sq, vec_sqsum, vlmax);
+    for (int i = 0; i < vlmax; ++i)
+    {
+        sum[i % 4] += s[i];
+        sqsum[i % 4] += sq[i];
+    }
+    // Means are kept locally so stddev can be computed even when mean_val is null.
+    double mean[4];
+    for (int c = 0; c < 4; ++c)
+    {
+        mean[c] = (double)sum[c] / nz;
+    }
+    if (mean_val) {
+        for (int c = 0; c < 4; ++c)
+        {
+            mean_val[c] = mean[c];
+        }
+    }
+    if (stddev_val) {
+        for (int c = 0; c < 4; ++c)
+        {
+            // Clamped at 0 to absorb rounding error before the square root.
+            stddev_val[c] = std::sqrt(std::max(((double)sqsum[c] / nz) - (mean[c] * mean[c]), 0.0));
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// Computes the mean and standard deviation of a single-channel 32-bit float image.
+//
+//   src_data, src_step  first source byte and the distance in bytes between consecutive rows
+//   width, height       image size in elements
+//   mean_val            receives the mean; may be null
+//   stddev_val          receives sqrt(max(E[x^2] - mean^2, 0)); may be null
+//   mask, mask_step     optional 8-bit mask and its row stride in bytes; a pixel is accumulated
+//                       when its mask byte is non-zero. A null mask selects every pixel.
+//
+// Sums are accumulated in double precision. When no pixel is selected both outputs are set to 0.
+// Always returns CV_HAL_ERROR_OK.
+inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
+                             double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
+    int nz = 0;
+    int vlmax = __riscv_vsetvlmax_e64m4();
+    // Per-lane partial sums; the tail-undisturbed (_tu) updates below keep lanes beyond vl intact.
+    vfloat64m4_t vec_sum = __riscv_vfmv_v_f_f64m4(0, vlmax);
+    vfloat64m4_t vec_sqsum = __riscv_vfmv_v_f_f64m4(0, vlmax);
+    if (mask) {
+        for (int i = 0; i < height; ++i) {
+            const float* src_row = reinterpret_cast<const float*>(src_data + i * src_step);
+            const uchar* mask_row = mask + i * mask_step;
+            int j = 0, vl;
+            for ( ; j < width; j += vl) {
+                vl = __riscv_vsetvl_e32m2(width - j);
+                auto vec_pixel = __riscv_vle32_v_f32m2(src_row + j, vl);
+                auto vmask_u8 = __riscv_vle8_v_u8mf2(mask_row + j, vl);
+                // Any non-zero mask byte selects the pixel (masks commonly hold 255, not 1).
+                auto vmask = __riscv_vmsne_vx_u8mf2_b16(vmask_u8, 0, vl);
+                vec_sum = __riscv_vfwadd_wv_f64m4_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
+                vec_sqsum = __riscv_vfwmacc_vv_f64m4_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
+                nz += __riscv_vcpop_m_b16(vmask, vl);
+            }
+        }
+    } else {
+        for (int i = 0; i < height; i++) {
+            const float* src_row = reinterpret_cast<const float*>(src_data + i * src_step);
+            int j = 0, vl;
+            for ( ; j < width; j += vl) {
+                vl = __riscv_vsetvl_e32m2(width - j);
+                auto vec_pixel = __riscv_vle32_v_f32m2(src_row + j, vl);
+                vec_sum = __riscv_vfwadd_wv_f64m4_tu(vec_sum, vec_sum, vec_pixel, vl);
+                vec_sqsum = __riscv_vfwmacc_vv_f64m4_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
+            }
+        }
+        nz = height * width;
+    }
+    if (nz == 0) {
+        if (mean_val) *mean_val = 0.0;
+        if (stddev_val) *stddev_val = 0.0;
+        return CV_HAL_ERROR_OK;
+    }
+    // Collapse the per-lane partial sums into scalars.
+    auto zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
+    auto vec_red = __riscv_vfredusum(vec_sum, zero, vlmax);
+    auto vec_reddev = __riscv_vfredusum(vec_sqsum, zero, vlmax);
+    double sum = __riscv_vfmv_f(vec_red);
+    double mean = sum / nz;
+    if (mean_val) {
+        *mean_val = mean;
+    }
+    if (stddev_val) {
+        double sqsum = __riscv_vfmv_f(vec_reddev);
+        // Clamp at zero: rounding can make E[x^2] - mean^2 slightly negative.
+        double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
+        double stddev = std::sqrt(variance);
+        *stddev_val = stddev;
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
@ -0,0 +1,397 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
+#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+// Rebind the cv_hal_merge* names to the RVV implementations defined below in this header
+// (presumably these are the hook macros OpenCV core expands when calling the HAL - confirm
+// against the core HAL replacement header).
+#undef cv_hal_merge8u
+#define cv_hal_merge8u cv::cv_hal_rvv::merge8u
+#undef cv_hal_merge16u
+#define cv_hal_merge16u cv::cv_hal_rvv::merge16u
+#undef cv_hal_merge32s
+#define cv_hal_merge32s cv::cv_hal_rvv::merge32s
+#undef cv_hal_merge64s
+#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
+
+// Interleaves `cn` separate 8-bit planes into one packed buffer:
+//     dst[i*cn + c] = src[c][i]   for every i in [0, len) and c in [0, cn).
+// Always returns CV_HAL_ERROR_OK.
+//
+// Channels are processed in groups: the first group holds cn % 4 channels (4 when cn is a
+// multiple of 4) and every following group holds 4. Within a group each chunk is loaded
+// contiguously from its plane and written with a strided store (stride = cn elements).
+#if defined(__GNUC__) && !defined(__clang__)
+// GCC-specific attribute; clang also defines __GNUC__ but does not implement `optimize`.
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
+    const size_t stride = sizeof(uchar) * cn;
+    int group = cn % 4 ? cn % 4 : 4;
+    for( int k = 0; k < cn; k += group, group = 4 )
+    {
+        int vl;
+        for( int i = 0; i < len; i += vl )
+        {
+            // vsetvl returns a shortened vl for the final chunk, so no scalar tail loop is needed.
+            vl = __riscv_vsetvl_e8m1(len - i);
+            // Offset computed in size_t so that i*cn cannot overflow int for large buffers.
+            uchar* out = dst + k + (size_t)i * cn;
+            for( int c = 0; c < group; c++ )
+            {
+                auto v = __riscv_vle8_v_u8m1(src[k + c] + i, vl);
+                __riscv_vsse8_v_u8m1(out + c, stride, v, vl);
+            }
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// Interleaves `cn` separate 16-bit planes into one packed buffer:
+//     dst[i*cn + c] = src[c][i]   for every i in [0, len) and c in [0, cn).
+// Always returns CV_HAL_ERROR_OK.
+//
+// Channels are processed in groups: the first group holds cn % 4 channels (4 when cn is a
+// multiple of 4) and every following group holds 4. Within a group each chunk is loaded
+// contiguously from its plane and written with a strided store (stride = cn elements).
+#if defined(__GNUC__) && !defined(__clang__)
+// GCC-specific attribute; clang also defines __GNUC__ but does not implement `optimize`.
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
+    // The strided store takes its stride in bytes.
+    const size_t stride = sizeof(ushort) * cn;
+    int group = cn % 4 ? cn % 4 : 4;
+    for( int k = 0; k < cn; k += group, group = 4 )
+    {
+        int vl;
+        for( int i = 0; i < len; i += vl )
+        {
+            // vsetvl returns a shortened vl for the final chunk, so no scalar tail loop is needed.
+            vl = __riscv_vsetvl_e16m1(len - i);
+            // Offset computed in size_t so that i*cn cannot overflow int for large buffers.
+            ushort* out = dst + k + (size_t)i * cn;
+            for( int c = 0; c < group; c++ )
+            {
+                auto v = __riscv_vle16_v_u16m1(src[k + c] + i, vl);
+                __riscv_vsse16_v_u16m1(out + c, stride, v, vl);
+            }
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// Scalar interleave of `cn` 32-bit integer planes: dst[i*cn + c] = src[c][i] for i in [0, len).
+// The leading cn % 4 channels (4 when cn is a multiple of 4) are written first; the remaining
+// channels follow four at a time. Auto-vectorization of the copy loops is switched off through
+// the function attribute (GCC) and the loop pragmas (clang). Always returns CV_HAL_ERROR_OK.
+#if defined __GNUC__
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline int merge32s(const int** src, int* dst, int len, int cn ) {
+    const int rem = cn % 4;
+    int ch = rem ? rem : 4;
+    int x, ofs;
+    switch( ch )
+    {
+    case 1:
+    {
+        const int* p0 = src[0];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( x = ofs = 0; x < len; x++, ofs += cn )
+        {
+            int* px = dst + ofs;
+            px[0] = p0[x];
+        }
+        break;
+    }
+    case 2:
+    {
+        const int* p0 = src[0];
+        const int* p1 = src[1];
+        x = ofs = 0;
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; x < len; x++, ofs += cn )
+        {
+            int* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+        }
+        break;
+    }
+    case 3:
+    {
+        const int* p0 = src[0];
+        const int* p1 = src[1];
+        const int* p2 = src[2];
+        x = ofs = 0;
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; x < len; x++, ofs += cn )
+        {
+            int* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+            px[2] = p2[x];
+        }
+        break;
+    }
+    default:
+    {
+        const int* p0 = src[0];
+        const int* p1 = src[1];
+        const int* p2 = src[2];
+        const int* p3 = src[3];
+        x = ofs = 0;
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; x < len; x++, ofs += cn )
+        {
+            int* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+            px[2] = p2[x];
+            px[3] = p3[x];
+        }
+        break;
+    }
+    }
+    // Remaining channels, four planes per sweep over the data.
+    #if defined(__clang__)
+    #pragma clang loop vectorize(disable)
+    #endif
+    for( ; ch < cn; ch += 4 )
+    {
+        const int* p0 = src[ch];
+        const int* p1 = src[ch+1];
+        const int* p2 = src[ch+2];
+        const int* p3 = src[ch+3];
+        for( x = 0, ofs = ch; x < len; x++, ofs += cn )
+        {
+            int* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+            px[2] = p2[x];
+            px[3] = p3[x];
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// Scalar interleave of `cn` 64-bit integer planes: dst[i*cn + c] = src[c][i] for i in [0, len).
+// The leading cn % 4 channels (4 when cn is a multiple of 4) are written first; the remaining
+// channels follow four at a time. Auto-vectorization of the copy loops is switched off through
+// the function attribute (GCC) and the loop pragmas (clang). Always returns CV_HAL_ERROR_OK.
+#if defined __GNUC__
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
+    const int rem = cn % 4;
+    int ch = rem ? rem : 4;
+    int x, ofs;
+    switch( ch )
+    {
+    case 1:
+    {
+        const int64* p0 = src[0];
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( x = ofs = 0; x < len; x++, ofs += cn )
+        {
+            int64* px = dst + ofs;
+            px[0] = p0[x];
+        }
+        break;
+    }
+    case 2:
+    {
+        const int64* p0 = src[0];
+        const int64* p1 = src[1];
+        x = ofs = 0;
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; x < len; x++, ofs += cn )
+        {
+            int64* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+        }
+        break;
+    }
+    case 3:
+    {
+        const int64* p0 = src[0];
+        const int64* p1 = src[1];
+        const int64* p2 = src[2];
+        x = ofs = 0;
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; x < len; x++, ofs += cn )
+        {
+            int64* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+            px[2] = p2[x];
+        }
+        break;
+    }
+    default:
+    {
+        const int64* p0 = src[0];
+        const int64* p1 = src[1];
+        const int64* p2 = src[2];
+        const int64* p3 = src[3];
+        x = ofs = 0;
+        #if defined(__clang__)
+        #pragma clang loop vectorize(disable)
+        #endif
+        for( ; x < len; x++, ofs += cn )
+        {
+            int64* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+            px[2] = p2[x];
+            px[3] = p3[x];
+        }
+        break;
+    }
+    }
+    // Remaining channels, four planes per sweep over the data.
+    #if defined(__clang__)
+    #pragma clang loop vectorize(disable)
+    #endif
+    for( ; ch < cn; ch += 4 )
+    {
+        const int64* p0 = src[ch];
+        const int64* p1 = src[ch+1];
+        const int64* p2 = src[ch+2];
+        const int64* p3 = src[ch+3];
+        for( x = 0, ofs = ch; x < len; x++, ofs += cn )
+        {
+            int64* px = dst + ofs;
+            px[0] = p0[x];
+            px[1] = p1[x];
+            px[2] = p2[x];
+            px[3] = p3[x];
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/minmax.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/minmax.hpp
@ -0,0 +1,335 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_MINMAXIDX_HPP_INCLUDED
+#define OPENCV_HAL_RVV_MINMAXIDX_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+// Both names are bound to the same function: minMaxIdx() below declares mask_step with a
+// default of 0, so it can be called with or without an explicit mask stride.
+#undef cv_hal_minMaxIdx
+#define cv_hal_minMaxIdx cv::cv_hal_rvv::minMaxIdx
+#undef cv_hal_minMaxIdxMaskStep
+#define cv_hal_minMaxIdxMaskStep cv::cv_hal_rvv::minMaxIdx
+
+// Per-element-type traits that wrap the RVV intrinsics used by the min/max kernels below,
+// so those kernels can be written once as templates over T.
+namespace
+{
+    // Primary template is only declared; each supported T gets an explicit specialization
+    // generated by the HAL_RVV_GENERATOR macros below.
+    template<typename T> struct rvv;
+
+    // Generator for the 8- and 16-bit integer types (used by minMaxIdxReadTwice).
+    //   T       element type
+    //   EEW     element width in bits
+    //   TYPE    intrinsic type suffix (u8, i16, ...)
+    //   IS_U    `u` for unsigned types, empty for signed (selects vminu/vmin etc.)
+    //   EMUL    register grouping of the data vector
+    //   M_EMUL  register grouping of the 8-bit mask vector loaded by vle_mask
+    //   B_LEN   suffix of the vbool type produced by comparisons on these vectors
+    #define HAL_RVV_GENERATOR(T, EEW, TYPE, IS_U, EMUL, M_EMUL, B_LEN) \
+    template<> struct rvv<T> \
+    { \
+        using vec_t = v##IS_U##int##EEW##EMUL##_t; \
+        using bool_t = vbool##B_LEN##_t; \
+        static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e##EEW##EMUL(); } \
+        static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e##EEW##EMUL(a); } \
+        static inline vec_t vmv_v_x(T a, size_t b) { return __riscv_vmv_v_x_##TYPE##EMUL(a, b); } \
+        static inline vec_t vle(const T* a, size_t b) { return __riscv_vle##EEW##_v_##TYPE##EMUL(a, b); } \
+        static inline vuint8##M_EMUL##_t vle_mask(const uchar* a, size_t b) { return __riscv_vle8_v_u8##M_EMUL(a, b); } \
+        static inline vec_t vmin_tu(vec_t a, vec_t b, vec_t c, size_t d) { return __riscv_vmin##IS_U##_tu(a, b, c, d); } \
+        static inline vec_t vmax_tu(vec_t a, vec_t b, vec_t c, size_t d) { return __riscv_vmax##IS_U##_tu(a, b, c, d); } \
+        static inline vec_t vmin_tumu(bool_t a, vec_t b, vec_t c, vec_t d, size_t e) { return __riscv_vmin##IS_U##_tumu(a, b, c, d, e); } \
+        static inline vec_t vmax_tumu(bool_t a, vec_t b, vec_t c, vec_t d, size_t e) { return __riscv_vmax##IS_U##_tumu(a, b, c, d, e); } \
+        static inline vec_t vredmin(vec_t a, vec_t b, size_t c) { return __riscv_vredmin##IS_U(a, b, c); } \
+        static inline vec_t vredmax(vec_t a, vec_t b, size_t c) { return __riscv_vredmax##IS_U(a, b, c); } \
+    };
+    // Data vectors use m1; the mask vector uses m1 for 8-bit data and mf2 for 16-bit data.
+    HAL_RVV_GENERATOR(uchar , 8 , u8 , u, m1, m1 , 8 )
+    HAL_RVV_GENERATOR(schar , 8 , i8 ,  , m1, m1 , 8 )
+    HAL_RVV_GENERATOR(ushort, 16, u16, u, m1, mf2, 16)
+    HAL_RVV_GENERATOR(short , 16, i16,  , m1, mf2, 16)
+    #undef HAL_RVV_GENERATOR
+
+    // Generator for int, float and double (used by minMaxIdxReadOnce). Besides loads and
+    // broadcasts it exposes compare-to-mask operations and a 32-bit position vector.
+    //   NAME    `int` or `float`, used to spell the vector type
+    //   IS_F    `f` for floating-point move intrinsics, empty for integer
+    //   F_OR_S  `f` or `s`, selects vmflt/vmfgt versus vmslt/vmsgt
+    //   F_OR_X  `f` or `x`, selects the scalar operand kind of the move intrinsics
+    //   P_EMUL  register grouping of the uint32 position vector (vid / vundefined)
+    //   remaining parameters as in the generator above
+    #define HAL_RVV_GENERATOR(T, NAME, EEW, TYPE, IS_F, F_OR_S, F_OR_X, EMUL, M_EMUL, P_EMUL, B_LEN) \
+    template<> struct rvv<T> \
+    { \
+        using vec_t = v##NAME##EEW##EMUL##_t; \
+        using bool_t = vbool##B_LEN##_t; \
+        static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e##EEW##EMUL(); } \
+        static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e##EEW##EMUL(a); } \
+        static inline vec_t vmv_v_x(T a, size_t b) { return __riscv_v##IS_F##mv_v_##F_OR_X##_##TYPE##EMUL(a, b); } \
+        static inline vuint32##P_EMUL##_t vid(size_t a) { return __riscv_vid_v_u32##P_EMUL(a); } \
+        static inline vuint32##P_EMUL##_t vundefined() { return __riscv_vundefined_u32##P_EMUL(); } \
+        static inline vec_t vle(const T* a, size_t b) { return __riscv_vle##EEW##_v_##TYPE##EMUL(a, b); } \
+        static inline vuint8##M_EMUL##_t vle_mask(const uchar* a, size_t b) { return __riscv_vle8_v_u8##M_EMUL(a, b); } \
+        static inline bool_t vmlt(vec_t a, vec_t b, size_t c) { return __riscv_vm##F_OR_S##lt(a, b, c); } \
+        static inline bool_t vmgt(vec_t a, vec_t b, size_t c) { return __riscv_vm##F_OR_S##gt(a, b, c); } \
+        static inline bool_t vmlt_mu(bool_t a, bool_t b, vec_t c, vec_t d, size_t e) { return __riscv_vm##F_OR_S##lt##_mu(a, b, c, d, e); } \
+        static inline bool_t vmgt_mu(bool_t a, bool_t b, vec_t c, vec_t d, size_t e) { return __riscv_vm##F_OR_S##gt##_mu(a, b, c, d, e); } \
+        static inline T vmv_x_s(vec_t a) { return __riscv_v##IS_F##mv_##F_OR_X(a); } \
+    };
+    // All three use m4 data vectors; the position vector is m4 for 32-bit data and m2 for double.
+    HAL_RVV_GENERATOR(int   , int  , 32, i32,  , s, x, m4, m1 , m4, 8 )
+    HAL_RVV_GENERATOR(float , float, 32, f32, f, f, f, m4, m1 , m4, 8 )
+    HAL_RVV_GENERATOR(double, float, 64, f64, f, f, f, m4, mf2, m2, 16)
+    #undef HAL_RVV_GENERATOR
+}
+
+// Two-pass min/max search for the 8- and 16-bit element types.
+// Pass 1 reduces the image to its extreme values. Pass 2 rescans from the first row and
+// records the first position holding each extreme, stopping once every requested index is found.
+//
+//   src_data, src_step  source bytes and row stride in bytes
+//   width, height       size in elements of T
+//   minVal, maxVal      optional outputs for the extreme values
+//   minIdx, maxIdx      optional outputs, written as {row, column}
+//   mask, mask_step     optional 8-bit mask (non-zero selects the element) and its row stride
+//
+// If pass 2 finds no selected element equal to an extreme (for instance when the mask selects
+// nothing), the corresponding index output is left unmodified. Always returns CV_HAL_ERROR_OK.
+template<typename T>
+inline int minMaxIdxReadTwice(const uchar* src_data, size_t src_step, int width, int height, double* minVal, double* maxVal,
+                              int* minIdx, int* maxIdx, uchar* mask, size_t mask_step)
+{
+    const int vlmax = rvv<T>::vsetvlmax();
+    const T worst_min = std::numeric_limits<T>::max();
+    const T worst_max = std::numeric_limits<T>::lowest();
+    auto acc_min = rvv<T>::vmv_v_x(worst_min, vlmax);
+    auto acc_max = rvv<T>::vmv_v_x(worst_max, vlmax);
+
+    // Pass 1: per-lane running minimum / maximum over all selected elements.
+    for (int y = 0; y < height; y++)
+    {
+        const T* line = reinterpret_cast<const T*>(src_data + y * src_step);
+        int vl;
+        if (mask)
+        {
+            const uchar* mask_line = mask + y * mask_step;
+            for (int x = 0; x < width; x += vl)
+            {
+                vl = rvv<T>::vsetvl(width - x);
+                auto pix = rvv<T>::vle(line + x, vl);
+                auto sel = __riscv_vmsne(rvv<T>::vle_mask(mask_line + x, vl), 0, vl);
+                acc_min = rvv<T>::vmin_tumu(sel, acc_min, acc_min, pix, vl);
+                acc_max = rvv<T>::vmax_tumu(sel, acc_max, acc_max, pix, vl);
+            }
+        }
+        else
+        {
+            for (int x = 0; x < width; x += vl)
+            {
+                vl = rvv<T>::vsetvl(width - x);
+                auto pix = rvv<T>::vle(line + x, vl);
+                acc_min = rvv<T>::vmin_tu(acc_min, acc_min, pix, vl);
+                acc_max = rvv<T>::vmax_tu(acc_max, acc_max, pix, vl);
+            }
+        }
+    }
+
+    // Fold the lanes into the scalar extremes.
+    auto red_min = rvv<T>::vredmin(acc_min, rvv<T>::vmv_v_x(worst_min, vlmax), vlmax);
+    auto red_max = rvv<T>::vredmax(acc_max, rvv<T>::vmv_v_x(worst_max, vlmax), vlmax);
+    const T val_min = __riscv_vmv_x(red_min);
+    const T val_max = __riscv_vmv_x(red_max);
+
+    // Pass 2: locate the first occurrence of each extreme, only for the indices requested.
+    bool pending_min = (minIdx != nullptr);
+    bool pending_max = (maxIdx != nullptr);
+    for (int y = 0; y < height && (pending_min || pending_max); y++)
+    {
+        const T* line = reinterpret_cast<const T*>(src_data + y * src_step);
+        int vl;
+        if (mask)
+        {
+            const uchar* mask_line = mask + y * mask_step;
+            for (int x = 0; x < width && (pending_min || pending_max); x += vl)
+            {
+                vl = rvv<T>::vsetvl(width - x);
+                auto pix = rvv<T>::vle(line + x, vl);
+                auto sel = __riscv_vmsne(rvv<T>::vle_mask(mask_line + x, vl), 0, vl);
+                // All-false mask used as the result for elements the mask excludes.
+                auto none = __riscv_vmxor(sel, sel, vl);
+                if (pending_min)
+                {
+                    int hit = __riscv_vfirst(__riscv_vmseq_mu(sel, none, pix, val_min, vl), vl);
+                    if (hit != -1)
+                    {
+                        pending_min = false;
+                        minIdx[0] = y;
+                        minIdx[1] = x + hit;
+                    }
+                }
+                if (pending_max)
+                {
+                    int hit = __riscv_vfirst(__riscv_vmseq_mu(sel, none, pix, val_max, vl), vl);
+                    if (hit != -1)
+                    {
+                        pending_max = false;
+                        maxIdx[0] = y;
+                        maxIdx[1] = x + hit;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (int x = 0; x < width && (pending_min || pending_max); x += vl)
+            {
+                vl = rvv<T>::vsetvl(width - x);
+                auto pix = rvv<T>::vle(line + x, vl);
+                if (pending_min)
+                {
+                    int hit = __riscv_vfirst(__riscv_vmseq(pix, val_min, vl), vl);
+                    if (hit != -1)
+                    {
+                        pending_min = false;
+                        minIdx[0] = y;
+                        minIdx[1] = x + hit;
+                    }
+                }
+                if (pending_max)
+                {
+                    int hit = __riscv_vfirst(__riscv_vmseq(pix, val_max, vl), vl);
+                    if (hit != -1)
+                    {
+                        pending_max = false;
+                        maxIdx[0] = y;
+                        maxIdx[1] = x + hit;
+                    }
+                }
+            }
+        }
+    }
+
+    if (minVal)
+        *minVal = val_min;
+    if (maxVal)
+        *maxVal = val_max;
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Single-pass min/max search for int, float and double.
+// Every vector lane keeps its own running minimum/maximum together with the linear position
+// (row * width + column) at which that value was first seen; the lanes are merged at the end.
+//
+//   src_data, src_step  source bytes and row stride in bytes
+//   width, height       size in elements of T
+//   minVal, maxVal      optional outputs for the extreme values
+//   minIdx, maxIdx      optional outputs, written as {row, column}
+//   mask, mask_step     optional 8-bit mask (non-zero selects the element) and its row stride
+//
+// When several elements share the extreme value, the one with the lowest linear position is
+// reported. If no selected element is strictly below numeric_limits<T>::max() (respectively
+// strictly above lowest()), the matching index output is left unmodified and the value output
+// receives that limit. NOTE(review): confirm this is what callers expect for a mask that
+// selects nothing. Always returns CV_HAL_ERROR_OK.
+template<typename T>
+inline int minMaxIdxReadOnce(const uchar* src_data, size_t src_step, int width, int height, double* minVal, double* maxVal,
+                             int* minIdx, int* maxIdx, uchar* mask, size_t mask_step)
+{
+    int vlmax = rvv<T>::vsetvlmax();
+    auto vec_min = rvv<T>::vmv_v_x(std::numeric_limits<T>::max(), vlmax);
+    auto vec_max = rvv<T>::vmv_v_x(std::numeric_limits<T>::lowest(), vlmax);
+    // vec_pos holds the linear position of the element each lane will see next.
+    auto vec_pos = rvv<T>::vid(vlmax);
+    // Position lanes stay undefined until the matching value lane is updated for the first time.
+    auto vec_minpos = rvv<T>::vundefined(), vec_maxpos = rvv<T>::vundefined();
+    T val_min, val_max;
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const T* src_row = reinterpret_cast<const T*>(src_data + i * src_step);
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = rvv<T>::vsetvl(width - j);
+                auto vec_src = rvv<T>::vle(src_row + j, vl);
+                auto vec_mask = rvv<T>::vle_mask(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl);
+                // All-false mask used as the compare result for elements the mask excludes.
+                auto bool_zero = __riscv_vmxor(bool_mask, bool_mask, vl);
+
+                auto bool_minpos = rvv<T>::vmlt_mu(bool_mask, bool_zero, vec_src, vec_min, vl);
+                auto bool_maxpos = rvv<T>::vmgt_mu(bool_mask, bool_zero, vec_src, vec_max, vl);
+                vec_minpos = __riscv_vmerge_tu(vec_minpos, vec_minpos, vec_pos, bool_minpos, vl);
+                vec_maxpos = __riscv_vmerge_tu(vec_maxpos, vec_maxpos, vec_pos, bool_maxpos, vl);
+
+                vec_min = __riscv_vmerge_tu(vec_min, vec_min, vec_src, bool_minpos, vl);
+                vec_max = __riscv_vmerge_tu(vec_max, vec_max, vec_src, bool_maxpos, vl);
+                // Advance every lane by the number of elements just consumed.
+                vec_pos = __riscv_vadd(vec_pos, vl, vlmax);
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const T* src_row = reinterpret_cast<const T*>(src_data + i * src_step);
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = rvv<T>::vsetvl(width - j);
+                auto vec_src = rvv<T>::vle(src_row + j, vl);
+
+                auto bool_minpos = rvv<T>::vmlt(vec_src, vec_min, vl);
+                auto bool_maxpos = rvv<T>::vmgt(vec_src, vec_max, vl);
+                vec_minpos = __riscv_vmerge_tu(vec_minpos, vec_minpos, vec_pos, bool_minpos, vl);
+                vec_maxpos = __riscv_vmerge_tu(vec_maxpos, vec_maxpos, vec_pos, bool_maxpos, vl);
+
+                vec_min = __riscv_vmerge_tu(vec_min, vec_min, vec_src, bool_minpos, vl);
+                vec_max = __riscv_vmerge_tu(vec_max, vec_max, vec_src, bool_maxpos, vl);
+                // Advance every lane by the number of elements just consumed.
+                vec_pos = __riscv_vadd(vec_pos, vl, vlmax);
+            }
+        }
+    }
+
+    // Merge the lanes one at a time: element 0 is inspected, then everything slides down by one.
+    val_min = std::numeric_limits<T>::max();
+    val_max = std::numeric_limits<T>::lowest();
+    bool has_min = false, has_max = false;
+    unsigned int pos_min = 0, pos_max = 0;
+    for (int i = 0; i < vlmax; i++)
+    {
+        const T lane_min = rvv<T>::vmv_x_s(vec_min);
+        const T lane_max = rvv<T>::vmv_x_s(vec_max);
+        // A lane wins with a strictly better value, or with an equal value seen at an earlier
+        // position. The tie test is reached only after some lane has already won, so the lane
+        // being compared was updated at least once and its position is defined.
+        if (lane_min < val_min ||
+            (has_min && lane_min == val_min && __riscv_vmv_x(vec_minpos) < pos_min))
+        {
+            val_min = lane_min;
+            pos_min = __riscv_vmv_x(vec_minpos);
+            has_min = true;
+        }
+        if (lane_max > val_max ||
+            (has_max && lane_max == val_max && __riscv_vmv_x(vec_maxpos) < pos_max))
+        {
+            val_max = lane_max;
+            pos_max = __riscv_vmv_x(vec_maxpos);
+            has_max = true;
+        }
+        vec_min = __riscv_vslidedown(vec_min, 1, vlmax);
+        vec_max = __riscv_vslidedown(vec_max, 1, vlmax);
+        vec_minpos = __riscv_vslidedown(vec_minpos, 1, vlmax);
+        vec_maxpos = __riscv_vslidedown(vec_maxpos, 1, vlmax);
+    }
+    if (minIdx && has_min)
+    {
+        minIdx[0] = pos_min / width;
+        minIdx[1] = pos_min % width;
+    }
+    if (maxIdx && has_max)
+    {
+        maxIdx[0] = pos_max / width;
+        maxIdx[1] = pos_max % width;
+    }
+    if (minVal)
+    {
+        *minVal = val_min;
+    }
+    if (maxVal)
+    {
+        *maxVal = val_max;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Dispatches a min/max search to the kernel matching `depth`.
+// 8- and 16-bit depths go to minMaxIdxReadTwice, 32-bit int, float and double go to
+// minMaxIdxReadOnce. A mask_step of 0 makes the mask use the source row stride.
+// Returns the kernel's result, or CV_HAL_ERROR_NOT_IMPLEMENTED for any other depth.
+inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal,
+                     int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0)
+{
+    const size_t mstep = mask_step ? mask_step : src_step;
+
+    if (depth == CV_8UC1)
+        return minMaxIdxReadTwice<uchar>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+    if (depth == CV_8SC1)
+        return minMaxIdxReadTwice<schar>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+    if (depth == CV_16UC1)
+        return minMaxIdxReadTwice<ushort>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+    if (depth == CV_16SC1)
+        return minMaxIdxReadTwice<short>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+    if (depth == CV_32SC1)
+        return minMaxIdxReadOnce<int>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+    if (depth == CV_32FC1)
+        return minMaxIdxReadOnce<float>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+    if (depth == CV_64FC1)
+        return minMaxIdxReadOnce<double>(src_data, src_step, width, height, minVal, maxVal, minIdx, maxIdx, mask, mstep);
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp
@ -0,0 +1,517 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_NORM_HPP_INCLUDED
+#define OPENCV_HAL_RVV_NORM_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_norm
+#define cv_hal_norm cv::cv_hal_rvv::norm
+/*
+ * Infinity norm of a single-channel 8-bit unsigned image: the largest element value.
+ * When `mask` is non-null, only pixels whose mask byte is non-zero take part.
+ * Row i is read from src + i * src_step (and mask + i * mask_step), `width` elements
+ * per row, `height` rows. The maximum is stored to *result (0 when nothing was
+ * selected) and CV_HAL_ERROR_OK is returned. `result` is dereferenced unconditionally.
+ */
+inline int normInf_8UC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m8();
+    auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); // per-lane running maxima, start at 0
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m8(width - j); // number of elements handled in this step
+                auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m8(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                vec_max = __riscv_vmaxu_tumu(bool_mask, vec_max, vec_max, vec_src, vl); // tumu: masked-off and tail lanes keep their old maximum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m8(width - j);
+                auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl);
+                vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); // tu: lanes past vl keep their old maximum
+            }
+        }
+    }
+    auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax);
+    sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); // fold all lanes into element 0
+    *result = __riscv_vmv_x(sc_max);
+
+    return CV_HAL_ERROR_OK;
+}
+
+/*
+ * L1 norm (sum of element values) of a single-channel 8-bit unsigned image.
+ * When `mask` is non-null, only pixels whose mask byte is non-zero are summed.
+ *
+ *   src, src_step    first image byte and distance in bytes between rows
+ *   mask, mask_step  optional 8-bit mask (null = no mask) and its row stride in bytes
+ *   width, height    image size in elements / rows
+ *   result           receives the sum; must be non-null
+ *
+ * Returns CV_HAL_ERROR_OK.
+ *
+ * Partial sums are kept in 32-bit lanes. They are moved into a double through a
+ * widening 64-bit reduction before any lane can wrap, so the total stays exact even
+ * when the pixel sum exceeds 2^32 - 1 (a plain 32-bit final reduction wraps there).
+ */
+inline int normL1_8UC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    double total = 0;
+    int cnt = 0;
+    // Adds the current lane sums to `total` (64-bit reduction, cannot wrap) and clears the lanes.
+    auto flush = [&]() {
+        auto sc_sum = __riscv_vmv_s_x_u64m1(0, vlmax);
+        sc_sum = __riscv_vwredsumu_vs_u32m8_u64m1(vec_sum, sc_sum, vlmax);
+        total += static_cast<double>(__riscv_vmv_x(sc_sum));
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+    // Call before accumulating `vl` more elements. Each element adds at most 255 to a
+    // single lane, so keeping fewer than 2^24 elements between flushes bounds every
+    // lane by 255 * (2^24 - 1) < 2^32.
+    auto reserve = [&](int vl) {
+        if ((cnt += vl) < (1 << 24))
+            return;
+        cnt = vl;
+        flush();
+    };
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reserve(vl);
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl);
+                auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask, vec_src, vl);
+                // tumu: masked-off and tail lanes keep their previous partial sum.
+                vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reserve(vl);
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_zext = __riscv_vzext_vf4(vec_src, vl);
+                // tu: lanes past vl keep their previous partial sum.
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    flush();
+    *result = total;
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Squared L2 norm (sum of squared element values) of a single-channel 8-bit unsigned
+ * image. When `mask` is non-null, only pixels whose mask byte is non-zero are summed.
+ * Row i is read from src + i * src_step (and mask + i * mask_step).
+ * *result is zeroed first and then receives the sum; CV_HAL_ERROR_OK is returned.
+ *
+ * Squares are accumulated in 32-bit lanes; the `reduce` helper periodically drains the
+ * lanes into the double *result so that the lanes do not overflow.
+ */
+inline int normL2Sqr_8UC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); // per-lane partial sums of squares
+    int cnt = 0; // elements accounted for since the last drain
+    auto reduce = [&](int vl) {
+        if ((cnt += vl) < (1 << 16)) // a square is <= 255*255 = 65025, so < 2^16 of them fit in a u32 lane
+            return;
+        cnt = vl; // the vl elements announced by this call are accumulated after the drain
+        for (int i = 0; i < vlmax; i++)
+        {
+            *result += __riscv_vmv_x(vec_sum); // add lane 0 ...
+            vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); // ... then shift the next lane into position 0
+        }
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+
+    *result = 0;
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reduce(vl); // drain first if adding vl more elements could overflow a lane
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask, vec_src, vec_src, vl); // x*x widened to 16 bit
+                auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask, vec_mul, vl);
+                vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl); // masked-off and tail lanes keep their sum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reduce(vl);
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl);
+                auto vec_zext = __riscv_vzext_vf2(vec_mul, vl);
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); // lanes past vl keep their sum
+            }
+        }
+    }
+    reduce(1 << 16); // the argument alone reaches the threshold, forcing the final drain
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Infinity norm of a 4-channel 8-bit unsigned image: the largest byte value over all
+ * channels. Each row holds width * 4 source bytes; the mask (if non-null) holds one
+ * byte per pixel and a non-zero byte selects all four channels of that pixel.
+ * The maximum is stored to *result (0 when nothing was selected) and CV_HAL_ERROR_OK
+ * is returned.
+ */
+inline int normInf_8UC4(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m8();
+    auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); // per-lane running maxima, start at 0
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl, vlm;
+            for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) // j walks source bytes, jm walks mask bytes (pixels)
+            {
+                vl = __riscv_vsetvl_e8m8(width * 4 - j);
+                vlm = __riscv_vsetvl_e8m2(width - jm); // NOTE(review): the code relies on vl == 4 * vlm - confirm for every vsetvl result
+                auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + jm, vlm);
+                auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); // clamp to 0/1, widen to u32, replicate into all 4 bytes
+                auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m8(vec_mask_ext), 0, vl); // one predicate bit per channel byte
+                vec_max = __riscv_vmaxu_tumu(bool_mask_ext, vec_max, vec_max, vec_src, vl); // masked-off and tail lanes keep their old maximum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            int vl;
+            for (int j = 0; j < width * 4; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m8(width * 4 - j);
+                auto vec_src = __riscv_vle8_v_u8m8(src_row + j, vl);
+                vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); // lanes past vl keep their old maximum
+            }
+        }
+    }
+    auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax);
+    sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); // fold all lanes into element 0
+    *result = __riscv_vmv_x(sc_max);
+
+    return CV_HAL_ERROR_OK;
+}
+
+/*
+ * L1 norm (sum of all channel bytes) of a 4-channel 8-bit unsigned image.
+ * Each row holds width * 4 source bytes. The optional mask holds one byte per pixel;
+ * a non-zero byte selects all four channels of that pixel.
+ *
+ *   src, src_step    first image byte and distance in bytes between rows
+ *   mask, mask_step  optional 8-bit mask (null = no mask) and its row stride in bytes
+ *   width, height    image size in pixels / rows
+ *   result           receives the sum; must be non-null
+ *
+ * Returns CV_HAL_ERROR_OK.
+ *
+ * Partial sums are kept in 32-bit lanes. They are moved into a double through a
+ * widening 64-bit reduction before any lane can wrap, so the total stays exact even
+ * when the byte sum exceeds 2^32 - 1 (a plain 32-bit final reduction wraps there).
+ */
+inline int normL1_8UC4(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    double total = 0;
+    int cnt = 0;
+    // Adds the current lane sums to `total` (64-bit reduction, cannot wrap) and clears the lanes.
+    auto flush = [&]() {
+        auto sc_sum = __riscv_vmv_s_x_u64m1(0, vlmax);
+        sc_sum = __riscv_vwredsumu_vs_u32m8_u64m1(vec_sum, sc_sum, vlmax);
+        total += static_cast<double>(__riscv_vmv_x(sc_sum));
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+    // Call before accumulating `vl` more bytes. Each byte adds at most 255 to a single
+    // lane, so keeping fewer than 2^24 bytes between flushes bounds every lane by
+    // 255 * (2^24 - 1) < 2^32.
+    auto reserve = [&](int vl) {
+        if ((cnt += vl) < (1 << 24))
+            return;
+        cnt = vl;
+        flush();
+    };
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl, vlm;
+            // j walks source bytes, jm walks mask bytes (one per pixel).
+            for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm)
+            {
+                vl = __riscv_vsetvl_e8m2(width * 4 - j);
+                vlm = __riscv_vsetvl_e8mf2(width - jm);
+                reserve(vl);
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm);
+                // Clamp each mask byte to 0/1, widen to 32 bit and replicate it into all
+                // four bytes so that every channel of a pixel gets the same predicate.
+                auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm);
+                auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl);
+                auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask_ext, vec_src, vl);
+                // tumu: masked-off and tail lanes keep their previous partial sum.
+                vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            int vl;
+            for (int j = 0; j < width * 4; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width * 4 - j);
+                reserve(vl);
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_zext = __riscv_vzext_vf4(vec_src, vl);
+                // tu: lanes past vl keep their previous partial sum.
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    flush();
+    *result = total;
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Squared L2 norm (sum of squared channel bytes) of a 4-channel 8-bit unsigned image.
+ * Each row holds width * 4 source bytes; the mask (if non-null) holds one byte per
+ * pixel and a non-zero byte selects all four channels of that pixel.
+ * *result is zeroed first and then receives the sum; CV_HAL_ERROR_OK is returned.
+ *
+ * Squares are accumulated in 32-bit lanes; the `reduce` helper periodically drains the
+ * lanes into the double *result so that the lanes do not overflow.
+ */
+inline int normL2Sqr_8UC4(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); // per-lane partial sums of squares
+    int cnt = 0; // bytes accounted for since the last drain
+    auto reduce = [&](int vl) {
+        if ((cnt += vl) < (1 << 16)) // a square is <= 255*255 = 65025, so < 2^16 of them fit in a u32 lane
+            return;
+        cnt = vl; // the vl bytes announced by this call are accumulated after the drain
+        for (int i = 0; i < vlmax; i++)
+        {
+            *result += __riscv_vmv_x(vec_sum); // add lane 0 ...
+            vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); // ... then shift the next lane into position 0
+        }
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+
+    *result = 0;
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl, vlm;
+            for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) // j walks source bytes, jm walks mask bytes (pixels)
+            {
+                vl = __riscv_vsetvl_e8m2(width * 4 - j);
+                vlm = __riscv_vsetvl_e8mf2(width - jm); // NOTE(review): the code relies on vl == 4 * vlm - confirm for every vsetvl result
+                reduce(vl); // drain first if adding vl more bytes could overflow a lane
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm);
+                auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); // clamp to 0/1, widen to u32, replicate into all 4 bytes
+                auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl); // one predicate bit per channel byte
+                auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask_ext, vec_src, vec_src, vl); // x*x widened to 16 bit
+                auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask_ext, vec_mul, vl);
+                vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl); // masked-off and tail lanes keep their sum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src_row = src + i * src_step;
+            int vl;
+            for (int j = 0; j < width * 4; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width * 4 - j);
+                reduce(vl);
+
+                auto vec_src = __riscv_vle8_v_u8m2(src_row + j, vl);
+                auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl);
+                auto vec_zext = __riscv_vzext_vf2(vec_mul, vl);
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); // lanes past vl keep their sum
+            }
+        }
+    }
+    reduce(1 << 16); // the argument alone reaches the threshold, forcing the final drain
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Infinity norm of a single-channel 32-bit float image: the largest absolute value.
+ * `src` is a byte pointer; row i is reinterpreted as float starting at
+ * src + i * src_step. When `mask` is non-null, only pixels whose mask byte is non-zero
+ * take part. The maximum is stored to *result (0 when nothing was selected) and
+ * CV_HAL_ERROR_OK is returned.
+ */
+inline int normInf_32FC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e32m8();
+    auto vec_max = __riscv_vfmv_v_f_f32m8(0, vlmax); // per-lane running maxima, start at 0
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e32m8(width - j);
+                auto vec_src = __riscv_vle32_v_f32m8(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); // one mask byte per float element
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                auto vec_abs = __riscv_vfabs_v_f32m8_m(bool_mask, vec_src, vl);
+                vec_max = __riscv_vfmax_tumu(bool_mask, vec_max, vec_max, vec_abs, vl); // masked-off and tail lanes keep their old maximum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e32m8(width - j);
+                auto vec_src = __riscv_vle32_v_f32m8(src_row + j, vl);
+                auto vec_abs = __riscv_vfabs(vec_src, vl);
+                vec_max = __riscv_vfmax_tu(vec_max, vec_max, vec_abs, vl); // lanes past vl keep their old maximum
+            }
+        }
+    }
+    auto sc_max = __riscv_vfmv_s_f_f32m1(0, vlmax);
+    sc_max = __riscv_vfredmax(vec_max, sc_max, vlmax); // fold all lanes into element 0
+    *result = __riscv_vfmv_f(sc_max);
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * L1 norm (sum of absolute values) of a single-channel 32-bit float image.
+ * `src` is a byte pointer; row i is reinterpreted as float starting at
+ * src + i * src_step. When `mask` is non-null, only pixels whose mask byte is non-zero
+ * are summed. Absolute values are widened to double before being accumulated.
+ * The sum is stored to *result and CV_HAL_ERROR_OK is returned.
+ */
+inline int normL1_32FC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e32m4();
+    auto vec_sum = __riscv_vfmv_v_f_f64m8(0, vlmax); // per-lane partial sums in double precision
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e32m4(width - j);
+                auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m1(mask_row + j, vl); // one mask byte per float element
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                auto vec_abs = __riscv_vfabs_v_f32m4_m(bool_mask, vec_src, vl);
+                auto vec_fext = __riscv_vfwcvt_f_f_v_f64m8_m(bool_mask, vec_abs, vl); // float -> double
+                vec_sum = __riscv_vfadd_tumu(bool_mask, vec_sum, vec_sum, vec_fext, vl); // masked-off and tail lanes keep their sum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e32m4(width - j);
+                auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl);
+                auto vec_abs = __riscv_vfabs(vec_src, vl);
+                auto vec_fext = __riscv_vfwcvt_f_f_v_f64m8(vec_abs, vl); // float -> double
+                vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, vec_fext, vl); // lanes past vl keep their sum
+            }
+        }
+    }
+    auto sc_sum = __riscv_vfmv_s_f_f64m1(0, vlmax);
+    sc_sum = __riscv_vfredosum(vec_sum, sc_sum, vlmax); // ordered sum of all lanes into element 0
+    *result = __riscv_vfmv_f(sc_sum);
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Squared L2 norm (sum of squares) of a single-channel 32-bit float image.
+ * `src` is a byte pointer; row i is reinterpreted as float starting at
+ * src + i * src_step. When `mask` is non-null, only pixels whose mask byte is non-zero
+ * are summed. Squares are produced by a widening multiply, i.e. as doubles.
+ * The sum is stored to *result and CV_HAL_ERROR_OK is returned.
+ */
+inline int normL2Sqr_32FC1(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e32m4();
+    auto vec_sum = __riscv_vfmv_v_f_f64m8(0, vlmax); // per-lane partial sums in double precision
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e32m4(width - j);
+                auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m1(mask_row + j, vl); // one mask byte per float element
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                auto vec_mul = __riscv_vfwmul_vv_f64m8_m(bool_mask, vec_src, vec_src, vl); // x*x as double
+                vec_sum = __riscv_vfadd_tumu(bool_mask, vec_sum, vec_sum, vec_mul, vl); // masked-off and tail lanes keep their sum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const float* src_row = reinterpret_cast<const float*>(src + i * src_step);
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e32m4(width - j);
+                auto vec_src = __riscv_vle32_v_f32m4(src_row + j, vl);
+                auto vec_mul = __riscv_vfwmul(vec_src, vec_src, vl); // x*x as double
+                vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, vec_mul, vl); // lanes past vl keep their sum
+            }
+        }
+    }
+    auto sc_sum = __riscv_vfmv_s_f_f64m1(0, vlmax);
+    sc_sum = __riscv_vfredosum(vec_sum, sc_sum, vlmax); // ordered sum of all lanes into element 0
+    *result = __riscv_vfmv_f(sc_sum);
+
+    return CV_HAL_ERROR_OK;
+}
+
+/*
+ * Entry point of the RVV norm HAL (installed as cv_hal_norm).
+ *
+ * Chooses a kernel from the element `type` (CV_8UC1, CV_8UC4 or CV_32FC1) and the
+ * `norm_type` (NORM_INF, NORM_L1, NORM_L2SQR or NORM_L2) and runs it on the image.
+ * NORM_L2 runs the squared-L2 kernel and then takes the square root of *result.
+ *
+ * Returns CV_HAL_ERROR_OK immediately when `result` is null, the kernel's status when
+ * a kernel ran, and CV_HAL_ERROR_NOT_IMPLEMENTED for any other type / norm_type.
+ */
+inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width,
+                int height, int type, int norm_type, double* result)
+{
+    if (result == nullptr)
+        return CV_HAL_ERROR_OK;
+
+    typedef int (*NormKernel)(const uchar*, size_t, const uchar*, size_t, int, int, double*);
+
+    // Step 1: the three kernels available for this element type.
+    NormKernel kernInf = nullptr;
+    NormKernel kernL1 = nullptr;
+    NormKernel kernL2Sqr = nullptr;
+    if (type == CV_8UC1)
+    {
+        kernInf = normInf_8UC1;
+        kernL1 = normL1_8UC1;
+        kernL2Sqr = normL2Sqr_8UC1;
+    }
+    else if (type == CV_8UC4)
+    {
+        kernInf = normInf_8UC4;
+        kernL1 = normL1_8UC4;
+        kernL2Sqr = normL2Sqr_8UC4;
+    }
+    else if (type == CV_32FC1)
+    {
+        kernInf = normInf_32FC1;
+        kernL1 = normL1_32FC1;
+        kernL2Sqr = normL2Sqr_32FC1;
+    }
+    else
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // Step 2: the kernel for the requested norm; NORM_L2 shares the squared-L2 kernel.
+    NormKernel kernel = nullptr;
+    switch (norm_type)
+    {
+    case NORM_INF:
+        kernel = kernInf;
+        break;
+    case NORM_L1:
+        kernel = kernL1;
+        break;
+    case NORM_L2SQR:
+    case NORM_L2:
+        kernel = kernL2Sqr;
+        break;
+    default:
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    const int status = kernel(src, src_step, mask, mask_step, width, height, result);
+    if (norm_type == NORM_L2)
+        *result = std::sqrt(*result);
+    return status;
+}
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/norm_diff.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/norm_diff.hpp
@ -0,0 +1,605 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED
+#define OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_normDiff
+#define cv_hal_normDiff cv::cv_hal_rvv::normDiff
+/*
+ * Infinity norm of the difference of two single-channel 8-bit unsigned images: the
+ * largest |src1 - src2| over all elements. When `mask` is non-null, only pixels whose
+ * mask byte is non-zero take part. Row i of each buffer starts at base + i * step.
+ * The maximum is stored to *result (0 when nothing was selected) and CV_HAL_ERROR_OK
+ * is returned.
+ */
+inline int normDiffInf_8UC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m8();
+    auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); // per-lane running maxima, start at 0
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m8(width - j);
+                auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m8(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                auto vec_src = __riscv_vsub_vv_u8m8_m(bool_mask, __riscv_vmaxu_vv_u8m8_m(bool_mask, vec_src1, vec_src2, vl), // |a - b| = max(a, b) - min(a, b), no unsigned wrap
+                                                      __riscv_vminu_vv_u8m8_m(bool_mask, vec_src1, vec_src2, vl), vl);
+                vec_max = __riscv_vmaxu_tumu(bool_mask, vec_max, vec_max, vec_src, vl); // masked-off and tail lanes keep their old maximum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m8(width - j);
+                auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl);
+                auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); // |a - b|
+                vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); // lanes past vl keep their old maximum
+            }
+        }
+    }
+    auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax);
+    sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); // fold all lanes into element 0
+    *result = __riscv_vmv_x(sc_max);
+
+    return CV_HAL_ERROR_OK;
+}
+
+/*
+ * L1 norm of the difference of two single-channel 8-bit unsigned images: the sum of
+ * |src1 - src2| over all elements. When `mask` is non-null, only pixels whose mask
+ * byte is non-zero are summed.
+ *
+ *   src1, src1_step  first image and distance in bytes between its rows
+ *   src2, src2_step  second image and distance in bytes between its rows
+ *   mask, mask_step  optional 8-bit mask (null = no mask) and its row stride in bytes
+ *   width, height    image size in elements / rows
+ *   result           receives the sum; must be non-null
+ *
+ * Returns CV_HAL_ERROR_OK.
+ *
+ * Partial sums are kept in 32-bit lanes. They are moved into a double through a
+ * widening 64-bit reduction before any lane can wrap, so the total stays exact even
+ * when the sum exceeds 2^32 - 1 (a plain 32-bit final reduction wraps there).
+ */
+inline int normDiffL1_8UC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    double total = 0;
+    int cnt = 0;
+    // Adds the current lane sums to `total` (64-bit reduction, cannot wrap) and clears the lanes.
+    auto flush = [&]() {
+        auto sc_sum = __riscv_vmv_s_x_u64m1(0, vlmax);
+        sc_sum = __riscv_vwredsumu_vs_u32m8_u64m1(vec_sum, sc_sum, vlmax);
+        total += static_cast<double>(__riscv_vmv_x(sc_sum));
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+    // Call before accumulating `vl` more elements. Each difference adds at most 255 to a
+    // single lane, so keeping fewer than 2^24 elements between flushes bounds every
+    // lane by 255 * (2^24 - 1) < 2^32.
+    auto reserve = [&](int vl) {
+        if ((cnt += vl) < (1 << 24))
+            return;
+        cnt = vl;
+        flush();
+    };
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reserve(vl);
+
+                auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl);
+                // |a - b| computed as max(a, b) - min(a, b) to avoid unsigned wrap-around.
+                auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask, __riscv_vmaxu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl),
+                                                      __riscv_vminu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), vl);
+                auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask, vec_src, vl);
+                // tumu: masked-off and tail lanes keep their previous partial sum.
+                vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reserve(vl);
+
+                auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl);
+                auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl);
+                auto vec_zext = __riscv_vzext_vf4(vec_src, vl);
+                // tu: lanes past vl keep their previous partial sum.
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    flush();
+    *result = total;
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Squared L2 norm of the difference of two single-channel 8-bit unsigned images: the
+ * sum of (src1 - src2)^2 over all elements. When `mask` is non-null, only pixels whose
+ * mask byte is non-zero are summed. Row i of each buffer starts at base + i * step.
+ * *result is zeroed first and then receives the sum; CV_HAL_ERROR_OK is returned.
+ *
+ * Squares are accumulated in 32-bit lanes; the `reduce` helper periodically drains the
+ * lanes into the double *result so that the lanes do not overflow.
+ */
+inline int normDiffL2Sqr_8UC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); // per-lane partial sums of squares
+    int cnt = 0; // elements accounted for since the last drain
+    auto reduce = [&](int vl) {
+        if ((cnt += vl) < (1 << 16)) // a square is <= 255*255 = 65025, so < 2^16 of them fit in a u32 lane
+            return;
+        cnt = vl; // the vl elements announced by this call are accumulated after the drain
+        for (int i = 0; i < vlmax; i++)
+        {
+            *result += __riscv_vmv_x(vec_sum); // add lane 0 ...
+            vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); // ... then shift the next lane into position 0
+        }
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+
+    *result = 0;
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reduce(vl); // drain first if adding vl more elements could overflow a lane
+
+                auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl);
+                auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); // lane active <=> mask byte != 0
+                auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask, __riscv_vmaxu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), // |a - b| = max(a, b) - min(a, b), no unsigned wrap
+                                                      __riscv_vminu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), vl);
+                auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask, vec_src, vec_src, vl); // d*d widened to 16 bit
+                auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask, vec_mul, vl);
+                vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl); // masked-off and tail lanes keep their sum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            int vl;
+            for (int j = 0; j < width; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width - j);
+                reduce(vl);
+
+                auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl);
+                auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); // |a - b|
+                auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl);
+                auto vec_zext = __riscv_vzext_vf2(vec_mul, vl);
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); // lanes past vl keep their sum
+            }
+        }
+    }
+    reduce(1 << 16); // the argument alone reaches the threshold, forcing the final drain
+
+    return CV_HAL_ERROR_OK;
+}
+/*
+ * Infinity norm of the difference of two 4-channel 8-bit unsigned images: the largest
+ * |src1 - src2| over all channel bytes. Each row holds width * 4 source bytes; the
+ * mask (if non-null) holds one byte per pixel and a non-zero byte selects all four
+ * channels of that pixel. The maximum is stored to *result (0 when nothing was
+ * selected) and CV_HAL_ERROR_OK is returned.
+ */
+inline int normDiffInf_8UC4(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m8();
+    auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); // per-lane running maxima, start at 0
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl, vlm;
+            for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) // j walks source bytes, jm walks mask bytes (pixels)
+            {
+                vl = __riscv_vsetvl_e8m8(width * 4 - j);
+                vlm = __riscv_vsetvl_e8m2(width - jm); // NOTE(review): the code relies on vl == 4 * vlm - confirm for every vsetvl result
+                auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8m2(mask_row + jm, vlm);
+                auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); // clamp to 0/1, widen to u32, replicate into all 4 bytes
+                auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m8(vec_mask_ext), 0, vl); // one predicate bit per channel byte
+                auto vec_src = __riscv_vsub_vv_u8m8_m(bool_mask_ext, __riscv_vmaxu_vv_u8m8_m(bool_mask_ext, vec_src1, vec_src2, vl), // |a - b| = max(a, b) - min(a, b), no unsigned wrap
+                                                      __riscv_vminu_vv_u8m8_m(bool_mask_ext, vec_src1, vec_src2, vl), vl);
+                vec_max = __riscv_vmaxu_tumu(bool_mask_ext, vec_max, vec_max, vec_src, vl); // masked-off and tail lanes keep their old maximum
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            int vl;
+            for (int j = 0; j < width * 4; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m8(width * 4 - j);
+                auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl);
+                auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); // |a - b|
+                vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); // lanes past vl keep their old maximum
+            }
+        }
+    }
+    auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax);
+    sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); // fold all lanes into element 0
+    *result = __riscv_vmv_x(sc_max);
+
+    return CV_HAL_ERROR_OK;
+}
+
+/*
+ * L1 norm of the difference of two 4-channel 8-bit unsigned images: the sum of
+ * |src1 - src2| over all channel bytes. Each row holds width * 4 source bytes. The
+ * optional mask holds one byte per pixel; a non-zero byte selects all four channels
+ * of that pixel.
+ *
+ *   src1, src1_step  first image and distance in bytes between its rows
+ *   src2, src2_step  second image and distance in bytes between its rows
+ *   mask, mask_step  optional 8-bit mask (null = no mask) and its row stride in bytes
+ *   width, height    image size in pixels / rows
+ *   result           receives the sum; must be non-null
+ *
+ * Returns CV_HAL_ERROR_OK.
+ *
+ * Partial sums are kept in 32-bit lanes. They are moved into a double through a
+ * widening 64-bit reduction before any lane can wrap, so the total stays exact even
+ * when the sum exceeds 2^32 - 1 (a plain 32-bit final reduction wraps there).
+ */
+inline int normDiffL1_8UC4(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    int vlmax = __riscv_vsetvlmax_e8m2();
+    auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    double total = 0;
+    int cnt = 0;
+    // Adds the current lane sums to `total` (64-bit reduction, cannot wrap) and clears the lanes.
+    auto flush = [&]() {
+        auto sc_sum = __riscv_vmv_s_x_u64m1(0, vlmax);
+        sc_sum = __riscv_vwredsumu_vs_u32m8_u64m1(vec_sum, sc_sum, vlmax);
+        total += static_cast<double>(__riscv_vmv_x(sc_sum));
+        vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax);
+    };
+    // Call before accumulating `vl` more bytes. Each difference adds at most 255 to a
+    // single lane, so keeping fewer than 2^24 bytes between flushes bounds every lane
+    // by 255 * (2^24 - 1) < 2^32.
+    auto reserve = [&](int vl) {
+        if ((cnt += vl) < (1 << 24))
+            return;
+        cnt = vl;
+        flush();
+    };
+
+    if (mask)
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            const uchar* mask_row = mask + i * mask_step;
+            int vl, vlm;
+            // j walks source bytes, jm walks mask bytes (one per pixel).
+            for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm)
+            {
+                vl = __riscv_vsetvl_e8m2(width * 4 - j);
+                vlm = __riscv_vsetvl_e8mf2(width - jm);
+                reserve(vl);
+
+                auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl);
+                auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm);
+                // Clamp each mask byte to 0/1, widen to 32 bit and replicate it into all
+                // four bytes so that every channel of a pixel gets the same predicate.
+                auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm);
+                auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl);
+                // |a - b| computed as max(a, b) - min(a, b) to avoid unsigned wrap-around.
+                auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask_ext, __riscv_vmaxu_vv_u8m2_m(bool_mask_ext, vec_src1, vec_src2, vl),
+                                                      __riscv_vminu_vv_u8m2_m(bool_mask_ext, vec_src1, vec_src2, vl), vl);
+                auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask_ext, vec_src, vl);
+                // tumu: masked-off and tail lanes keep their previous partial sum.
+                vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < height; i++)
+        {
+            const uchar* src1_row = src1 + i * src1_step;
+            const uchar* src2_row = src2 + i * src2_step;
+            int vl;
+            for (int j = 0; j < width * 4; j += vl)
+            {
+                vl = __riscv_vsetvl_e8m2(width * 4 - j);
+                reserve(vl);
+
+                auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl);
+                auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl);
+                auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl);
+                auto vec_zext = __riscv_vzext_vf4(vec_src, vl);
+                // tu: lanes past vl keep their previous partial sum.
+                vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl);
+            }
+        }
+    }
+    flush();
+    *result = total;
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Squared L2 norm of the difference of two 4-channel 8-bit images:
+// *result = sum of (src1 - src2)^2 over every channel byte. A non-null mask holds one byte
+// per pixel; only pixels with a non-zero mask byte contribute. Steps are byte strides
+// between rows. Always returns CV_HAL_ERROR_OK.
+inline int normDiffL2Sqr_8UC4(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    const int lanes = __riscv_vsetvlmax_e8m2();
+    const int row_bytes = width * 4;
+    auto acc = __riscv_vmv_v_x_u32m8(0, lanes);
+    int pending = 0;
+
+    // Drains the 32-bit lane sums into *result once 2^16 or more elements are pending.
+    // A lane gains at most 255 * 255 per iteration, so it cannot wrap before that point.
+    auto drain = [&](int step) {
+        pending += step;
+        if (pending < (1 << 16))
+            return;
+        pending = step;
+        for (int lane = 0; lane < lanes; lane++)
+        {
+            // Read lane 0, then shift the remaining lanes down by one.
+            *result += __riscv_vmv_x(acc);
+            acc = __riscv_vslidedown(acc, 1, lanes);
+        }
+        acc = __riscv_vmv_v_x_u32m8(0, lanes);
+    };
+
+    *result = 0;
+    if (!mask)
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const uchar* a_row = src1 + row * src1_step;
+            const uchar* b_row = src2 + row * src2_step;
+            int col = 0;
+            while (col < row_bytes)
+            {
+                const int vl = __riscv_vsetvl_e8m2(row_bytes - col);
+                drain(vl);
+
+                auto a = __riscv_vle8_v_u8m2(a_row + col, vl);
+                auto b = __riscv_vle8_v_u8m2(b_row + col, vl);
+                // |a - b| as max(a, b) - min(a, b), which stays within unsigned bytes.
+                auto diff = __riscv_vsub(__riscv_vmaxu(a, b, vl), __riscv_vminu(a, b, vl), vl);
+                auto sq16 = __riscv_vwmulu(diff, diff, vl);
+                auto sq32 = __riscv_vzext_vf2(sq16, vl);
+                acc = __riscv_vadd_tu(acc, acc, sq32, vl);
+                col += vl;
+            }
+        }
+    }
+    else
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const uchar* a_row = src1 + row * src1_step;
+            const uchar* b_row = src2 + row * src2_step;
+            const uchar* m_row = mask + row * mask_step;
+            // col walks channel bytes, px walks pixels (mask bytes).
+            int col = 0, px = 0;
+            while (col < row_bytes)
+            {
+                const int vl = __riscv_vsetvl_e8m2(row_bytes - col);
+                const int vlm = __riscv_vsetvl_e8mf2(width - px);
+                drain(vl);
+
+                auto a = __riscv_vle8_v_u8m2(a_row + col, vl);
+                auto b = __riscv_vle8_v_u8m2(b_row + col, vl);
+                auto m = __riscv_vle8_v_u8mf2(m_row + px, vlm);
+                // Clamp the mask to 0/1, widen to 32 bits and replicate into all four bytes so
+                // there is one flag per channel byte.
+                auto m01 = __riscv_vminu(m, 1, vlm);
+                auto m4 = __riscv_vmul(__riscv_vzext_vf4(m01, vlm), 0x01010101, vlm);
+                auto keep = __riscv_vmsne(__riscv_vreinterpret_u8m2(m4), 0, vl);
+                auto hi = __riscv_vmaxu_vv_u8m2_m(keep, a, b, vl);
+                auto lo = __riscv_vminu_vv_u8m2_m(keep, a, b, vl);
+                auto diff = __riscv_vsub_vv_u8m2_m(keep, hi, lo, vl);
+                auto sq16 = __riscv_vwmulu_vv_u16m4_m(keep, diff, diff, vl);
+                auto sq32 = __riscv_vzext_vf2_u32m8_m(keep, sq16, vl);
+                acc = __riscv_vadd_tumu(keep, acc, acc, sq32, vl);
+                col += vl;
+                px += vlm;
+            }
+        }
+    }
+    // Reaching the threshold in one step forces the final drain.
+    drain(1 << 16);
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Infinity norm of the difference of two single-channel float images:
+// *result = max of |src1 - src2|, starting from 0. A non-null mask holds one byte per
+// element; only elements with a non-zero mask byte are considered. Steps are byte strides
+// between rows. Always returns CV_HAL_ERROR_OK.
+inline int normDiffInf_32FC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    const int lanes = __riscv_vsetvlmax_e32m8();
+    auto best = __riscv_vfmv_v_f_f32m8(0, lanes);
+
+    if (!mask)
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const float* a_row = reinterpret_cast<const float*>(src1 + row * src1_step);
+            const float* b_row = reinterpret_cast<const float*>(src2 + row * src2_step);
+            int col = 0;
+            while (col < width)
+            {
+                const int vl = __riscv_vsetvl_e32m8(width - col);
+                auto a = __riscv_vle32_v_f32m8(a_row + col, vl);
+                auto b = __riscv_vle32_v_f32m8(b_row + col, vl);
+                auto mag = __riscv_vfabs(__riscv_vfsub(a, b, vl), vl);
+                // Tail-undisturbed: lanes beyond vl keep their running maximum.
+                best = __riscv_vfmax_tu(best, best, mag, vl);
+                col += vl;
+            }
+        }
+    }
+    else
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const float* a_row = reinterpret_cast<const float*>(src1 + row * src1_step);
+            const float* b_row = reinterpret_cast<const float*>(src2 + row * src2_step);
+            const uchar* m_row = mask + row * mask_step;
+            int col = 0;
+            while (col < width)
+            {
+                const int vl = __riscv_vsetvl_e32m8(width - col);
+                auto a = __riscv_vle32_v_f32m8(a_row + col, vl);
+                auto b = __riscv_vle32_v_f32m8(b_row + col, vl);
+                auto m = __riscv_vle8_v_u8m2(m_row + col, vl);
+                auto keep = __riscv_vmsne(m, 0, vl);
+                auto delta = __riscv_vfsub_vv_f32m8_m(keep, a, b, vl);
+                auto mag = __riscv_vfabs_v_f32m8_m(keep, delta, vl);
+                // Tail- and mask-undisturbed: masked-off lanes keep their running maximum.
+                best = __riscv_vfmax_tumu(keep, best, best, mag, vl);
+                col += vl;
+            }
+        }
+    }
+
+    // Collapse the per-lane maxima into one scalar, seeded with 0.
+    auto top = __riscv_vfredmax(best, __riscv_vfmv_s_f_f32m1(0, lanes), lanes);
+    *result = __riscv_vfmv_f(top);
+
+    return CV_HAL_ERROR_OK;
+}
+
+// L1 norm of the difference of two single-channel float images:
+// *result = sum of |src1 - src2|, accumulated in double precision. A non-null mask holds
+// one byte per element; only elements with a non-zero mask byte contribute. Steps are byte
+// strides between rows. Always returns CV_HAL_ERROR_OK.
+inline int normDiffL1_32FC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    const int lanes = __riscv_vsetvlmax_e32m4();
+    auto acc = __riscv_vfmv_v_f_f64m8(0, lanes);
+
+    if (!mask)
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const float* a_row = reinterpret_cast<const float*>(src1 + row * src1_step);
+            const float* b_row = reinterpret_cast<const float*>(src2 + row * src2_step);
+            int col = 0;
+            while (col < width)
+            {
+                const int vl = __riscv_vsetvl_e32m4(width - col);
+                auto a = __riscv_vle32_v_f32m4(a_row + col, vl);
+                auto b = __riscv_vle32_v_f32m4(b_row + col, vl);
+                auto mag = __riscv_vfabs(__riscv_vfsub(a, b, vl), vl);
+                // Widen float -> double before accumulating.
+                auto mag64 = __riscv_vfwcvt_f_f_v_f64m8(mag, vl);
+                acc = __riscv_vfadd_tu(acc, acc, mag64, vl);
+                col += vl;
+            }
+        }
+    }
+    else
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const float* a_row = reinterpret_cast<const float*>(src1 + row * src1_step);
+            const float* b_row = reinterpret_cast<const float*>(src2 + row * src2_step);
+            const uchar* m_row = mask + row * mask_step;
+            int col = 0;
+            while (col < width)
+            {
+                const int vl = __riscv_vsetvl_e32m4(width - col);
+                auto a = __riscv_vle32_v_f32m4(a_row + col, vl);
+                auto b = __riscv_vle32_v_f32m4(b_row + col, vl);
+                auto m = __riscv_vle8_v_u8m1(m_row + col, vl);
+                auto keep = __riscv_vmsne(m, 0, vl);
+                auto delta = __riscv_vfsub_vv_f32m4_m(keep, a, b, vl);
+                auto mag = __riscv_vfabs_v_f32m4_m(keep, delta, vl);
+                auto mag64 = __riscv_vfwcvt_f_f_v_f64m8_m(keep, mag, vl);
+                // Tail- and mask-undisturbed: masked-off lanes keep their running sums.
+                acc = __riscv_vfadd_tumu(keep, acc, acc, mag64, vl);
+                col += vl;
+            }
+        }
+    }
+
+    // Ordered sum of the per-lane totals, seeded with 0.
+    auto total = __riscv_vfredosum(acc, __riscv_vfmv_s_f_f64m1(0, lanes), lanes);
+    *result = __riscv_vfmv_f(total);
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Squared L2 norm of the difference of two single-channel float images:
+// *result = sum of (src1 - src2)^2, with the squares formed and accumulated in double
+// precision. A non-null mask holds one byte per element; only elements with a non-zero mask
+// byte contribute. Steps are byte strides between rows. Always returns CV_HAL_ERROR_OK.
+inline int normDiffL2Sqr_32FC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result)
+{
+    const int lanes = __riscv_vsetvlmax_e32m4();
+    auto acc = __riscv_vfmv_v_f_f64m8(0, lanes);
+
+    if (!mask)
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const float* a_row = reinterpret_cast<const float*>(src1 + row * src1_step);
+            const float* b_row = reinterpret_cast<const float*>(src2 + row * src2_step);
+            int col = 0;
+            while (col < width)
+            {
+                const int vl = __riscv_vsetvl_e32m4(width - col);
+                auto a = __riscv_vle32_v_f32m4(a_row + col, vl);
+                auto b = __riscv_vle32_v_f32m4(b_row + col, vl);
+                auto delta = __riscv_vfsub(a, b, vl);
+                // Widening multiply: float operands, double product.
+                auto sq = __riscv_vfwmul(delta, delta, vl);
+                acc = __riscv_vfadd_tu(acc, acc, sq, vl);
+                col += vl;
+            }
+        }
+    }
+    else
+    {
+        for (int row = 0; row < height; row++)
+        {
+            const float* a_row = reinterpret_cast<const float*>(src1 + row * src1_step);
+            const float* b_row = reinterpret_cast<const float*>(src2 + row * src2_step);
+            const uchar* m_row = mask + row * mask_step;
+            int col = 0;
+            while (col < width)
+            {
+                const int vl = __riscv_vsetvl_e32m4(width - col);
+                auto a = __riscv_vle32_v_f32m4(a_row + col, vl);
+                auto b = __riscv_vle32_v_f32m4(b_row + col, vl);
+                auto m = __riscv_vle8_v_u8m1(m_row + col, vl);
+                auto keep = __riscv_vmsne(m, 0, vl);
+                auto delta = __riscv_vfsub_vv_f32m4_m(keep, a, b, vl);
+                auto sq = __riscv_vfwmul_vv_f64m8_m(keep, delta, delta, vl);
+                // Tail- and mask-undisturbed: masked-off lanes keep their running sums.
+                acc = __riscv_vfadd_tumu(keep, acc, acc, sq, vl);
+                col += vl;
+            }
+        }
+    }
+
+    // Ordered sum of the per-lane totals, seeded with 0.
+    auto total = __riscv_vfredosum(acc, __riscv_vfmv_s_f_f64m1(0, lanes), lanes);
+    *result = __riscv_vfmv_f(total);
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Entry point for the norm-of-difference HAL hook: picks the kernel matching `type` and
+// `norm_type` and stores the norm of (src1 - src2) in *result.
+//  - A null `result` is accepted and reported as CV_HAL_ERROR_OK without any computation.
+//  - Handled types: CV_8UC1, CV_8UC4, CV_32FC1; handled norms: NORM_INF, NORM_L1, NORM_L2SQR
+//    and NORM_L2. Any other combination yields CV_HAL_ERROR_NOT_IMPLEMENTED.
+//  - With NORM_RELATIVE set, the value is divided by (norm(src2) + DBL_EPSILON); the return
+//    code is then the one reported by the norm() call.
+inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask,
+                    size_t mask_step, int width, int height, int type, int norm_type, double* result)
+{
+    if (!result)
+        return CV_HAL_ERROR_OK;
+
+    const int base_norm = norm_type & ~NORM_RELATIVE;
+    // NORM_L2 runs the squared-L2 kernel and takes the square root afterwards.
+    const bool is_l2 = (base_norm == NORM_L2);
+    const int kernel_norm = is_l2 ? static_cast<int>(NORM_L2SQR) : base_norm;
+
+    int ret = CV_HAL_ERROR_NOT_IMPLEMENTED;
+    bool type_handled = true;
+    switch (type)
+    {
+    case CV_8UC1:
+        if (kernel_norm == NORM_INF)
+            ret = normDiffInf_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        else if (kernel_norm == NORM_L1)
+            ret = normDiffL1_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        else if (kernel_norm == NORM_L2SQR)
+            ret = normDiffL2Sqr_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        break;
+    case CV_8UC4:
+        if (kernel_norm == NORM_INF)
+            ret = normDiffInf_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        else if (kernel_norm == NORM_L1)
+            ret = normDiffL1_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        else if (kernel_norm == NORM_L2SQR)
+            ret = normDiffL2Sqr_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        break;
+    case CV_32FC1:
+        if (kernel_norm == NORM_INF)
+            ret = normDiffInf_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        else if (kernel_norm == NORM_L1)
+            ret = normDiffL1_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        else if (kernel_norm == NORM_L2SQR)
+            ret = normDiffL2Sqr_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
+        break;
+    default:
+        type_handled = false;
+    }
+
+    // The root is taken whenever a squared-L2 kernel ran on behalf of NORM_L2.
+    if (is_l2 && type_handled)
+        *result = std::sqrt(*result);
+
+    const bool relative = (norm_type & NORM_RELATIVE) != 0;
+    if (ret == CV_HAL_ERROR_OK && relative)
+    {
+        double denom;
+        ret = cv::cv_hal_rvv::norm(src2, src2_step, mask, mask_step, width, height, type, base_norm, &denom);
+        if (ret == CV_HAL_ERROR_OK)
+        {
+            // DBL_EPSILON keeps the division defined when src2 has zero norm.
+            *result /= denom + DBL_EPSILON;
+        }
+    }
+
+    return ret;
+}
+
+}}
+
+#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/split.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/split.hpp
@ -0,0 +1,93 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED
+#define OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_split8u
+#define cv_hal_split8u cv::cv_hal_rvv::split8u
+
+// De-interleaves `len` elements of `cn`-channel 8-bit data from `src` into the separate
+// planes dst[0] .. dst[cn - 1]. One to four channels use dedicated unit-stride or segment
+// loads; larger channel counts are processed four channels at a time with strided segment
+// loads, and the leftover channels one by one with strided loads.
+// Always returns CV_HAL_ERROR_OK.
+inline int split8u(const uchar* src, uchar** dst, int len, int cn)
+{
+    int step = 0;
+    switch (cn)
+    {
+    case 1:
+    {
+        // Single channel: a plain copy.
+        uchar* plane = dst[0];
+        for (int pos = 0; pos < len; pos += step)
+        {
+            step = __riscv_vsetvl_e8m8(len - pos);
+            auto chunk = __riscv_vle8_v_u8m8(src + pos, step);
+            __riscv_vse8_v_u8m8(plane + pos, chunk, step);
+        }
+        break;
+    }
+    case 2:
+    {
+        uchar* plane0 = dst[0];
+        uchar* plane1 = dst[1];
+        for (int pos = 0; pos < len; pos += step)
+        {
+            step = __riscv_vsetvl_e8m4(len - pos);
+            auto fields = __riscv_vlseg2e8_v_u8m4x2(src + pos * cn, step);
+            __riscv_vse8_v_u8m4(plane0 + pos, __riscv_vget_v_u8m4x2_u8m4(fields, 0), step);
+            __riscv_vse8_v_u8m4(plane1 + pos, __riscv_vget_v_u8m4x2_u8m4(fields, 1), step);
+        }
+        break;
+    }
+    case 3:
+    {
+        uchar* plane0 = dst[0];
+        uchar* plane1 = dst[1];
+        uchar* plane2 = dst[2];
+        for (int pos = 0; pos < len; pos += step)
+        {
+            step = __riscv_vsetvl_e8m2(len - pos);
+            auto fields = __riscv_vlseg3e8_v_u8m2x3(src + pos * cn, step);
+            __riscv_vse8_v_u8m2(plane0 + pos, __riscv_vget_v_u8m2x3_u8m2(fields, 0), step);
+            __riscv_vse8_v_u8m2(plane1 + pos, __riscv_vget_v_u8m2x3_u8m2(fields, 1), step);
+            __riscv_vse8_v_u8m2(plane2 + pos, __riscv_vget_v_u8m2x3_u8m2(fields, 2), step);
+        }
+        break;
+    }
+    case 4:
+    {
+        uchar* plane0 = dst[0];
+        uchar* plane1 = dst[1];
+        uchar* plane2 = dst[2];
+        uchar* plane3 = dst[3];
+        for (int pos = 0; pos < len; pos += step)
+        {
+            step = __riscv_vsetvl_e8m2(len - pos);
+            auto fields = __riscv_vlseg4e8_v_u8m2x4(src + pos * cn, step);
+            __riscv_vse8_v_u8m2(plane0 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 0), step);
+            __riscv_vse8_v_u8m2(plane1 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 1), step);
+            __riscv_vse8_v_u8m2(plane2 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 2), step);
+            __riscv_vse8_v_u8m2(plane3 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 3), step);
+        }
+        break;
+    }
+    default:
+    {
+        int ch = 0;
+        // Groups of four channels: strided segment load with a stride of cn bytes.
+        while (ch <= cn - 4)
+        {
+            uchar* plane0 = dst[ch];
+            uchar* plane1 = dst[ch + 1];
+            uchar* plane2 = dst[ch + 2];
+            uchar* plane3 = dst[ch + 3];
+            for (int pos = 0; pos < len; pos += step)
+            {
+                step = __riscv_vsetvl_e8m2(len - pos);
+                auto fields = __riscv_vlsseg4e8_v_u8m2x4(src + ch + pos * cn, cn, step);
+                __riscv_vse8_v_u8m2(plane0 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 0), step);
+                __riscv_vse8_v_u8m2(plane1 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 1), step);
+                __riscv_vse8_v_u8m2(plane2 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 2), step);
+                __riscv_vse8_v_u8m2(plane3 + pos, __riscv_vget_v_u8m2x4_u8m2(fields, 3), step);
+            }
+            ch += 4;
+        }
+        // Remaining channels: one strided load per channel.
+        while (ch < cn)
+        {
+            uchar* plane = dst[ch];
+            for (int pos = 0; pos < len; pos += step)
+            {
+                step = __riscv_vsetvl_e8m2(len - pos);
+                auto column = __riscv_vlse8_v_u8m2(src + ch + pos * cn, cn, step);
+                __riscv_vse8_v_u8m2(plane + pos, column, step);
+            }
+            ++ch;
+        }
+        break;
+    }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+}}
+#endif
--- a/3rdparty/ippicv/ippicv.cmake
+++ b/3rdparty/ippicv/ippicv.cmake
@ -2,7 +2,7 @@ function(download_ippicv root_var)
  set(${root_var} "" PARENT_SCOPE)

  # Commit SHA in the opencv_3rdparty repo
-  set(IPPICV_COMMIT "7f55c0c26be418d494615afca15218566775c725")
+  set(IPPICV_COMMIT "d1cbea44d326eb0421fedcdd16de4630fd8c7ed0")
  # Define actual ICV versions
  if(APPLE)
    set(IPPICV_COMMIT "0cc4aa06bf2bef4b05d237c69a5a96b9cd0cb85a")
@ -14,9 +14,10 @@ function(download_ippicv root_var)
    set(OPENCV_ICV_PLATFORM "linux")
    set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
    if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2021.12.0_lnx_intel64_20240425_general.tgz")
-      set(OPENCV_ICV_HASH "d06e6d44ece88f7f17a6cd9216761186")
+      set(OPENCV_ICV_NAME "ippicv_2022.0.0_lnx_intel64_20240904_general.tgz")
+      set(OPENCV_ICV_HASH "63717ee0f918ad72fb5a737992a206d1")
    else()
+      set(IPPICV_COMMIT "7f55c0c26be418d494615afca15218566775c725")
      set(OPENCV_ICV_NAME "ippicv_2021.12.0_lnx_ia32_20240425_general.tgz")
      set(OPENCV_ICV_HASH "85ffa2b9ed7802b93c23fa27b0097d36")
    endif()
@ -24,9 +25,10 @@ function(download_ippicv root_var)
    set(OPENCV_ICV_PLATFORM "windows")
    set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
    if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2021.12.0_win_intel64_20240425_general.zip")
-      set(OPENCV_ICV_HASH "402ff8c6b4986738fed71c44e1ce665d")
+      set(OPENCV_ICV_NAME "ippicv_2022.0.0_win_intel64_20240904_general.zip")
+      set(OPENCV_ICV_HASH "3a6eca7cc3bce7159eb1443c6fca4e31")
    else()
+      set(IPPICV_COMMIT "7f55c0c26be418d494615afca15218566775c725")
      set(OPENCV_ICV_NAME "ippicv_2021.12.0_win_ia32_20240425_general.zip")
      set(OPENCV_ICV_HASH "8b1d2a23957d57624d0de8f2a5cae5f1")
    endif()
--- a/3rdparty/ittnotify/CMakeLists.txt
+++ b/3rdparty/ittnotify/CMakeLists.txt
@ -24,7 +24,6 @@ set(ITT_PUBLIC_HDRS
    include/ittnotify.h
    include/jitprofiling.h
    include/libittnotify.h
-    include/llvm_jit_event_listener.hpp
 )
 set(ITT_PRIVATE_HDRS
    src/ittnotify/disable_warnings.h
@ -39,6 +38,11 @@ set(ITT_SRCS

 add_library(${ITT_LIBRARY} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${ITT_SRCS} ${ITT_PUBLIC_HDRS} ${ITT_PRIVATE_HDRS})

+file(STRINGS "src/ittnotify/ittnotify_config.h" API_VERSION_NUM REGEX "#define\[ \t]+API_VERSION_NUM[ \t]+([0-9\.]+)")
+if(API_VERSION_NUM MATCHES "#define\[ \t]+API_VERSION_NUM[ \t]+([0-9\.]*)")
+  set(ITTNOTIFY_VERSION "${CMAKE_MATCH_1}"  CACHE INTERNAL "" FORCE)
+endif()
+
 if(NOT WIN32)
  if(HAVE_DL_LIBRARY)
    target_link_libraries(${ITT_LIBRARY} dl)
@ -64,4 +68,4 @@ if(NOT BUILD_SHARED_LIBS)
  ocv_install_target(${ITT_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev OPTIONAL)
 endif()

-ocv_install_3rdparty_licenses(ittnotify src/ittnotify/LICENSE.BSD src/ittnotify/LICENSE.GPL)
+ocv_install_3rdparty_licenses(ittnotify src/ittnotify/BSD-3-Clause.txt src/ittnotify/GPL-2.0-only.txt)
--- a/3rdparty/ittnotify/include/ittnotify.h
+++ b/3rdparty/ittnotify/include/ittnotify.h
@ -1,60 +1,8 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/
 #ifndef _ITTNOTIFY_H_
 #define _ITTNOTIFY_H_

@ -63,7 +11,8 @@
@brief Public User API functions and types
@mainpage

-The ITT API is used to annotate a user's program with additional information
+The Instrumentation and Tracing Technology API (ITT API) is used to
+annotate a user's program with additional information
 that can be used by correctness and performance tools. The user inserts
 calls in their program. Those calls generate information that is collected
 at runtime, and used by Intel(R) Threading Tools.
@ -141,6 +90,10 @@ The same ID may not be reused for different instances, unless a previous
 #  define ITT_OS_FREEBSD   4
 #endif /* ITT_OS_FREEBSD */

+#ifndef ITT_OS_OPENBSD
+#  define ITT_OS_OPENBSD   5
+#endif /* ITT_OS_OPENBSD */
+
 #ifndef ITT_OS
 #  if defined WIN32 || defined _WIN32
 #    define ITT_OS ITT_OS_WIN
@ -148,6 +101,8 @@ The same ID may not be reused for different instances, unless a previous
 #    define ITT_OS ITT_OS_MAC
 #  elif defined( __FreeBSD__ )
 #    define ITT_OS ITT_OS_FREEBSD
+#  elif defined( __OpenBSD__)
+#    define ITT_OS ITT_OS_OPENBSD
 #  else
 #    define ITT_OS ITT_OS_LINUX
 #  endif
@ -169,6 +124,10 @@ The same ID may not be reused for different instances, unless a previous
 #  define ITT_PLATFORM_FREEBSD 4
 #endif /* ITT_PLATFORM_FREEBSD */

+#ifndef ITT_PLATFORM_OPENBSD
+#  define ITT_PLATFORM_OPENBSD 5
+#endif /* ITT_PLATFORM_OPENBSD */
+
 #ifndef ITT_PLATFORM
 #  if ITT_OS==ITT_OS_WIN
 #    define ITT_PLATFORM ITT_PLATFORM_WIN
@ -176,6 +135,8 @@ The same ID may not be reused for different instances, unless a previous
 #    define ITT_PLATFORM ITT_PLATFORM_MAC
 #  elif ITT_OS==ITT_OS_FREEBSD
 #    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  elif ITT_OS==ITT_OS_OPENBSD
+#    define ITT_PLATFORM ITT_PLATFORM_OPENBSD
 #  else
 #    define ITT_PLATFORM ITT_PLATFORM_POSIX
 #  endif
@ -228,7 +189,12 @@ The same ID may not be reused for different instances, unless a previous

 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 /* use __forceinline (VC++ specific) */
-#define ITT_INLINE           __forceinline
+#if defined(__MINGW32__) && !defined(__cplusplus)
+#define ITT_INLINE           static __inline__ __attribute__((__always_inline__,__gnu_inline__))
+#else
+#define ITT_INLINE           static __forceinline
+#endif /* __MINGW32__ */
+
 #define ITT_INLINE_ATTRIBUTE /* nothing */
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 /*
@ -289,20 +255,20 @@ The same ID may not be reused for different instances, unless a previous
 #define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
 #define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)

-#define ITTNOTIFY_VOID_D0(n,d)       (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_VOID_D1(n,d,x)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_VOID_D2(n,d,x,y)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
-#define ITTNOTIFY_DATA_D0(n,d)       (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_DATA_D1(n,d,x)     (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_DATA_D2(n,d,x,y)   (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_VOID_D0(n,d)       (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x)     (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y)   (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d)       (d == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x)     (d == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y)   (d == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (d == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)

 #ifdef ITT_STUB
 #undef ITT_STUB
@ -340,7 +306,7 @@ extern "C" {
 *     only pauses tracing and analyzing memory access.
 *     It does not pause tracing or analyzing threading APIs.
 *   .
- * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ * - Intel(R) VTune(TM) Profiler:
 *   - Does continue to record when new threads are started.
 *   .
 * - Other effects:
@ -355,35 +321,143 @@ void ITTAPI __itt_resume(void);
 /** @brief Detach collection */
 void ITTAPI __itt_detach(void);

+/**
+ * @enum __itt_collection_scope
+ * @brief Enumerator for collection scopes
+ */
+typedef enum {
+    __itt_collection_scope_host    = 1 << 0,    /*!< Bit 0 (value 1) */
+    __itt_collection_scope_offload = 1 << 1,    /*!< Bit 1 (value 2) */
+    __itt_collection_scope_all     = 0x7FFFFFFF /*!< All 31 low bits set; includes both scopes above */
+} __itt_collection_scope;
+
+/** @brief Pause scoped collection */
+void ITTAPI __itt_pause_scoped(__itt_collection_scope);
+/** @brief Resume scoped collection */
+void ITTAPI __itt_resume_scoped(__itt_collection_scope);
+
 /** @cond exclude_from_documentation */
 #ifndef INTEL_NO_MACRO_BODY
 #ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, pause,  (void))
-ITT_STUBV(ITTAPI, void, resume, (void))
-ITT_STUBV(ITTAPI, void, detach, (void))
-#define __itt_pause      ITTNOTIFY_VOID(pause)
-#define __itt_pause_ptr  ITTNOTIFY_NAME(pause)
-#define __itt_resume     ITTNOTIFY_VOID(resume)
-#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
-#define __itt_detach     ITTNOTIFY_VOID(detach)
-#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
+ITT_STUBV(ITTAPI, void, pause,         (void))
+ITT_STUBV(ITTAPI, void, pause_scoped,  (__itt_collection_scope))
+ITT_STUBV(ITTAPI, void, resume,        (void))
+ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope))
+ITT_STUBV(ITTAPI, void, detach,        (void))
+#define __itt_pause             ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr         ITTNOTIFY_NAME(pause)
+#define __itt_pause_scoped      ITTNOTIFY_VOID(pause_scoped)
+#define __itt_pause_scoped_ptr  ITTNOTIFY_NAME(pause_scoped)
+#define __itt_resume            ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr        ITTNOTIFY_NAME(resume)
+#define __itt_resume_scoped     ITTNOTIFY_VOID(resume_scoped)
+#define __itt_resume_scoped_ptr ITTNOTIFY_NAME(resume_scoped)
+#define __itt_detach            ITTNOTIFY_VOID(detach)
+#define __itt_detach_ptr        ITTNOTIFY_NAME(detach)
 #else  /* INTEL_NO_ITTNOTIFY_API */
 #define __itt_pause()
-#define __itt_pause_ptr  0
+#define __itt_pause_ptr           0
+#define __itt_pause_scoped(scope)
+#define __itt_pause_scoped_ptr    0
 #define __itt_resume()
-#define __itt_resume_ptr 0
+#define __itt_resume_ptr          0
+#define __itt_resume_scoped(scope)
+#define __itt_resume_scoped_ptr   0
 #define __itt_detach()
-#define __itt_detach_ptr 0
+#define __itt_detach_ptr          0
 #endif /* INTEL_NO_ITTNOTIFY_API */
 #else  /* INTEL_NO_MACRO_BODY */
-#define __itt_pause_ptr  0
-#define __itt_resume_ptr 0
-#define __itt_detach_ptr 0
+#define __itt_pause_ptr           0
+#define __itt_pause_scoped_ptr    0
+#define __itt_resume_ptr          0
+#define __itt_resume_scoped_ptr   0
+#define __itt_detach_ptr          0
 #endif /* INTEL_NO_MACRO_BODY */
 /** @endcond */
 /** @} control group */
 /** @endcond */

+/**
+ * @defgroup Intel Processor Trace control
+ * API from this group provides control over collection and analysis of Intel Processor Trace (Intel PT) data
+ * Information about Intel Processor Trace technology can be found here (Volume 3 chapter 35):
+ * https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf
+ * Use this API to mark particular code regions for loading detailed performance statistics.
+ * This mode makes your analysis faster and more accurate.
+ * @{
+*/
+typedef unsigned char __itt_pt_region;
+
+/**
+ * @brief function saves a region name marked with Intel PT API and returns a region id.
+ * Only 7 names can be registered. Attempts to register more names will be ignored and a region id with auto names will be returned.
+ * For automatic naming of regions pass NULL as function parameter
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_pt_region ITTAPI __itt_pt_region_createA(const char    *name);
+__itt_pt_region ITTAPI __itt_pt_region_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_pt_region_create __itt_pt_region_createW
+#else /* UNICODE */
+#  define __itt_pt_region_create __itt_pt_region_createA
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_pt_region ITTAPI __itt_pt_region_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_pt_region_createA     ITTNOTIFY_DATA(pt_region_createA)
+#define __itt_pt_region_createA_ptr ITTNOTIFY_NAME(pt_region_createA)
+#define __itt_pt_region_createW     ITTNOTIFY_DATA(pt_region_createW)
+#define __itt_pt_region_createW_ptr ITTNOTIFY_NAME(pt_region_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_pt_region_create     ITTNOTIFY_DATA(pt_region_create)
+#define __itt_pt_region_create_ptr ITTNOTIFY_NAME(pt_region_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_pt_region_createA(name) (__itt_pt_region)0
+#define __itt_pt_region_createA_ptr 0
+#define __itt_pt_region_createW(name) (__itt_pt_region)0
+#define __itt_pt_region_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_pt_region_create(name)  (__itt_pt_region)0
+#define __itt_pt_region_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_pt_region_createA_ptr 0
+#define __itt_pt_region_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_pt_region_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief function contains a special code pattern identified on the post-processing stage and
+ * marks the beginning of a code region targeted for Intel PT analysis
+ * @param[in] region - region id, 0 <= region < 8
+*/
+void __itt_mark_pt_region_begin(__itt_pt_region region);
+/**
+ * @brief function contains a special code pattern identified on the post-processing stage and
+ * marks the end of a code region targeted for Intel PT analysis
+ * @param[in] region - region id, 0 <= region < 8
+*/
+void __itt_mark_pt_region_end(__itt_pt_region region);
+/** @} Intel PT control group*/
+
 /**
 * @defgroup threads Threads
 * @ingroup public
@ -541,14 +615,26 @@ ITT_STUBV(ITTAPI, void, suppress_pop, (void))
 /** @endcond */

 /**
- * @enum __itt_model_disable
- * @brief Enumerator for the disable methods
+ * @enum __itt_suppress_mode
+ * @brief Enumerator for the suppressing modes
 */
 typedef enum __itt_suppress_mode {
    __itt_unsuppress_range,
    __itt_suppress_range
 } __itt_suppress_mode_t;

+/**
+ * @enum __itt_collection_state
+ * @brief Enumerator for collection state.
+ */
+typedef enum {
+    __itt_collection_uninitialized = 0, /* not yet initialized */
+    __itt_collection_init_fail = 1, /* initialization failed */
+    __itt_collection_collector_absent = 2, /* non-working state: collector is absent */
+    __itt_collection_collector_exists = 3, /* working state: collector exists */
+    __itt_collection_init_successful = 4 /* initialization succeeded */
+} __itt_collection_state;
+
 /**
 * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask
 */
@ -1496,7 +1582,7 @@ ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr,
 /** @endcond */

 /**
- * @brief Record an free begin occurrence.
+ * @brief Record a free begin occurrence.
 */
 void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr);

@ -1516,7 +1602,7 @@ ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr))
 /** @endcond */

 /**
- * @brief Record an free end occurrence.
+ * @brief Record a free end occurrence.
 */
 void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr);

@ -1536,7 +1622,7 @@ ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr))
 /** @endcond */

 /**
- * @brief Record an reallocation begin occurrence.
+ * @brief Record a reallocation begin occurrence.
 */
 void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized);

@ -1556,7 +1642,7 @@ ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* add
 /** @endcond */

 /**
- * @brief Record an reallocation end occurrence.
+ * @brief Record a reallocation end occurrence.
 */
 void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized);

@ -2692,7 +2778,7 @@ ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info

 /**
 * @ingroup clockdomains
- * @brief Recalculate clock domains frequences and clock base timestamps.
+ * @brief Recalculate clock domains frequencies and clock base timestamps.
 */
 void ITTAPI __itt_clock_domain_reset(void);

@ -3597,11 +3683,12 @@ ITT_STUBV(ITTAPI, void, enable_attach, (void))
 /** @endcond */

 /**
- * @brief Module load info
- * This API is used to report necessary information in case of module relocation
- * @param[in] start_addr - relocated module start address
- * @param[in] end_addr - relocated module end address
- * @param[in] path - file system path to the module
+ * @brief Module load notification
+ * This API is used to report necessary information in case of bypassing default system loader.
+ * Notification should be done immediately after this module is loaded to process memory.
+ * @param[in] start_addr - module start address
+ * @param[in] end_addr - module end address
+ * @param[in] path - file system full path to the module
 */
 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path);
@ -3656,7 +3743,462 @@ ITT_STUB(ITTAPI, void, module_load,  (void *start_addr, void *end_addr, const ch
 #endif /* INTEL_NO_MACRO_BODY */
 /** @endcond */

+/**
+ * @brief Report module unload
+ * This API is used to report necessary information in case of bypassing default system loader.
+ * Notification should be done just before the module is unloaded from process memory.
+ * @param[in] addr - base address of loaded module
+ */
+void ITTAPI __itt_module_unload(void *addr);

+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, module_unload, (void *addr))
+#define __itt_module_unload     ITTNOTIFY_VOID(module_unload)
+#define __itt_module_unload_ptr ITTNOTIFY_NAME(module_unload)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_module_unload(addr)
+#define __itt_module_unload_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_module_unload_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef enum
+{
+    __itt_module_type_unknown = 0,
+    __itt_module_type_elf,
+    __itt_module_type_coff
+} __itt_module_type;
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef enum
+{
+    itt_section_type_unknown,
+    itt_section_type_bss,        /* notifies that the section contains uninitialized data. These are the relevant section types and the modules that contain them:
+                                  * ELF module:  SHT_NOBITS section type
+                                  * COFF module: IMAGE_SCN_CNT_UNINITIALIZED_DATA section type
+                                  */
+    itt_section_type_data,       /* notifies that section contains initialized data. These are the relevant section types and the modules that contain them:
+                                  * ELF module:  SHT_PROGBITS section type
+                                  * COFF module: IMAGE_SCN_CNT_INITIALIZED_DATA section type
+                                  */
+    itt_section_type_text        /* notifies that the section contains executable code. These are the relevant section types and the modules that contain them:
+                                  * ELF module:  SHT_PROGBITS section type
+                                  * COFF module: IMAGE_SCN_CNT_CODE section type
+                                  */
+} __itt_section_type;
+/** @endcond */
+
+/**
+ * @hideinitializer
+ * @brief bit-mask, detects a section attribute that indicates whether a section can be executed as code:
+ * These are the relevant section attributes and the modules that contain them:
+ * ELF module:  PF_X section attribute
+ * COFF module: IMAGE_SCN_MEM_EXECUTE attribute
+ */
+#define __itt_section_exec 0x20000000
+
+/**
+ * @hideinitializer
+ * @brief bit-mask, detects a section attribute that indicates whether a section can be read.
+ * These are the relevant section attributes and the modules that contain them:
+ * ELF module:  PF_R attribute
+ * COFF module: IMAGE_SCN_MEM_READ attribute
+ */
+#define __itt_section_read 0x40000000
+
+/**
+ * @hideinitializer
+ * @brief bit-mask, detects a section attribute that indicates whether a section can be written to.
+ * These are the relevant section attributes and the modules that contain them:
+ * ELF module:  PF_W attribute
+ * COFF module: IMAGE_SCN_MEM_WRITE attribute
+ */
+#define __itt_section_write 0x80000000
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_section_info
+{
+    const char* name;                 /*!< Section name in UTF8 */
+    __itt_section_type type;          /*!< Section content and semantics description */
+    size_t flags;                     /*!< Section bit flags that describe attributes using bit mask
+                                       * Zero if disabled, non-zero if enabled
+                                       */
+    void* start_addr;                 /*!< Section load(relocated) start address */
+    size_t size;                      /*!< Section size */
+    size_t file_offset;               /*!< Section file offset */
+} __itt_section_info;
+
+#pragma pack(pop)
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_module_object
+{
+    unsigned int version;                 /*!< API version*/
+    __itt_id module_id;                   /*!< Unique identifier. This is unchanged for sections that belong to the same module */
+    __itt_module_type module_type;        /*!< Binary module format */
+    const char* module_name;              /*!< Unique module name or path to module in UTF8
+                                           * Contains module name when module_buffer and module_size exist
+                                           * Contains module path when module_buffer and module_size are absent
+                                           * module_name remains the same for a given module_id
+                                           */
+    void* module_buffer;                  /*!< Module buffer content */
+    size_t module_size;                   /*!< Module buffer size */
+                                          /*!< If module_buffer and module_size exist, the binary module is dumped onto the system.
+                                           * If module_buffer and module_size do not exist,
+                                           * the binary module exists on the system already.
+                                           * The module_name parameter contains the path to the module.
+                                           */
+    __itt_section_info* section_array;    /*!< Reference to section information */
+    size_t section_number;                /*!< Presumably the number of entries in section_array - TODO confirm */
+} __itt_module_object;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Load module content and its loaded(relocated) sections.
+ * This API is useful to save a module, or specify its location on the system and report information about loaded sections.
+ * The target module is saved on the system if module buffer content and size are available.
+ * If module buffer content and size are unavailable, the module name contains the path to the existing binary module.
+ * @param[in] module_obj - provides module and section information, along with unique module identifiers (name,module ID)
+ * which bind the binary module to particular sections.
+ */
+void ITTAPI __itt_module_load_with_sections(__itt_module_object* module_obj);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, module_load_with_sections,  (__itt_module_object* module_obj))
+#define __itt_module_load_with_sections     ITTNOTIFY_VOID(module_load_with_sections)
+#define __itt_module_load_with_sections_ptr ITTNOTIFY_NAME(module_load_with_sections)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_module_load_with_sections(module_obj)
+#define __itt_module_load_with_sections_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_module_load_with_sections_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Unload a module and its loaded(relocated) sections.
+ * This API notifies that the module and its sections were unloaded.
+ * @param[in] module_obj - provides module and sections information, along with unique module identifiers (name,module ID)
+ * which bind the binary module to particular sections.
+ */
+void ITTAPI __itt_module_unload_with_sections(__itt_module_object* module_obj);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, module_unload_with_sections,  (__itt_module_object* module_obj))
+#define __itt_module_unload_with_sections     ITTNOTIFY_VOID(module_unload_with_sections)
+#define __itt_module_unload_with_sections_ptr ITTNOTIFY_NAME(module_unload_with_sections)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_module_unload_with_sections(module_obj)
+#define __itt_module_unload_with_sections_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_module_unload_with_sections_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_histogram
+{
+    const __itt_domain* domain;      /*!< Domain of the histogram*/
+    const char* nameA;               /*!< Name of the histogram */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW;            /*!< Wide-character name of the histogram */
+#else  /* UNICODE || _UNICODE */
+    void* nameW;                     /*!< Same slot as above, typed void* when UNICODE/_UNICODE is not defined */
+#endif /* UNICODE || _UNICODE */
+    __itt_metadata_type x_type;     /*!< Type of the histogram X axis */
+    __itt_metadata_type y_type;     /*!< Type of the histogram Y axis */
+    int   extra1;                   /*!< Reserved to the runtime */
+    void* extra2;                   /*!< Reserved to the runtime */
+    struct ___itt_histogram* next;  /*!< Pointer to another histogram; presumably the next node of a runtime-owned list - TODO confirm */
+}  __itt_histogram;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Create a typed histogram instance with given name/domain.
+ * @param[in] domain The domain controlling the call.
+ * @param[in] name   The name of the histogram.
+ * @param[in] x_type The type of the X axis in histogram (may be 0 to calculate batch statistics).
+ * @param[in] y_type The type of the Y axis in histogram.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_histogram* ITTAPI __itt_histogram_createA(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type);
+__itt_histogram* ITTAPI __itt_histogram_createW(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_histogram_create     __itt_histogram_createW
+#  define __itt_histogram_create_ptr __itt_histogram_createW_ptr
+#else /* UNICODE */
+#  define __itt_histogram_create     __itt_histogram_createA
+#  define __itt_histogram_create_ptr __itt_histogram_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_histogram* ITTAPI __itt_histogram_create(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_histogram*, histogram_createA, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type))
+ITT_STUB(ITTAPI, __itt_histogram*, histogram_createW, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_histogram*, histogram_create, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_histogram_createA     ITTNOTIFY_DATA(histogram_createA)
+#define __itt_histogram_createA_ptr ITTNOTIFY_NAME(histogram_createA)
+#define __itt_histogram_createW     ITTNOTIFY_DATA(histogram_createW)
+#define __itt_histogram_createW_ptr ITTNOTIFY_NAME(histogram_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_histogram_create     ITTNOTIFY_DATA(histogram_create)
+#define __itt_histogram_create_ptr ITTNOTIFY_NAME(histogram_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_histogram_createA(domain, name, x_type, y_type) (__itt_histogram*)0
+#define __itt_histogram_createA_ptr 0
+#define __itt_histogram_createW(domain, name, x_type, y_type) (__itt_histogram*)0
+#define __itt_histogram_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_histogram_create(domain, name, x_type, y_type) (__itt_histogram*)0
+#define __itt_histogram_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_histogram_createA_ptr 0
+#define __itt_histogram_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_histogram_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Submit statistics for a histogram instance.
+ * @param[in] hist    Pointer to the histogram instance to which the histogram statistic is to be dumped.
+ * @param[in] length  The number of elements in dumped axis data array.
+ * @param[in] x_data  The X axis dumped data itself (may be NULL to calculate batch statistics).
+ * @param[in] y_data  The Y axis dumped data itself.
+*/
+void ITTAPI __itt_histogram_submit(__itt_histogram* hist, size_t length, void* x_data, void* y_data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, histogram_submit, (__itt_histogram* hist, size_t length, void* x_data, void* y_data))
+#define __itt_histogram_submit     ITTNOTIFY_VOID(histogram_submit)
+#define __itt_histogram_submit_ptr ITTNOTIFY_NAME(histogram_submit)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_histogram_submit(hist, length, x_data, y_data)
+#define __itt_histogram_submit_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_histogram_submit_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+
+/**
+* @brief function returns the current collection state
+* @return collection state as an enum __itt_collection_state
+*/
+__itt_collection_state __itt_get_collection_state(void);
+
+/**
+* @brief function releases resources allocated by ITT API static part
+* this API should be called from the library destructor
+* @return void
+*/
+void __itt_release_resources(void);
+/** @endcond */
+
+/**
+ * @brief Create a typed counter with given domain pointer, string name and counter type
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_createA_v3(const __itt_domain* domain, const char* name, __itt_metadata_type type);
+__itt_counter ITTAPI __itt_counter_createW_v3(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_counter_create_v3     __itt_counter_createW_v3
+#  define __itt_counter_create_v3_ptr __itt_counter_createW_v3_ptr
+#else /* UNICODE */
+#  define __itt_counter_create_v3     __itt_counter_createA_v3
+#  define __itt_counter_create_v3_ptr __itt_counter_createA_v3_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_counter ITTAPI __itt_counter_create_v3(const __itt_domain* domain, const char* name, __itt_metadata_type type);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA_v3, (const __itt_domain* domain, const char* name, __itt_metadata_type type))
+ITT_STUB(ITTAPI, __itt_counter, counter_createW_v3, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_v3,  (const __itt_domain* domain, const char* name, __itt_metadata_type type))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA_v3     ITTNOTIFY_DATA(counter_createA_v3)
+#define __itt_counter_createA_v3_ptr ITTNOTIFY_NAME(counter_createA_v3)
+#define __itt_counter_createW_v3     ITTNOTIFY_DATA(counter_createW_v3)
+#define __itt_counter_createW_v3_ptr ITTNOTIFY_NAME(counter_createW_v3)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_v3     ITTNOTIFY_DATA(counter_create_v3)
+#define __itt_counter_create_v3_ptr ITTNOTIFY_NAME(counter_create_v3)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA_v3(domain, name, type) (__itt_counter)0
+#define __itt_counter_createA_v3_ptr 0
+#define __itt_counter_createW_v3(domain, name, type) (__itt_counter)0
+#define __itt_counter_createW_v3_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_v3(domain, name, type) (__itt_counter)0
+#define __itt_counter_create_v3_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA_v3_ptr 0
+#define __itt_counter_createW_v3_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_v3_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the counter value api
+ */
+void ITTAPI __itt_counter_set_value_v3(__itt_counter counter, void *value_ptr);
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_set_value_v3, (__itt_counter counter, void *value_ptr))
+#define __itt_counter_set_value_v3     ITTNOTIFY_VOID(counter_set_value_v3)
+#define __itt_counter_set_value_v3_ptr ITTNOTIFY_NAME(counter_set_value_v3)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_set_value_v3(counter, value_ptr)
+#define __itt_counter_set_value_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_set_value_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief describes the type of context metadata
+*/
+typedef enum {
+    __itt_context_unknown = 0,              /*!< Undefined type */
+    __itt_context_nameA,                    /*!< ASCII string char* type */
+    __itt_context_nameW,                    /*!< Unicode string wchar_t* type */
+    __itt_context_deviceA,                  /*!< ASCII string char* type */
+    __itt_context_deviceW,                  /*!< Unicode string wchar_t* type */
+    __itt_context_unitsA,                   /*!< ASCII string char* type */
+    __itt_context_unitsW,                   /*!< Unicode string wchar_t* type */
+    __itt_context_pci_addrA,                /*!< ASCII string char* type */
+    __itt_context_pci_addrW,                /*!< Unicode string wchar_t* type */
+    __itt_context_tid,                      /*!< Unsigned 64-bit integer type */
+    __itt_context_max_val,                  /*!< Unsigned 64-bit integer type */
+    __itt_context_bandwidth_flag,           /*!< Unsigned 64-bit integer type */
+    __itt_context_latency_flag,             /*!< Unsigned 64-bit integer type */
+    __itt_context_occupancy_flag,           /*!< Unsigned 64-bit integer type */
+    __itt_context_on_thread_flag,           /*!< Unsigned 64-bit integer type */
+    __itt_context_is_abs_val_flag,          /*!< Unsigned 64-bit integer type */
+    __itt_context_cpu_instructions_flag,    /*!< Unsigned 64-bit integer type */
+    __itt_context_cpu_cycles_flag           /*!< Unsigned 64-bit integer type */
+} __itt_context_type;
+
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_context_name __itt_context_nameW
+#  define __itt_context_device __itt_context_deviceW
+#  define __itt_context_units __itt_context_unitsW
+#  define __itt_context_pci_addr __itt_context_pci_addrW
+#else  /* UNICODE || _UNICODE */
+#  define __itt_context_name __itt_context_nameA
+#  define __itt_context_device __itt_context_deviceA
+#  define __itt_context_units __itt_context_unitsA
+#  define __itt_context_pci_addr __itt_context_pci_addrA
+#endif /* UNICODE || _UNICODE */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_context_metadata
+{
+    __itt_context_type type;    /*!< Type of the context metadata value */
+    void* value;                /*!< Pointer to context metadata value itself */
+}  __itt_context_metadata;
+
+#pragma pack(pop)
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_counter_metadata
+{
+    __itt_counter counter;              /*!< Associated context metadata counter */
+    __itt_context_type type;            /*!< Type of the context metadata value */
+    const char* str_valueA;             /*!< String context metadata value */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* str_valueW;
+#else  /* UNICODE || _UNICODE */
+    void* str_valueW;
+#endif /* UNICODE || _UNICODE */
+    unsigned long long value;           /*!< Numeric context metadata value */
+    int   extra1;                       /*!< Reserved to the runtime */
+    void* extra2;                       /*!< Reserved to the runtime */
+    struct ___itt_counter_metadata* next;
+}  __itt_counter_metadata;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Bind context metadata to counter instance
+ * @param[in] counter   Pointer to the counter instance to which the context metadata is to be associated.
+ * @param[in] length    The number of elements in context metadata array.
+ * @param[in] metadata  The context metadata itself.
+*/
+void ITTAPI __itt_bind_context_metadata_to_counter(__itt_counter counter, size_t length, __itt_context_metadata* metadata);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, bind_context_metadata_to_counter, (__itt_counter counter, size_t length, __itt_context_metadata* metadata))
+#define __itt_bind_context_metadata_to_counter     ITTNOTIFY_VOID(bind_context_metadata_to_counter)
+#define __itt_bind_context_metadata_to_counter_ptr ITTNOTIFY_NAME(bind_context_metadata_to_counter)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_bind_context_metadata_to_counter(counter, length, metadata)
+#define __itt_bind_context_metadata_to_counter_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_bind_context_metadata_to_counter_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */

 #ifdef __cplusplus
 }
@ -4005,7 +4547,7 @@ ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void))
 /** @endcond */

 /**
- * @brief Destroy the inforamtion about stitch point identified by the pointer previously returned by __itt_stack_caller_create()
+ * @brief Destroy the information about stitch point identified by the pointer previously returned by __itt_stack_caller_create()
 */
 void ITTAPI __itt_stack_caller_destroy(__itt_caller id);

--- a/3rdparty/ittnotify/include/jitprofiling.h
+++ b/3rdparty/ittnotify/include/jitprofiling.h
@ -1,60 +1,8 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/

 #ifndef __JITPROFILING_H__
 #define __JITPROFILING_H__
@ -66,7 +14,7 @@
 * generated code that can be used by performance tools. The user inserts
 * calls in the code generator to report information before JIT-compiled
 * code goes to execution. This information is collected at runtime and used
- * by tools like Intel(R) VTune(TM) Amplifier to display performance metrics
+ * by tools like Intel(R) VTune(TM) Profiler to display performance metrics
 * associated with JIT-compiled code.
 *
 * These APIs can be used to\n
@ -97,16 +45,16 @@
 *  * Expected behavior:
 *    * If any iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED event overwrites an
 *      already reported method, then such a method becomes invalid and its
- *      memory region is treated as unloaded. VTune Amplifier displays the metrics
+ *      memory region is treated as unloaded. VTune Profiler displays the metrics
 *      collected by the method until it is overwritten.
 *    * If supplied line number information contains multiple source lines for
- *      the same assembly instruction (code location), then VTune Amplifier picks up
+ *      the same assembly instruction (code location), then VTune Profiler picks up
 *      the first line number.
 *    * Dynamically generated code can be associated with a module name.
 *      Use the iJIT_Method_Load_V2 structure.\n
 *      Clarification of some cases:
 *        * If you register a function with the same method ID multiple times,
- *          specifying different module names, then the VTune Amplifier picks up
+ *          specifying different module names, then the VTune Profiler picks up
 *          the module name registered first. If you want to distinguish the same
 *          function between different JIT engines, supply different method IDs for
 *          each function. Other symbolic information (for example, source file)
@ -143,18 +91,18 @@
 *        belonging to the same method. Symbolic information (method name,
 *        source file name) will be taken from the first notification, and all
 *        subsequent notifications with the same method ID will be processed
- *        only for line number table information. So, the VTune Amplifier will map
+ *        only for line number table information. So, the VTune Profiler will map
 *        samples to a source line using the line number table from the current
 *        notification while taking the source file name from the very first one.\n
 *        Clarification of some cases:\n
 *          * If you register a second code region with a different source file
 *          name and the same method ID, then this information will be saved and
 *          will not be considered as an extension of the first code region, but
- *          VTune Amplifier will use the source file of the first code region and map
+ *          VTune Profiler will use the source file of the first code region and map
 *          performance metrics incorrectly.
 *          * If you register a second code region with the same source file as
 *          for the first region and the same method ID, then the source file will be
- *          discarded but VTune Amplifier will map metrics to the source file correctly.
+ *          discarded but VTune Profiler will map metrics to the source file correctly.
 *          * If you register a second code region with a null source file and
 *          the same method ID, then provided line number info will be associated
 *          with the source file of the first code region.
@ -293,7 +241,7 @@ typedef enum _iJIT_IsProfilingActiveFlags
 * @brief Description of a single entry in the line number information of a code region.
 * @details A table of line number entries gives information about how the reported code region
 * is mapped to source file.
- * Intel(R) VTune(TM) Amplifier uses line number information to attribute
+ * Intel(R) VTune(TM) Profiler uses line number information to attribute
 * the samples (virtual address) to a line number. \n
 * It is acceptable to report different code addresses for the same source line:
 * @code
@ -304,7 +252,7 @@ typedef enum _iJIT_IsProfilingActiveFlags
 *      18      1
 *      21      30
 *
- *  VTune Amplifier constructs the following table using the client data
+ *  VTune Profiler constructs the following table using the client data
 *
 *   Code subrange  Line number
 *      0-1             2
@ -428,7 +376,7 @@ typedef struct _iJIT_Method_Load_V2

    char* module_name; /**<\brief Module name. Can be NULL.
                           The module name can be useful for distinguishing among
-                           different JIT engines. VTune Amplifier will display
+                           different JIT engines. VTune Profiler will display
                           reported methods grouped by specific module. */

 } *piJIT_Method_Load_V2, iJIT_Method_Load_V2;
@ -480,7 +428,7 @@ typedef struct _iJIT_Method_Load_V3

    char* module_name; /**<\brief Module name. Can be NULL.
                        *  The module name can be useful for distinguishing among
-                        *  different JIT engines. VTune Amplifier will display
+                        *  different JIT engines. VTune Profiler will display
                        *  reported methods grouped by specific module. */

    iJIT_CodeArchitecture module_arch; /**<\brief Architecture of the method's code region.
@ -490,9 +438,9 @@ typedef struct _iJIT_Method_Load_V3
                                        *  engine generates 64-bit code.
                                        *
                                        *  If JIT engine reports both 32-bit and 64-bit types
-                                        *  of methods then VTune Amplifier splits the methods
+                                        *  of methods then VTune Profiler splits the methods
                                        *  with the same module name but with different
-                                        *  architectures in two different modules. VTune Amplifier
+                                        *  architectures in two different modules. VTune Profiler
                                        *  modifies the original name provided with a 64-bit method
                                        *  version by ending it with '(64)' */

@ -561,9 +509,9 @@ typedef enum _iJIT_SegmentType
    iJIT_CT_CODE,           /**<\brief Executable code. */

    iJIT_CT_DATA,           /**<\brief Data (not executable code).
-                             * VTune Amplifier uses the format string
+                             * VTune Profiler uses the format string
                             * (see iJIT_Method_Update) to represent
-                             * this data in the VTune Amplifier GUI */
+                             * this data in the VTune Profiler GUI */

    iJIT_CT_KEEP,           /**<\brief Use the previous markup for the trace.
                             * Can be used for the following
@ -580,11 +528,11 @@ typedef enum _iJIT_SegmentType
 * structure to describe the update of the content within a JIT-compiled method,
 * use iJVM_EVENT_TYPE_METHOD_UPDATE_V2 as an event type to report it.
 *
- * On the first Update event, VTune Amplifier copies the original code range reported by
+ * On the first Update event, VTune Profiler copies the original code range reported by
 * the iJVM_EVENT_TYPE_METHOD_LOAD event, then modifies it with the supplied bytes and
- * adds the modified range to the original method. For next update events, VTune Amplifier
+ * adds the modified range to the original method. For next update events, VTune Profiler
 * does the same but it uses the latest modified version of a code region for update.
- * Eventually, VTune Amplifier GUI displays multiple code ranges for the method reported by
+ * Eventually, VTune Profiler GUI displays multiple code ranges for the method reported by
 * the iJVM_EVENT_TYPE_METHOD_LOAD event.
 * Notes:
 * - Multiple update events with different types for the same trace are allowed
@ -673,7 +621,7 @@ iJIT_IsProfilingActiveFlags JITAPI iJIT_IsProfilingActive(void);
 * @brief Reports infomation about JIT-compiled code to the agent.
 *
 * The reported information is used to attribute samples obtained from any
- * Intel(R) VTune(TM) Amplifier collector. This API needs to be called
+ * Intel(R) VTune(TM) Profiler collector. This API needs to be called
 * after JIT compilation and before the first entry into the JIT-compiled
 * code.
 *
--- a/3rdparty/ittnotify/include/legacy/ittnotify.h
+++ b/3rdparty/ittnotify/include/legacy/ittnotify.h
@ -1,60 +1,8 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/
 #ifndef _LEGACY_ITTNOTIFY_H_
 #define _LEGACY_ITTNOTIFY_H_

@ -80,6 +28,10 @@
 #  define ITT_OS_FREEBSD   4
 #endif /* ITT_OS_FREEBSD */

+#ifndef ITT_OS_OPENBSD
+#  define ITT_OS_OPENBSD   5
+#endif /* ITT_OS_OPENBSD */
+
 #ifndef ITT_OS
 #  if defined WIN32 || defined _WIN32
 #    define ITT_OS ITT_OS_WIN
@ -87,6 +39,8 @@
 #    define ITT_OS ITT_OS_MAC
 #  elif defined( __FreeBSD__ )
 #    define ITT_OS ITT_OS_FREEBSD
+#  elif defined( __OpenBSD__ )
+#    define ITT_OS ITT_OS_OPENBSD
 #  else
 #    define ITT_OS ITT_OS_LINUX
 #  endif
@ -108,6 +62,10 @@
 #  define ITT_PLATFORM_FREEBSD 4
 #endif /* ITT_PLATFORM_FREEBSD */

+#ifndef ITT_PLATFORM_OPENBSD
+#  define ITT_PLATFORM_OPENBSD 5
+#endif /* ITT_PLATFORM_OPENBSD */
+
 #ifndef ITT_PLATFORM
 #  if ITT_OS==ITT_OS_WIN
 #    define ITT_PLATFORM ITT_PLATFORM_WIN
@ -115,6 +73,8 @@
 #    define ITT_PLATFORM ITT_PLATFORM_MAC
 #  elif ITT_OS==ITT_OS_FREEBSD
 #    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  elif ITT_OS==ITT_OS_OPENBSD
+#    define ITT_PLATFORM ITT_PLATFORM_OPENBSD
 #  else
 #    define ITT_PLATFORM ITT_PLATFORM_POSIX
 #  endif
@ -167,7 +127,12 @@

 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 /* use __forceinline (VC++ specific) */
-#define ITT_INLINE           __forceinline
+#if defined(__MINGW32__) && !defined(__cplusplus)
+#define ITT_INLINE           static __inline__ __attribute__((__always_inline__,__gnu_inline__))
+#else  /* !(__MINGW32__ && !__cplusplus) */
+#define ITT_INLINE           static __forceinline
+#endif /* __MINGW32__ && !__cplusplus */
+
 #define ITT_INLINE_ATTRIBUTE /* nothing */
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 /*
@ -219,20 +184,20 @@
 #define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
 #define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)

-#define ITTNOTIFY_VOID_D0(n,d)       (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_VOID_D1(n,d,x)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_VOID_D2(n,d,x,y)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
-#define ITTNOTIFY_DATA_D0(n,d)       (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_DATA_D1(n,d,x)     (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_DATA_D2(n,d,x,y)   (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_VOID_D0(n,d)       ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x)     ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y)   ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) ((d) == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d)       ((d) == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x)     ((d) == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y)   ((d) == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) ((d) == NULL) ? 0 : (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     ((d) == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   ((d) == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) ((d) == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)

 #ifdef ITT_STUB
 #undef ITT_STUB
@ -269,7 +234,7 @@ extern "C" {
 *     only pauses tracing and analyzing memory access.
 *     It does not pause tracing or analyzing threading APIs.
 *   .
- * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ * - Intel(R) VTune(TM) Profiler:
 *   - Does continue to record when new threads are started.
 *   .
 * - Other effects:
@ -1005,9 +970,9 @@ ITT_STUB(ITTAPI, __itt_frame, frame_create,  (const char *domain))
 #endif /* INTEL_NO_MACRO_BODY */
 /** @endcond */

-/** @brief Record an frame begin occurrence. */
+/** @brief Record a frame begin occurrence. */
 void ITTAPI __itt_frame_begin(__itt_frame frame);
-/** @brief Record an frame end occurrence. */
+/** @brief Record a frame end occurrence. */
 void ITTAPI __itt_frame_end  (__itt_frame frame);

 /** @cond exclude_from_documentation */
--- a/3rdparty/ittnotify/include/libittnotify.h
+++ b/3rdparty/ittnotify/include/libittnotify.h
@ -1,60 +1,8 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/

 #ifndef _LIBITTNOTIFY_H_
 #define _LIBITTNOTIFY_H_
--- a/3rdparty/ittnotify/include/llvm_jit_event_listener.hpp
+++ b/3rdparty/ittnotify/include/llvm_jit_event_listener.hpp
@ -1,241 +0,0 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
-
-/*
- * This file implements an interface bridge from Low-Level Virtual Machine
- * llvm::JITEventListener to Intel JIT Profiling API.  It passes the function
- * and line information to the appropriate functions in the JIT profiling
- * interface so that any LLVM-based JIT engine can emit the JIT code
- * notifications that the profiler will receive.
- *
- * Usage model:
- *
- * 1. Register the listener implementation instance with the execution engine:
- *
- *    #include <llvm_jit_event_listener.hpp>
- *    ...
- *    ExecutionEngine *TheExecutionEngine;
- *    ...
- *    TheExecutionEngine = EngineBuilder(TheModule).create();
- *    ...
- *    __itt_llvm_jit_event_listener jitListener;
- *    TheExecutionEngine->RegisterJITEventListener(&jitListener);
- *    ...
- *
- * 2. When compiling make sure to add the ITT API include directory to the
- *    compiler include directories, ITT API library directory to the linker
- *    library directories and link with jitprofling static library.
- */
-
-#ifndef __ITT_LLVM_JIT_EVENT_LISTENER_HPP__
-#define __ITT_LLVM_JIT_EVENT_LISTENER_HPP__
-
-#include "jitprofiling.h"
-
-#include <llvm/Function.h>
-#include <llvm/ExecutionEngine/JITEventListener.h>
-#include <llvm/ADT/StringRef.h>
-#include <llvm/Analysis/DebugInfo.h>
-
-#include <map>
-#include <cassert>
-
-// Uncomment the line below to turn on logging to stderr
-#define JITPROFILING_DEBUG_ENABLE
-
-// Some elementary logging support
-#ifdef JITPROFILING_DEBUG_ENABLE
-#include <cstdio>
-#include <cstdarg>
-static void _jit_debug(const char* format, ...)
-{
-    va_list args;
-    va_start(args, format);
-    vfprintf(stderr, format, args);
-    va_end(args);
-}
-// Use the macro as JITDEBUG(("foo: %d", foo_val));
-#define JITDEBUG(x) \
-    do { \
-        _jit_debug("jit-listener: "); \
-        _jit_debug x; \
-    } \
-    while (0)
-#else
-#define JITDEBUG(x)
-#endif
-
-// LLVM JIT event listener, translates the notifications to the JIT profiling
-// API information.
-class __itt_llvm_jit_event_listener : public llvm::JITEventListener
-{
-public:
-    __itt_llvm_jit_event_listener() {}
-
-public:
-    virtual void NotifyFunctionEmitted(const llvm::Function &F,
-        void *Code, size_t Size, const EmittedFunctionDetails &Details)
-    {
-        std::string name = F.getName().str();
-        JITDEBUG(("function jitted:\n"));
-        JITDEBUG(("  addr=0x%08x\n", (int)Code));
-        JITDEBUG(("  name=`%s'\n", name.c_str()));
-        JITDEBUG(("  code-size=%d\n", (int)Size));
-        JITDEBUG(("  line-infos-count=%d\n", Details.LineStarts.size()));
-
-        // The method must not be in the map - the entry must have been cleared
-        // from the map in NotifyFreeingMachineCode in case of rejitting.
-        assert(m_addr2MethodId.find(Code) == m_addr2MethodId.end());
-
-        int mid = iJIT_GetNewMethodID();
-        m_addr2MethodId[Code] = mid;
-
-        iJIT_Method_Load mload;
-        memset(&mload, 0, sizeof mload);
-        mload.method_id = mid;
-
-        // Populate the method size and name information
-        // TODO: The JIT profiling API should have members as const char pointers.
-        mload.method_name = (char*)name.c_str();
-        mload.method_load_address = Code;
-        mload.method_size = (unsigned int)Size;
-
-        // Populate line information now.
-        // From the JIT API documentation it is not quite clear whether the
-        // line information can be given in ranges, so we'll populate it for
-        // every byte of the function, hmm.
-        std::string srcFilePath;
-        std::vector<LineNumberInfo> lineInfos;
-        char *addr = (char*)Code;
-        char *lineAddr = addr;          // Exclusive end point at which current
-                                        // line info changes.
-        const llvm::DebugLoc* loc = 0;  // Current line info
-        int lineIndex = -1;             // Current index into the line info table
-        for (int i = 0; i < Size; ++i, ++addr) {
-            while (addr >= lineAddr) {
-                if (lineIndex >= 0 && lineIndex < Details.LineStarts.size()) {
-                    loc = &Details.LineStarts[lineIndex].Loc;
-                    std::string p = getSrcFilePath(F.getContext(), *loc);
-                    assert(srcFilePath.empty() || p == srcFilePath);
-                    srcFilePath = p;
-                } else {
-                    loc = NULL;
-                }
-                lineIndex++;
-                if (lineIndex >= 0 && lineIndex < Details.LineStarts.size()) {
-                    lineAddr = (char*)Details.LineStarts[lineIndex].Address;
-                } else {
-                    lineAddr = addr + Size;
-                }
-            }
-            if (loc) {
-                int line = loc->getLine();
-                LineNumberInfo info = { i, line };
-                lineInfos.push_back(info);
-                JITDEBUG(("  addr 0x%08x -> line %d\n", addr, line));
-            }
-        }
-        if (!lineInfos.empty()) {
-            mload.line_number_size = lineInfos.size();
-            JITDEBUG(("  translated to %d line infos to JIT", (int)lineInfos.size()));
-            mload.line_number_table = &lineInfos[0];
-            mload.source_file_name = (char*)srcFilePath.c_str();
-        }
-
-        iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &mload);
-    }
-
-    virtual void NotifyFreeingMachineCode(void *OldPtr)
-    {
-        JITDEBUG(("function unjitted\n"));
-        JITDEBUG(("  addr=0x%08x\n", (int)OldPtr));
-        Addr2MethodId::iterator it = m_addr2MethodId.find(OldPtr);
-        assert(it != m_addr2MethodId.end());
-        iJIT_Method_Id mid = { it->second };
-        iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START, &mid);
-        m_addr2MethodId.erase(it);
-    }
-
-private:
-    std::string getSrcFilePath(const llvm::LLVMContext& ctx, const llvm::DebugLoc& loc)
-    {
-        llvm::MDNode* node = loc.getAsMDNode(ctx);
-        llvm::DILocation srcLoc(node);
-        return srcLoc.getDirectory().str() + "/" + srcLoc.getFilename().str();
-    }
-
-private:
-    /// Don't copy
-    __itt_llvm_jit_event_listener(const __itt_llvm_jit_event_listener&);
-    __itt_llvm_jit_event_listener& operator=(const __itt_llvm_jit_event_listener&);
-
-private:
-    typedef std::vector<LineNumberInfo> LineInfoList;
-
-    // The method unload notification in VTune JIT profiling API takes the
-    // method ID, not method address so have to maintain the mapping.  Is
-    // there a more efficient and simple way to do this like attaching the
-    // method ID information somehow to the LLVM function instance?
-    //
-    // TODO: It would be more convenient for the JIT API to take the method
-    // address, not method ID.
-    typedef std::map<const void*, int> Addr2MethodId;
-    Addr2MethodId m_addr2MethodId;
-};
-
-#endif // Header guard
--- a/3rdparty/ittnotify/src/ittnotify/BSD-3-Clause.txt
+++ b/3rdparty/ittnotify/src/ittnotify/BSD-3-Clause.txt
@ -1,7 +1,8 @@
-Copyright (c) 2011, Intel Corporation
-All rights reserved.
+Copyright (c) 2019 Intel Corporation. All rights reserved.
+
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-•	Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/3rdparty/ittnotify/src/ittnotify/GPL-2.0-only.txt
+++ b/3rdparty/ittnotify/src/ittnotify/GPL-2.0-only.txt
@ -1,65 +1,103 @@
-The GNU General Public License (GPL)
+GNU GENERAL PUBLIC LICENSE
 Version 2, June 1991
+
 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-Everyone is permitted to copy and distribute verbatim copies
-of this license document, but changing it is not allowed.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+
+Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+
 Preamble
-The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too.
+
+The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too.
+
 When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things.
+
 To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it.
+
 For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights.
+
 We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software.
+
 Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations.
+
 Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all.
+
 The precise terms and conditions for copying, distribution and modification follow.
+
 TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you".
 Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does.
+
 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program.
 You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee.
+
 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions:
 a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change.
 b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License.
 c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.)
 These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it.
+
 Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program.
+
 In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License.
+
 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following:
 a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,
 b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,
 c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.)
 The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable.
+
 If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code.
+
 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.
 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it.
 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License.
 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program.
 If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances.
+
 It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice.
+
 This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License.
+
 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License.
 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
 Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation.
+
 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally.
 NO WARRANTY
+
 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 END OF TERMS AND CONDITIONS
+
 How to Apply These Terms to Your New Programs
+
 If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.
+
 To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found.
-One line to give the program's name and a brief idea of what it does.
-Copyright (C) <year> <name of author>
+
+<one line to give the program's name and an idea of what it does.>
+Copyright (C) < yyyy> <name of author>
+
 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
+
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
 Also add information on how to contact you by electronic and paper mail.
+
 If the program is interactive, make it output a short notice like this when it starts in an interactive mode:
+
 Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
+
 The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program.
+
 You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names:
+
 Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker.
-signature of Ty Coon, 1 April 1989
-Ty Coon, President of Vice
-This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License.
+
+<signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License.
--- a/3rdparty/ittnotify/src/ittnotify/disable_warnings.h
+++ b/3rdparty/ittnotify/src/ittnotify/disable_warnings.h
@ -1,71 +1,23 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/

 #include "ittnotify_config.h"

 #if ITT_PLATFORM==ITT_PLATFORM_WIN

+#if defined _MSC_VER
+
 #pragma warning (disable: 593)   /* parameter "XXXX" was set but never used                 */
 #pragma warning (disable: 344)   /* typedef name has already been declared (with same type) */
 #pragma warning (disable: 174)   /* expression has no effect                                */
 #pragma warning (disable: 4127)  /* conditional expression is constant                      */
 #pragma warning (disable: 4306)  /* conversion from '?' to '?' of greater size              */

+#endif
+
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

 #if defined __INTEL_COMPILER
--- a/3rdparty/ittnotify/src/ittnotify/ittnotify_config.h
+++ b/3rdparty/ittnotify/src/ittnotify/ittnotify_config.h
@ -1,60 +1,8 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/
 #ifndef _ITTNOTIFY_CONFIG_H_
 #define _ITTNOTIFY_CONFIG_H_

@ -75,6 +23,10 @@
 #  define ITT_OS_FREEBSD   4
 #endif /* ITT_OS_FREEBSD */

+#ifndef ITT_OS_OPENBSD
+#  define ITT_OS_OPENBSD   5
+#endif /* ITT_OS_OPENBSD */
+
 #ifndef ITT_OS
 #  if defined WIN32 || defined _WIN32
 #    define ITT_OS ITT_OS_WIN
@ -82,6 +34,8 @@
 #    define ITT_OS ITT_OS_MAC
 #  elif defined( __FreeBSD__ )
 #    define ITT_OS ITT_OS_FREEBSD
+#  elif defined( __OpenBSD__ )
+#    define ITT_OS ITT_OS_OPENBSD
 #  else
 #    define ITT_OS ITT_OS_LINUX
 #  endif
@ -103,6 +57,10 @@
 #  define ITT_PLATFORM_FREEBSD 4
 #endif /* ITT_PLATFORM_FREEBSD */

+#ifndef ITT_PLATFORM_OPENBSD
+#  define ITT_PLATFORM_OPENBSD 5
+#endif /* ITT_PLATFORM_OPENBSD */
+
 #ifndef ITT_PLATFORM
 #  if ITT_OS==ITT_OS_WIN
 #    define ITT_PLATFORM ITT_PLATFORM_WIN
@ -110,6 +68,8 @@
 #    define ITT_PLATFORM ITT_PLATFORM_MAC
 #  elif ITT_OS==ITT_OS_FREEBSD
 #    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  elif ITT_OS==ITT_OS_OPENBSD
+#    define ITT_PLATFORM ITT_PLATFORM_OPENBSD
 #  else
 #    define ITT_PLATFORM ITT_PLATFORM_POSIX
 #  endif
@ -162,7 +122,12 @@

 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 /* use __forceinline (VC++ specific); MinGW C builds use GCC always_inline/gnu_inline attributes instead */
-#define ITT_INLINE           __forceinline
+#if defined(__MINGW32__) && !defined(__cplusplus)
+#define ITT_INLINE           static __inline__ __attribute__((__always_inline__,__gnu_inline__))
+#else
+#define ITT_INLINE           static __forceinline
+#endif /* __MINGW32__ */
+
 #define ITT_INLINE_ATTRIBUTE /* nothing */
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 /*
@ -188,6 +153,10 @@
 #  define ITT_ARCH_IA32E 2
 #endif /* ITT_ARCH_IA32E */

+#ifndef ITT_ARCH_IA64
+#  define ITT_ARCH_IA64 3
+#endif /* ITT_ARCH_IA64 */
+
 #ifndef ITT_ARCH_ARM
 #  define ITT_ARCH_ARM  4
 #endif /* ITT_ARCH_ARM */
@ -196,9 +165,9 @@
 #  define ITT_ARCH_PPC64  5
 #endif /* ITT_ARCH_PPC64 */

-#ifndef ITT_ARCH_AARCH64  /* 64-bit ARM */
-#  define ITT_ARCH_AARCH64  6
-#endif /* ITT_ARCH_AARCH64 */
+#ifndef ITT_ARCH_ARM64
+#  define ITT_ARCH_ARM64  6
+#endif /* ITT_ARCH_ARM64 */

 #ifndef ITT_ARCH
 #  if defined _M_IX86 || defined __i386__
@ -210,7 +179,7 @@
 #  elif defined _M_ARM || defined __arm__
 #    define ITT_ARCH ITT_ARCH_ARM
 #  elif defined __aarch64__
-#    define ITT_ARCH ITT_ARCH_AARCH64
+#    define ITT_ARCH ITT_ARCH_ARM64
 #  elif defined __powerpc64__
 #    define ITT_ARCH ITT_ARCH_PPC64
 #  endif
@ -239,10 +208,10 @@
 #define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }

 /* Replace with snapshot date YYYYMMDD for promotion build. */
-#define API_VERSION_BUILD    20151119
+#define API_VERSION_BUILD    20250113

 #ifndef API_VERSION_NUM
-#define API_VERSION_NUM 0.0.0
+#define API_VERSION_NUM 3.25.4
 #endif /* API_VERSION_NUM */

 #define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
@ -254,7 +223,11 @@
 typedef HMODULE           lib_t;
 typedef DWORD             TIDT;
 typedef CRITICAL_SECTION  mutex_t;
+#ifdef __cplusplus
+#define MUTEX_INITIALIZER {}
+#else
 #define MUTEX_INITIALIZER { 0 }
+#endif
 #define strong_alias(name, aliasname) /* empty for Windows */
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 #include <dlfcn.h>
@ -282,13 +255,13 @@ typedef pthread_mutex_t   mutex_t;
 #define __itt_mutex_init(mutex)   InitializeCriticalSection(mutex)
 #define __itt_mutex_lock(mutex)   EnterCriticalSection(mutex)
 #define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
+#define __itt_mutex_destroy(mutex) DeleteCriticalSection(mutex)
 #define __itt_load_lib(name)      LoadLibraryA(name)
 #define __itt_unload_lib(handle)  FreeLibrary(handle)
 #define __itt_system_error()      (int)GetLastError()
 #define __itt_fstrcmp(s1, s2)     lstrcmpA(s1, s2)
 #define __itt_fstrnlen(s, l)      strnlen_s(s, l)
 #define __itt_fstrcpyn(s1, b, s2, l) strncpy_s(s1, b, s2, l)
-#define __itt_fstrdup(s)          _strdup(s)
 #define __itt_thread_id()         GetCurrentThreadId()
 #define __itt_thread_yield()      SwitchToThread()
 #ifndef ITT_SIMPLE_INIT
@ -298,6 +271,13 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
 {
    return InterlockedIncrement(ptr);
 }
+/* Atomic compare-and-swap wrapper (Windows): forwards its arguments unchanged
+ * to InterlockedCompareExchange, which stores `exchange` into *ptr only when
+ * *ptr equals `comperand`, and returns the value *ptr held before the call.
+ */
+ITT_INLINE long
+__itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long
+__itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand)
+{
+    return InterlockedCompareExchange(ptr, exchange, comperand);
+}
 #endif /* ITT_SIMPLE_INIT */

 #define DL_SYMBOLS (1)
@ -327,6 +307,7 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
 }
 #define __itt_mutex_lock(mutex)   pthread_mutex_lock(mutex)
 #define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
+#define __itt_mutex_destroy(mutex) pthread_mutex_destroy(mutex)
 #define __itt_load_lib(name)      dlopen(name, RTLD_LAZY)
 #define __itt_unload_lib(handle)  dlclose(handle)
 #define __itt_system_error()      errno
@ -341,10 +322,18 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
 #ifdef SDL_STRNCPY_S
 #define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l)
 #else
-#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, b)
+#define __itt_fstrcpyn(s1, b, s2, l) do {                                   \
+    /* Bounded copy: writes at most l characters of s2 into s1, whose    */ \
+    /* capacity is b, and always NUL-terminates. No-op when b <= 0.      */ \
+    /* Wrapped in do/while(0) so it is a single statement in if/else.    */ \
+    if ((b) > 0) {                                                          \
+        /* 'volatile' is used to suppress the warning that a destination */ \
+        /*  bound depends on the length of the source.                   */ \
+        volatile size_t num_to_copy = (size_t)((b) - 1) < (size_t)(l) ?     \
+                (size_t)((b) - 1) : (size_t)(l);                            \
+        strncpy((s1), (s2), num_to_copy);                                   \
+        (s1)[num_to_copy] = 0;                                              \
+    }                                                                       \
+} while (0)
 #endif /* SDL_STRNCPY_S */

-#define __itt_fstrdup(s)          strdup(s)
 #define __itt_thread_id()         pthread_self()
 #define __itt_thread_yield()      sched_yield()
 #if ITT_ARCH==ITT_ARCH_IA64
@ -360,12 +349,12 @@ ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
 {
    long result;
    __asm__ __volatile__("lock\nxadd %0,%1"
-                          : "=r"(result),"=m"(*(int*)ptr)
-                          : "0"(addend), "m"(*(int*)ptr)
+                          : "=r"(result),"=m"(*(volatile int*)ptr)
+                          : "0"(addend), "m"(*(volatile int*)ptr)
                          : "memory");
    return result;
 }
-#elif ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_AARCH64 || ITT_ARCH==ITT_ARCH_PPC64
+#else
 #define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val)
 #endif /* ITT_ARCH==ITT_ARCH_IA64 */
 #ifndef ITT_SIMPLE_INIT
@ -375,6 +364,13 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
 {
    return __TBB_machine_fetchadd4(ptr, 1) + 1L;
 }
+/* Atomic compare-and-swap wrapper (GCC/Clang builtins): stores `exchange`
+ * into *ptr only when *ptr equals `comperand`, and returns the value *ptr
+ * held before the call. Parameter order mirrors InterlockedCompareExchange.
+ */
+ITT_INLINE long
+__itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long
+__itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand)
+{
+    /* The builtin's signature is (ptr, oldval, newval): the expected value
+     * (comperand) comes first and the replacement (exchange) second. */
+    return __sync_val_compare_and_swap(ptr, comperand, exchange);
+}
 #endif /* ITT_SIMPLE_INIT */

 void* dlopen(const char*, int) __attribute__((weak));
@ -394,10 +390,20 @@ pthread_t pthread_self(void) __attribute__((weak));

 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

-typedef enum {
-    __itt_collection_normal = 0,
-    __itt_collection_paused = 1
-} __itt_collection_state;
+/* strdup() is not included into C99 which results in a compiler warning about
+ * implicitly declared symbol. To avoid the issue strdup is implemented
+ * manually.
+ *
+ * __itt_fstrdup(s, new_s): when s is not NULL, allocates a buffer with
+ * malloc() and copies into it the number of characters reported by
+ * __itt_fstrnlen(s, ITT_STRDUP_MAX_STRING_SIZE), storing the buffer pointer
+ * in new_s (NULL if the allocation fails). When s is NULL, new_s is left
+ * untouched, so callers are expected to initialize it beforehand.
+ */
+#define ITT_STRDUP_MAX_STRING_SIZE 4096
+#define __itt_fstrdup(s, new_s) do {                                        \
+    if (s != NULL) {                                                        \
+        size_t s_len = __itt_fstrnlen(s, ITT_STRDUP_MAX_STRING_SIZE);       \
+        new_s = (char *)malloc(s_len + 1);                                  \
+        if (new_s != NULL) {                                                \
+            __itt_fstrcpyn(new_s, s_len + 1, s, s_len);                     \
+        }                                                                   \
+    }                                                                       \
+} while(0)

 typedef enum {
    __itt_thread_normal  = 0,
@ -463,6 +469,10 @@ typedef struct __itt_counter_info

 struct ___itt_domain;
 struct ___itt_string_handle;
+struct ___itt_histogram;
+struct ___itt_counter_metadata;
+
+#include "ittnotify.h"

 typedef struct ___itt_global
 {
@ -484,7 +494,10 @@ typedef struct ___itt_global
    struct ___itt_domain*  domain_list;
    struct ___itt_string_handle* string_list;
    __itt_collection_state state;
-    __itt_counter_info_t* counter_list;
+    __itt_counter_info_t*  counter_list;
+    unsigned int           ipt_collect_events;
+    struct ___itt_histogram* histogram_list;
+    struct ___itt_counter_metadata* counter_metadata_list;
 } __itt_global;

 #pragma pack(pop)
@ -510,7 +523,9 @@ typedef struct ___itt_global
    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
    if (h != NULL) { \
        h->tid    = t; \
-        h->nameA  = n ? __itt_fstrdup(n) : NULL; \
+        char *n_copy = NULL; \
+        __itt_fstrdup(n, n_copy); \
+        h->nameA  = n_copy; \
        h->nameW  = NULL; \
        h->state  = s; \
        h->extra1 = 0;    /* reserved */ \
@ -543,7 +558,9 @@ typedef struct ___itt_global
    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
    if (h != NULL) { \
        h->flags  = 1;    /* domain is enabled by default */ \
-        h->nameA  = name ? __itt_fstrdup(name) : NULL; \
+        char *name_copy = NULL; \
+        __itt_fstrdup(name, name_copy); \
+        h->nameA  = name_copy; \
        h->nameW  = NULL; \
        h->extra1 = 0;    /* reserved */ \
        h->extra2 = NULL; /* reserved */ \
@ -573,7 +590,9 @@ typedef struct ___itt_global
 #define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \
    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
    if (h != NULL) { \
-        h->strA   = name ? __itt_fstrdup(name) : NULL; \
+        char *name_copy = NULL; \
+        __itt_fstrdup(name, name_copy); \
+        h->strA  = name_copy; \
        h->strW   = NULL; \
        h->extra1 = 0;    /* reserved */ \
        h->extra2 = NULL; /* reserved */ \
@ -591,7 +610,7 @@ typedef struct ___itt_global
        h->nameA   = NULL; \
        h->nameW   = name ? _wcsdup(name) : NULL; \
        h->domainA   = NULL; \
-        h->domainW   = name ? _wcsdup(domain) : NULL; \
+        h->domainW   = domain ? _wcsdup(domain) : NULL; \
        h->type = type; \
        h->index = 0; \
        h->next   = NULL; \
@ -605,9 +624,13 @@ typedef struct ___itt_global
 #define NEW_COUNTER_A(gptr,h,h_tail,name,domain,type) { \
    h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
    if (h != NULL) { \
-        h->nameA   = name ? __itt_fstrdup(name) : NULL; \
+        char *name_copy = NULL; \
+        __itt_fstrdup(name, name_copy); \
+        h->nameA  = name_copy; \
        h->nameW   = NULL; \
-        h->domainA   = domain ? __itt_fstrdup(domain) : NULL; \
+        char *domain_copy = NULL; \
+        __itt_fstrdup(domain, domain_copy); \
+        h->domainA  = domain_copy; \
        h->domainW   = NULL; \
        h->type = type; \
        h->index = 0; \
@ -619,4 +642,98 @@ typedef struct ___itt_global
    } \
 }

+/* Allocates a histogram node with a wide-character name (duplicated with
+ * _wcsdup), fills its fields and links it: after h_tail, or as the head of
+ * (gptr)->histogram_list when h_tail is NULL. h is NULL if malloc fails.
+ * Parameter names deliberately differ from the struct member names so the
+ * preprocessor does not rewrite the member on the left of each assignment.
+ */
+#define NEW_HISTOGRAM_W(gptr,h,h_tail,domain_arg,name,x_type_arg,y_type_arg) { \
+    h = (__itt_histogram*)malloc(sizeof(__itt_histogram)); \
+    if (h != NULL) { \
+        h->domain = (domain_arg); \
+        h->nameA  = NULL; \
+        h->nameW  = name ? _wcsdup(name) : NULL; \
+        h->x_type = (x_type_arg); \
+        h->y_type = (y_type_arg); \
+        h->extra1 = 0; \
+        h->extra2 = NULL; \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->histogram_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+/* Allocates a histogram node with a narrow-character name (duplicated with
+ * __itt_fstrdup), fills its fields and links it: after h_tail, or as the head
+ * of (gptr)->histogram_list when h_tail is NULL. h is NULL if malloc fails.
+ * Parameter names deliberately differ from the struct member names so the
+ * preprocessor does not rewrite the member on the left of each assignment.
+ */
+#define NEW_HISTOGRAM_A(gptr,h,h_tail,domain_arg,name,x_type_arg,y_type_arg) { \
+    h = (__itt_histogram*)malloc(sizeof(__itt_histogram)); \
+    if (h != NULL) { \
+        h->domain = (domain_arg); \
+        char *name_copy = NULL; \
+        __itt_fstrdup(name, name_copy); \
+        h->nameA  = name_copy; \
+        h->nameW  = NULL; \
+        h->x_type = (x_type_arg); \
+        h->y_type = (y_type_arg); \
+        h->extra1 = 0; \
+        h->extra2 = NULL; \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->histogram_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+/* Allocates a counter-metadata node carrying a numeric value (both string
+ * fields are NULL) and links it: after h_tail, or as the head of
+ * (gptr)->counter_metadata_list when h_tail is NULL. h is NULL if malloc fails.
+ * Parameter names deliberately differ from the struct member names so the
+ * preprocessor does not rewrite the member on the left of each assignment.
+ */
+#define NEW_COUNTER_METADATA_NUM(gptr,h,h_tail,counter_arg,type_arg,value_arg) { \
+    h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \
+    if (h != NULL) { \
+        h->counter = (counter_arg); \
+        h->type = (type_arg); \
+        h->str_valueA = NULL; \
+        h->str_valueW = NULL; \
+        h->value = (value_arg); \
+        h->extra1 = 0; \
+        h->extra2 = NULL; \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->counter_metadata_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+/* Allocates a counter-metadata node carrying a narrow-character string
+ * (duplicated with __itt_fstrdup; value is set to 0) and links it: after
+ * h_tail, or as the head of (gptr)->counter_metadata_list when h_tail is NULL.
+ * h is NULL if malloc fails.
+ * Parameter names deliberately differ from the struct member names so the
+ * preprocessor does not rewrite the member on the left of each assignment.
+ */
+#define NEW_COUNTER_METADATA_STR_A(gptr,h,h_tail,counter_arg,type_arg,str_arg) { \
+    h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \
+    if (h != NULL) { \
+        h->counter = (counter_arg); \
+        h->type = (type_arg); \
+        char *str_value_copy = NULL; \
+        __itt_fstrdup(str_arg, str_value_copy); \
+        h->str_valueA = str_value_copy; \
+        h->str_valueW = NULL; \
+        h->value = 0; \
+        h->extra1 = 0; \
+        h->extra2 = NULL; \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->counter_metadata_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+/* Allocates a counter-metadata node carrying a wide-character string
+ * (duplicated with _wcsdup; value is set to 0) and links it: after h_tail,
+ * or as the head of (gptr)->counter_metadata_list when h_tail is NULL.
+ * h is NULL if malloc fails.
+ * Parameter names deliberately differ from the struct member names so the
+ * preprocessor does not rewrite the member on the left of each assignment.
+ */
+#define NEW_COUNTER_METADATA_STR_W(gptr,h,h_tail,counter_arg,type_arg,str_arg) { \
+    h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \
+    if (h != NULL) { \
+        h->counter = (counter_arg); \
+        h->type = (type_arg); \
+        h->str_valueA = NULL; \
+        h->str_valueW = (str_arg) ? _wcsdup(str_arg) : NULL; \
+        h->value = 0; \
+        h->extra1 = 0; \
+        h->extra2 = NULL; \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->counter_metadata_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
 #endif /* _ITTNOTIFY_CONFIG_H_ */
--- a/3rdparty/ittnotify/src/ittnotify/ittnotify_static.c
+++ b/3rdparty/ittnotify/src/ittnotify/ittnotify_static.c
--- a/3rdparty/ittnotify/src/ittnotify/ittnotify_static.h
+++ b/3rdparty/ittnotify/src/ittnotify/ittnotify_static.h
@ -1,60 +1,8 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/

 #include "ittnotify_config.h"

@ -81,6 +29,9 @@ ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name), (ITT_FORM
 ITT_STUB(ITTAPI, __itt_domain*, domain_create,  (const char    *name), (ITT_FORMAT name), domain_create,  __itt_group_structure, "\"%s\"")
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

+ITT_STUBV(ITTAPI, void, module_load_with_sections, (__itt_module_object* module_obj), (ITT_FORMAT module_obj), module_load_with_sections, __itt_group_module, "%p")
+ITT_STUBV(ITTAPI, void, module_unload_with_sections, (__itt_module_object* module_obj), (ITT_FORMAT module_obj), module_unload_with_sections, __itt_group_module, "%p")
+
 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char    *name), (ITT_FORMAT name), string_handle_createA, __itt_group_structure, "\"%s\"")
 ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name), (ITT_FORMAT name), string_handle_createW, __itt_group_structure, "\"%S\"")
@ -105,6 +56,8 @@ ITT_STUB(ITTAPI, __itt_counter, counter_create_typed,  (const char    *name, con

 ITT_STUBV(ITTAPI, void, pause,  (void), (ITT_NO_PARAMS), pause,  __itt_group_control | __itt_group_legacy, "no args")
 ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args")
+ITT_STUBV(ITTAPI, void, pause_scoped,  (__itt_collection_scope scope), (ITT_FORMAT scope), pause_scoped,  __itt_group_control, "%d")
+ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope scope), (ITT_FORMAT scope), resume_scoped, __itt_group_control, "%d")

 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char    *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"")
@ -121,6 +74,23 @@ ITT_STUB(LIBITTAPI, int,  thr_name_setW, (const wchar_t *name, int namelen), (IT
 ITT_STUB(LIBITTAPI, int,  thr_name_set,  (const char    *name, int namelen), (ITT_FORMAT name, namelen), thr_name_set,  __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 ITT_STUBV(LIBITTAPI, void, thr_ignore,   (void),                             (ITT_NO_PARAMS),            thr_ignore,    __itt_group_thread | __itt_group_legacy, "no args")
+
+/* histogram_create stubs: char (A) and wchar_t (W) variants on Windows, a single char variant elsewhere. The W variant formats its name with %S like the other wide-string stubs in this file. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_histogram*, histogram_createA, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_createA, __itt_group_structure, "%p, \"%s\", %d, %d")
+ITT_STUB(ITTAPI, __itt_histogram*, histogram_createW, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_createW, __itt_group_structure, "%p, \"%S\", %d, %d")
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_histogram*, histogram_create, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_create, __itt_group_structure, "%p, \"%s\", %d, %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* counter_create_v3 stubs: char (A) and wchar_t (W) variants on Windows, a single char variant elsewhere. The W variant formats its name with %S like the other wide-string stubs in this file. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA_v3, (const __itt_domain* domain, const char    *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_createA_v3, __itt_group_counter, "%p, \"%s\", %d")
+ITT_STUB(ITTAPI, __itt_counter, counter_createW_v3, (const __itt_domain* domain, const wchar_t *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_createW_v3, __itt_group_counter, "%p, \"%S\", %d")
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_v3,  (const __itt_domain* domain, const char    *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_create_v3,  __itt_group_counter, "%p, \"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(ITTAPI, void, bind_context_metadata_to_counter, (__itt_counter counter, size_t length, __itt_context_metadata* metadata), (ITT_FORMAT counter, length, metadata), bind_context_metadata_to_counter, __itt_group_structure, "%p, %lu, %p")
+
 #endif /* __ITT_INTERNAL_BODY */

 ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args")
@ -296,6 +266,13 @@ ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain), (ITT_FORMA
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 ITT_STUB(ITTAPI, __itt_frame, frame_create,  (const char    *domain), (ITT_FORMAT domain), frame_create,  __itt_group_frame, "\"%s\"")
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* pt_region_create stubs: char (A) and wchar_t (W) variants on Windows, a single char variant elsewhere; all registered under __itt_group_structure. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char    *name), (ITT_FORMAT name), pt_region_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name), (ITT_FORMAT name), pt_region_createW, __itt_group_structure, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create,  (const char    *name), (ITT_FORMAT name), pt_region_create,  __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 #endif /* __ITT_INTERNAL_BODY */
 ITT_STUBV(ITTAPI, void, frame_begin,         (__itt_frame frame),     (ITT_FORMAT frame),  frame_begin,   __itt_group_frame, "%p")
 ITT_STUBV(ITTAPI, void, frame_end,           (__itt_frame frame),     (ITT_FORMAT frame),  frame_end,     __itt_group_frame, "%p")
@ -376,14 +353,16 @@ ITT_STUB(ITTAPI, int, av_save,  (void *data, int rank, const int *dimensions, in
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 #endif /* __ITT_INTERNAL_BODY */

-#ifndef __ITT_INTERNAL_BODY
 #if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void* end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_loadA, __itt_group_none, "%p, %p, %p")
-ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void* end_addr, const wchar_t *path), (ITT_FORMAT start_addr, end_addr, path), module_loadW, __itt_group_none, "%p, %p, %p")
+ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void* end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_loadA, __itt_group_module, "%p, %p, %p")
+ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void* end_addr, const wchar_t *path), (ITT_FORMAT start_addr, end_addr, path), module_loadW, __itt_group_module, "%p, %p, %p")
 #else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_load, __itt_group_none, "%p, %p, %p")
+ITT_STUBV(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_load, __itt_group_module, "%p, %p, %p")
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, module_unload, (void *start_addr), (ITT_FORMAT start_addr), module_unload, __itt_group_module, "%p")

+ITT_STUBV(ITTAPI, void, histogram_submit, (__itt_histogram* hist, size_t length, void* x_data, void* y_data), (ITT_FORMAT hist, length, x_data, y_data), histogram_submit, __itt_group_structure, "%p, %lu, %p, %p")
+
+ITT_STUBV(ITTAPI, void, counter_set_value_v3, (__itt_counter counter, void *value_ptr), (ITT_FORMAT counter, value_ptr), counter_set_value_v3, __itt_group_counter, "%p, %p")

 #endif /* __ITT_INTERNAL_INIT */
--- a/3rdparty/ittnotify/src/ittnotify/ittnotify_types.h
+++ b/3rdparty/ittnotify/src/ittnotify/ittnotify_types.h
@ -1,85 +1,34 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/

 #ifndef _ITTNOTIFY_TYPES_H_
 #define _ITTNOTIFY_TYPES_H_

 typedef enum ___itt_group_id
 {
-    __itt_group_none      = 0,
-    __itt_group_legacy    = 1<<0,
-    __itt_group_control   = 1<<1,
-    __itt_group_thread    = 1<<2,
-    __itt_group_mark      = 1<<3,
-    __itt_group_sync      = 1<<4,
-    __itt_group_fsync     = 1<<5,
-    __itt_group_jit       = 1<<6,
-    __itt_group_model     = 1<<7,
-    __itt_group_splitter_min = 1<<7,
-    __itt_group_counter   = 1<<8,
-    __itt_group_frame     = 1<<9,
-    __itt_group_stitch    = 1<<10,
-    __itt_group_heap      = 1<<11,
-    __itt_group_splitter_max = 1<<12,
-    __itt_group_structure = 1<<12,
-    __itt_group_suppress = 1<<13,
-    __itt_group_arrays    = 1<<14,
-    __itt_group_all       = -1
+    __itt_group_none         = 0,
+    __itt_group_legacy       = 1<<0,
+    __itt_group_control      = 1<<1,
+    __itt_group_thread       = 1<<2,
+    __itt_group_mark         = 1<<3,
+    __itt_group_sync         = 1<<4,
+    __itt_group_fsync        = 1<<5,
+    __itt_group_jit          = 1<<6,
+    __itt_group_model        = 1<<7,
+    __itt_group_splitter_min = 1<<7,  /* same value as __itt_group_model */
+    __itt_group_counter      = 1<<8,
+    __itt_group_frame        = 1<<9,
+    __itt_group_stitch       = 1<<10,
+    __itt_group_heap         = 1<<11,
+    __itt_group_splitter_max = 1<<12, /* same value as __itt_group_structure */
+    __itt_group_structure    = 1<<12,
+    __itt_group_suppress     = 1<<13,
+    __itt_group_arrays       = 1<<14,
+    __itt_group_module       = 1<<15, /* group used by the module_load/module_unload stubs */
+    __itt_group_all          = -1
 } __itt_group_id;

 #pragma pack(push, 8)
@ -109,6 +58,7 @@ typedef struct ___itt_group_list
        { __itt_group_structure, "structure" }, \
        { __itt_group_suppress,  "suppress"  }, \
        { __itt_group_arrays,    "arrays"    }, \
+		{ __itt_group_module,    "module"    }, \
        { __itt_group_none,      NULL        }  \
    }

--- a/3rdparty/ittnotify/src/ittnotify/jitprofiling.c
+++ b/3rdparty/ittnotify/src/ittnotify/jitprofiling.c
@ -1,76 +1,24 @@
-/* <copyright>
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
+/*
+  Copyright (C) 2005-2019 Intel Corporation

-  GPL LICENSE SUMMARY
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  Contact Information:
-  http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/
-
-  BSD LICENSE
-
-  Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-</copyright> */
+  SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
+*/

 #include "ittnotify_config.h"

 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 #include <windows.h>
+#include <string.h>
+#include <ctype.h>
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM != ITT_PLATFORM_MAC && ITT_PLATFORM != ITT_PLATFORM_FREEBSD
+#if ITT_PLATFORM != ITT_PLATFORM_MAC && ITT_PLATFORM != ITT_PLATFORM_FREEBSD && ITT_PLATFORM != ITT_PLATFORM_OPENBSD
 #include <malloc.h>
 #endif
 #include <stdlib.h>

 #include "jitprofiling.h"

-static const char rcsid[] = "\n@(#) $Revision: 471937 $\n";
-
-#define DLL_ENVIRONMENT_VAR             "VS_PROFILER"
+static const char rcsid[] = "\n@(#) $Revision$\n";

 #ifndef NEW_DLL_ENVIRONMENT_VAR
 #if ITT_ARCH==ITT_ARCH_IA32
@ -81,13 +29,10 @@ static const char rcsid[] = "\n@(#) $Revision: 471937 $\n";
 #endif /* NEW_DLL_ENVIRONMENT_VAR */

 #if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define DEFAULT_DLLNAME                 "JitPI.dll"
 HINSTANCE m_libHandle = NULL;
 #elif ITT_PLATFORM==ITT_PLATFORM_MAC
-#define DEFAULT_DLLNAME                 "libJitPI.dylib"
 void* m_libHandle = NULL;
 #else
-#define DEFAULT_DLLNAME                 "libJitPI.so"
 void* m_libHandle = NULL;
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

@ -169,6 +114,38 @@ ITT_EXTERN_C iJIT_IsProfilingActiveFlags JITAPI iJIT_IsProfilingActive()
    return executionMode;
 }

+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+/* Checks that `path` looks like an absolute Windows path.
+ *
+ * path          - candidate string; may be NULL.
+ * maxPathLength - number of characters of `path` that may be inspected.
+ *
+ * Returns 1 when a terminating null character is found within the first
+ * maxPathLength characters and the string is longer than two characters and
+ * starts with either a drive prefix (letter, ':', '\') or a UNC prefix
+ * (two backslashes). Returns 0 otherwise, including for NULL.
+ */
+static int isValidAbsolutePath(char *path, size_t maxPathLength)
+{
+    if (path == NULL)
+    {
+        return 0;
+    }
+
+    size_t pathLength = strnlen(path, maxPathLength);
+    if (pathLength == maxPathLength)
+    {
+        /* strnlen() returns maxPathLength when there is no terminating null
+         * character among the first maxPathLength characters of path.
+         */
+        return 0;
+    }
+
+    if (pathLength > 2)
+    {
+        /* isalpha() is only defined for values representable as unsigned
+         * char (or EOF); plain char may be negative, so cast first.
+         */
+        if (isalpha((unsigned char)path[0]) && path[1] == ':' && path[2] == '\\')
+        {
+            return 1;
+        }
+        else if (path[0] == '\\' && path[1] == '\\')
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+#endif
+
 /* This function loads the collector dll and the relevant functions.
 * on success: all functions load,     iJIT_DLL_is_missing = 0, return value = 1
 * on failure: all functions are NULL, iJIT_DLL_is_missing = 1, return value = 0
@ -212,7 +189,7 @@ static int loadiJIT_Funcs()
        {
            envret = GetEnvironmentVariableA(NEW_DLL_ENVIRONMENT_VAR, 
                                             dllName, dNameLength);
-            if (envret)
+            if (envret && isValidAbsolutePath(dllName, dNameLength))
            {
                /* Try to load the dll from the PATH... */
                m_libHandle = LoadLibraryExA(dllName, 
@ -220,30 +197,9 @@ static int loadiJIT_Funcs()
            }
            free(dllName);
        }
-    } else {
-        /* Try to use old VS_PROFILER variable */
-        dNameLength = GetEnvironmentVariableA(DLL_ENVIRONMENT_VAR, NULL, 0);
-        if (dNameLength)
-        {
-            DWORD envret = 0;
-            dllName = (char*)malloc(sizeof(char) * (dNameLength + 1));
-            if(dllName != NULL)
-            {
-                envret = GetEnvironmentVariableA(DLL_ENVIRONMENT_VAR, 
-                                                 dllName, dNameLength);
-                if (envret)
-                {
-                    /* Try to load the dll from the PATH... */
-                    m_libHandle = LoadLibraryA(dllName);
-                }
-                free(dllName);
-            }
-        }
    }
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
    dllName = getenv(NEW_DLL_ENVIRONMENT_VAR);
-    if (!dllName)
-        dllName = getenv(DLL_ENVIRONMENT_VAR);
 #if defined(__ANDROID__) || defined(ANDROID)
    if (!dllName)
        dllName = ANDROID_JIT_AGENT_PATH;
@ -251,19 +207,13 @@ static int loadiJIT_Funcs()
    if (dllName)
    {
        /* Try to load the dll from the PATH... */
-        m_libHandle = dlopen(dllName, RTLD_LAZY);
+        if (DL_SYMBOLS)
+        {
+            m_libHandle = dlopen(dllName, RTLD_LAZY);
+        }
    }
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

-    if (!m_libHandle)
-    {
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-        m_libHandle = LoadLibraryA(DEFAULT_DLLNAME);
-#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-        m_libHandle = dlopen(DEFAULT_DLLNAME, RTLD_LAZY);
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-    }
-
    /* if the dll wasn't loaded - exit. */
    if (!m_libHandle)
    {
--- a/3rdparty/kleidicv/CMakeLists.txt
+++ b/3rdparty/kleidicv/CMakeLists.txt
@ -1,23 +1,11 @@
 project(kleidicv_hal)

-set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
-ocv_update(KLEIDICV_SRC_COMMIT "0.1.0")
-ocv_update(KLEIDICV_SRC_HASH "9388f28cf2fbe3338197b2b57d491468")
-
-if(KLEIDICV_SOURCE_PATH)
-  set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
-else()
-  ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
-                HASH ${KLEIDICV_SRC_HASH}
-                URL
-                  "${OPENCV_KLEIDICV_URL}"
-                  "$ENV{OPENCV_KLEIDICV_URL}"
-                  "https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
-                DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
-                ID KLEIDICV
-                STATUS res
-                UNPACK RELATIVE_URL)
-  set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
+if(HAVE_KLEIDICV)
+  option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK
+  include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
+  # HACK to suppress adapters/opencv/kleidicv_hal.cpp:343:12: warning: unused function 'from_opencv' [-Wunused-function]
+  target_compile_options( kleidicv_hal PRIVATE
+      $<TARGET_PROPERTY:kleidicv,COMPILE_OPTIONS>
+      "-Wno-old-style-cast" "-Wno-unused-function"
+  )
 endif()
-
-include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")
--- a/3rdparty/kleidicv/kleidicv.cmake
+++ b/3rdparty/kleidicv/kleidicv.cmake
@ -0,0 +1,21 @@
+function(download_kleidicv root_var) # sets ${root_var} in the caller: unpacked source dir on success, "" otherwise
+  set(${root_var} "" PARENT_SCOPE)
+  # Release tag and archive hash handed to ocv_download() below.
+  ocv_update(KLEIDICV_SRC_COMMIT "0.3.0")
+  ocv_update(KLEIDICV_SRC_HASH "51a77b0185c2bac2a968a2163869b1ed")
+  # THE_ROOT is quoted wherever it is used so the path is always passed as a single argument.
+  set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
+  ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
+                HASH ${KLEIDICV_SRC_HASH}
+                URL
+                  "${OPENCV_KLEIDICV_URL}"
+                  "$ENV{OPENCV_KLEIDICV_URL}"
+                  "https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
+                DESTINATION_DIR "${THE_ROOT}"
+                ID KLEIDICV
+                STATUS res
+                UNPACK RELATIVE_URL)
+  if(res) # download status reported by ocv_download()
+    set(${root_var} "${THE_ROOT}/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
+  endif()
+endfunction()
--- a/3rdparty/ndsrvp/include/imgproc.hpp
+++ b/3rdparty/ndsrvp/include/imgproc.hpp
@ -5,6 +5,8 @@
 #ifndef OPENCV_NDSRVP_IMGPROC_HPP
 #define OPENCV_NDSRVP_IMGPROC_HPP

+struct cvhalFilter2D;
+
 namespace cv {

 namespace ndsrvp {
@ -71,6 +73,52 @@ int threshold(const uchar* src_data, size_t src_step,
 #undef cv_hal_threshold
 #define cv_hal_threshold (cv::ndsrvp::threshold)

+// ################ filter ################
+// Each "#undef cv_hal_X" / "#define cv_hal_X" pair below rebinds cv_hal_X to the cv::ndsrvp function declared just above it.
+int filterInit(cvhalFilter2D **context,
+    uchar *kernel_data, size_t kernel_step,
+    int kernel_type, int kernel_width,
+    int kernel_height, int max_width, int max_height,
+    int src_type, int dst_type, int borderType,
+    double delta, int anchor_x, int anchor_y,
+    bool allowSubmatrix, bool allowInplace);
+
+#undef cv_hal_filterInit
+#define cv_hal_filterInit (cv::ndsrvp::filterInit)
+// filter(): context is the pointer that filterInit() stored through its first parameter.
+int filter(cvhalFilter2D *context,
+    const uchar *src_data, size_t src_step,
+    uchar *dst_data, size_t dst_step,
+    int width, int height,
+    int full_width, int full_height,
+    int offset_x, int offset_y);
+
+#undef cv_hal_filter
+#define cv_hal_filter (cv::ndsrvp::filter)
+// NOTE(review): filterFree() presumably releases the context created by filterInit() - confirm against its definition.
+int filterFree(cvhalFilter2D *context);
+
+#undef cv_hal_filterFree
+#define cv_hal_filterFree (cv::ndsrvp::filterFree)
+
+// ################ medianBlur ################
+// NOTE(review): the supported depth / cn / ksize combinations are decided in the definition - confirm there.
+int medianBlur(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int depth, int cn, int ksize);
+
+#undef cv_hal_medianBlur
+#define cv_hal_medianBlur (cv::ndsrvp::medianBlur)
+
+// ################ bilateralFilter ################
+// Returns CV_HAL_ERROR_NOT_IMPLEMENTED unless depth is CV_8U, cn is 1 or 3 and src_data differs from dst_data.
+int bilateralFilter(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step, int width, int height, int depth,
+    int cn, int d, double sigma_color, double sigma_space, int border_type);
+
+#undef cv_hal_bilateralFilter
+#define cv_hal_bilateralFilter (cv::ndsrvp::bilateralFilter)
+
 } // namespace ndsrvp

 } // namespace cv
--- a/3rdparty/ndsrvp/src/bilateralFilter.cpp
+++ b/3rdparty/ndsrvp/src/bilateralFilter.cpp
@ -0,0 +1,270 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+static void bilateralFilterProcess(uchar* dst_data, size_t dst_step, uchar* pad_data, size_t pad_step,
+    int width, int height, int cn, int radius, int maxk,
+    int* space_ofs, float *space_weight, float *color_weight)
+{
+    int i, j, k;
+    // One output row per iteration: neighbour k contributes space_weight[k] * color_weight[absolute colour difference].
+    for( i = 0; i < height; i++ )
+    {
+        const uchar* sptr = pad_data + (i + radius) * pad_step + radius * cn;
+        uchar* dptr = dst_data + i * dst_step;
+        // sptr skips radius rows and radius pixels of pad_data; space_ofs[k] is added to it to reach neighbour k.
+        if( cn == 1 )
+        {
+            std::vector<float> buf(width + width, 0.0);
+            float *sum = &buf[0];
+            float *wsum = sum + width;
+            k = 0;
+            for(; k <= maxk-4; k+=4) // four neighbours per pass
+            {
+                const uchar* ksptr0 = sptr + space_ofs[k];
+                const uchar* ksptr1 = sptr + space_ofs[k+1];
+                const uchar* ksptr2 = sptr + space_ofs[k+2];
+                const uchar* ksptr3 = sptr + space_ofs[k+3];
+                j = 0;
+                for (; j < width; j++)
+                {
+                    int rval = sptr[j];
+
+                    int val = ksptr0[j];
+                    float w = space_weight[k] * color_weight[std::abs(val - rval)];
+                    wsum[j] += w;
+                    sum[j] += val * w;
+
+                    val = ksptr1[j];
+                    w = space_weight[k+1] * color_weight[std::abs(val - rval)];
+                    wsum[j] += w;
+                    sum[j] += val * w;
+
+                    val = ksptr2[j];
+                    w = space_weight[k+2] * color_weight[std::abs(val - rval)];
+                    wsum[j] += w;
+                    sum[j] += val * w;
+
+                    val = ksptr3[j];
+                    w = space_weight[k+3] * color_weight[std::abs(val - rval)];
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+            }
+            for(; k < maxk; k++) // neighbours left over after the four-at-a-time loop
+            {
+                const uchar* ksptr = sptr + space_ofs[k];
+                j = 0;
+                for (; j < width; j++)
+                {
+                    int val = ksptr[j];
+                    float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+            }
+            j = 0;
+            for (; j < width; j++)
+            {
+                // overflow is not possible here => there is no need to use cv::saturate_cast
+                ndsrvp_assert(fabs(wsum[j]) > 0);
+                dptr[j] = (uchar)(sum[j] / wsum[j] + 0.5);
+            }
+        }
+        else
+        {
+            ndsrvp_assert( cn == 3 );
+            std::vector<float> buf(width * 3 + width); // three per-channel sums followed by the weight sum, width floats each
+            float *sum_b = &buf[0];
+            float *sum_g = sum_b + width;
+            float *sum_r = sum_g + width;
+            float *wsum = sum_r + width;
+            k = 0;
+            for(; k <= maxk-4; k+=4) // four neighbours per pass
+            {
+                const uchar* ksptr0 = sptr + space_ofs[k];
+                const uchar* ksptr1 = sptr + space_ofs[k+1];
+                const uchar* ksptr2 = sptr + space_ofs[k+2];
+                const uchar* ksptr3 = sptr + space_ofs[k+3];
+                const uchar* rsptr = sptr;
+                j = 0;
+                for(; j < width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+                {
+                    int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+
+                    int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                    float w = space_weight[k] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                    wsum[j] += w;
+                    sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
+
+                    b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
+                    w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                    wsum[j] += w;
+                    sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
+
+                    b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
+                    w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                    wsum[j] += w;
+                    sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
+
+                    b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
+                    w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                    wsum[j] += w;
+                    sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
+                }
+            }
+            for(; k < maxk; k++) // neighbours left over after the four-at-a-time loop
+            {
+                const uchar* ksptr = sptr + space_ofs[k];
+                const uchar* rsptr = sptr;
+                j = 0;
+                for(; j < width; j++, ksptr += 3, rsptr += 3)
+                {
+                    int b = ksptr[0], g = ksptr[1], r = ksptr[2];
+                    float w = space_weight[k] * color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
+                    wsum[j] += w;
+                    sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
+                }
+            }
+            j = 0;
+            for(; j < width; j++)
+            {
+                ndsrvp_assert(fabs(wsum[j]) > 0);
+                wsum[j] = 1.f / wsum[j]; // wsum[j] now holds the reciprocal shared by the three channels
+                *(dptr++) = (uchar)(sum_b[j] * wsum[j] + 0.5);
+                *(dptr++) = (uchar)(sum_g[j] * wsum[j] + 0.5);
+                *(dptr++) = (uchar)(sum_r[j] * wsum[j] + 0.5);
+            }
+        }
+    }
+}
+
+int bilateralFilter(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step, int width, int height, int depth,
+    int cn, int d, double sigma_color, double sigma_space, int border_type)
+{
+    if( depth != CV_8U || !(cn == 1 || cn == 3) || src_data == dst_data)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    // Handled case: 8-bit, 1 or 3 channels, out of place. Pads the source, builds the weight tables, then filters.
+    int i, j, maxk, radius;
+
+    if( sigma_color <= 0 )
+        sigma_color = 1;
+    if( sigma_space <= 0 )
+        sigma_space = 1;
+
+    double gauss_color_coeff = -0.5/(sigma_color * sigma_color);
+    double gauss_space_coeff = -0.5/(sigma_space * sigma_space);
+    // A non-positive d derives the radius from sigma_space; the radius is at least 1 and d is rebuilt as 2 * radius + 1.
+    if( d <= 0 )
+        radius = (int)(sigma_space * 1.5 + 0.5);
+    else
+        radius = d / 2;
+
+    radius = MAX(radius, 1);
+    d = radius * 2 + 1;
+
+    // not enough submatrix info: the given data is treated as the whole image
+    // fetch original image data
+    const uchar *ogn_data = src_data;
+    size_t ogn_step = src_step; // kept as size_t so the step is not narrowed and row offsets are not computed in int
+
+    // ROI fully used in the computation
+    int cal_width = width + d - 1;
+    int cal_height = height + d - 1;
+    int cal_x = 0 - radius; // negative if left border exceeded
+    int cal_y = 0 - radius; // negative if top border exceeded
+
+    // calculate source border
+    std::vector<uchar> padding;
+    padding.resize((size_t)cal_width * cal_height * cn); // size_t product: the int product overflows for large images
+    uchar* pad_data = &padding[0];
+    int pad_step = cal_width * cn;
+
+    uchar* pad_ptr;
+    const uchar* ogn_ptr;
+    std::vector<uchar> vec_zeros(cn, 0);
+    for(i = 0; i < cal_height; i++)
+    {
+        int y = borderInterpolate(i + cal_y, height, border_type);
+        if(y < 0) {
+            memset(pad_data + (size_t)i * pad_step, 0, (size_t)cn * cal_width); // negative row index: the whole padded row is zero
+            continue;
+        }
+
+        // left border
+        j = 0;
+        for(; j + cal_x < 0; j++)
+        {
+            int x = borderInterpolate(j + cal_x, width, border_type);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cn;
+            pad_ptr = pad_data + (size_t)i * pad_step + j * cn;
+            memcpy(pad_ptr, ogn_ptr, cn);
+        }
+
+        // center
+        int rborder = MIN(cal_width, width - cal_x);
+        ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cn;
+        pad_ptr = pad_data + (size_t)i * pad_step + j * cn;
+        memcpy(pad_ptr, ogn_ptr, (size_t)cn * (rborder - j));
+
+        // right border
+        j = rborder;
+        for(; j < cal_width; j++)
+        {
+            int x = borderInterpolate(j + cal_x, width, border_type);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cn;
+            pad_ptr = pad_data + (size_t)i * pad_step + j * cn;
+            memcpy(pad_ptr, ogn_ptr, cn);
+        }
+    }
+
+    std::vector<float> _color_weight(cn * 256);
+    std::vector<float> _space_weight(d * d);
+    std::vector<int> _space_ofs(d * d);
+    float* color_weight = &_color_weight[0];
+    float* space_weight = &_space_weight[0];
+    int* space_ofs = &_space_ofs[0];
+
+    // initialize color-related bilateral filter coefficients
+    // one entry per possible summed absolute difference (0 .. 256 * cn - 1)
+    for( i = 0; i < 256 * cn; i++ )
+        color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
+
+    // initialize space-related bilateral filter coefficients
+    for( i = -radius, maxk = 0; i <= radius; i++ )
+    {
+        j = -radius;
+
+        for( ; j <= radius; j++ )
+        {
+            double r = std::sqrt((double)i * i + (double)j * j);
+            if( r > radius ) // keep only offsets inside the circle of the given radius
+                continue;
+            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
+            space_ofs[maxk++] = (int)(i * pad_step + j * cn);
+        }
+    }
+
+    bilateralFilterProcess(dst_data, dst_step, pad_data, pad_step, width, height, cn, radius, maxk, space_ofs, space_weight, color_weight);
+
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
--- a/3rdparty/ndsrvp/src/cvutils.cpp
+++ b/3rdparty/ndsrvp/src/cvutils.cpp
@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType)
    return p;
 }

+int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType) // maps four 16-bit coordinates at once according to borderType
+{ // NOTE(review): comments below assume __nds__bpick(a, b, m) takes the bits of a where m is set and of b elsewhere - confirm
+    int16x4_t vzero = (int16x4_t){0, 0, 0, 0};
+    int16x4_t vone = (int16x4_t){1, 1, 1, 1};
+    int16x4_t vlen = (int16x4_t){len, len, len, len};
+    if(borderType == CV_HAL_BORDER_REPLICATE)
+        vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); // candidates: 0 for lanes < 0, len - 1 for lanes >= len
+    else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101)
+    {
+        int16x4_t vdelta = (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero;
+        if(len == 1)
+            return vzero;
+        do
+        {
+            int16x4_t vneg = -vp - 1 + vdelta; // candidate for lanes < 0
+            int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta; // candidate for lanes >= len
+            vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
+        }
+        while( (long)(vp >= vlen) || (long)(vp < 0) ); // repeat while either comparison result, viewed as one long, is non-zero
+    }
+    else if(borderType == CV_HAL_BORDER_WRAP)
+    {
+        ndsrvp_assert(len > 0);
+        int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen; // candidate for lanes < 0
+        int16x4_t vpos = vp % vlen; // candidate for lanes >= len
+        vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
+    }
+    else if(borderType == CV_HAL_BORDER_CONSTANT)
+        vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen)); // candidate -1 for every out-of-range lane
+    else
+        ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type");
+    return vp;
+}
+
 } // namespace ndsrvp

 } // namespace cv
--- a/3rdparty/ndsrvp/src/cvutils.hpp
+++ b/3rdparty/ndsrvp/src/cvutils.hpp
@ -14,6 +14,7 @@
 #include <iostream>
 #include <string>
 #include <array>
+#include <vector>
 #include <climits>
 #include <algorithm>

@ -26,16 +27,26 @@ namespace ndsrvp {
 void* fastMalloc(size_t size);
 void fastFree(void* ptr);
 int borderInterpolate(int p, int len, int borderType);
+int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType);

 #ifndef MAX
 #  define MAX(a,b)  ((a) < (b) ? (b) : (a))
 #endif

+#ifndef MIN
+#  define MIN(a,b)  ((a) > (b) ? (b) : (a))
+#endif
+
 #define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
 #define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)

+#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15) // 0x28442211 holds one hex digit per depth code (lowest digit = depth 0): 1,1,2,2,4,4,8,2
+#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type)) // channel count times the per-channel size above
+
 #define CV_MALLOC_ALIGN 64

+inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); } // function form of CV_ELEM_SIZE, returned as size_t
+
 // error codes

 enum Error{
@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
    return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a));
 }

+// expand
+
+/*
+    [0] [1] [2] [3] [4] [5] [6] [7]
+810 [  0  ] [  1  ] [  4  ] [  5  ]
+832 [  2  ] [  3  ] [  6  ] [  7  ]
+bb  [  0  ] [  1  ] [  2  ] [  3  ]
+tt  [  4  ] [  5  ] [  6  ] [  7  ]
+*/
+
+inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst) // writes eight ushort values at dst from the bytes packed in vs (layout: diagram above)
+{ // NOTE(review): relies on unsigned long holding 8 bytes and on dst being valid for two unsigned long stores - confirm
+    unsigned long vs810 = __nds__zunpkd810(vs);
+    unsigned long vs832 = __nds__zunpkd832(vs);
+    *(unsigned long*)dst = __nds__pkbb32(vs832, vs810); // dst[0..3]
+    *(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810); // dst[4..7]
+}
+
+/*
+    [0] [1] [2] [3] [4] [5] [6] [7]
+820 [  0  ] [  2  ] [  4  ] [  6  ]
+831 [  1  ] [  3  ] [  5  ] [  7  ]
+bb  [  0  ] [  2  ] [  1  ] [  3  ]
+tt  [  4  ] [  6  ] [  5  ] [  7  ]
+*/
+
+inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst) // like the u16 expand, but per the diagram above the output order is 0,2,1,3,4,6,5,7
+{ // NOTE(review): relies on unsigned long holding 8 bytes and on dst being valid for two unsigned long stores - confirm
+    unsigned long vs820 = __nds__zunpkd820(vs);
+    unsigned long vs831 = __nds__zunpkd831(vs);
+    *(unsigned long*)dst = __nds__pkbb32(vs831, vs820); // dst[0..3]
+    *(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820); // dst[4..7]
+}
+
+/*
+    [0] [1] [2] [3] [4] [5] [6] [7]
+820 [  0  ] [  2  ] [  4  ] [  6  ]
+831 [  1  ] [  3  ] [  5  ] [  7  ]
+bb  [  0  ] [  2  ] [  1  ] [  3  ]
+tt  [  4  ] [  6  ] [  5  ] [  7  ]
+bbbb[      0      ] [      1      ]
+bbtt[      2      ] [      3      ]
+ttbb[      4      ] [      5      ]
+tttt[      6      ] [      7      ]
+*/
+
+
+inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst) // writes eight uint values at dst from the bytes packed in vs (layout: diagram above)
+{ // NOTE(review): relies on unsigned long holding 8 bytes and on dst being valid for four unsigned long stores - confirm
+    unsigned long vs820 = __nds__zunpkd820(vs);
+    unsigned long vs831 = __nds__zunpkd831(vs);
+    unsigned long vsbb = __nds__pkbb32(vs831, vs820);
+    unsigned long vstt = __nds__pktt32(vs831, vs820);
+    *(unsigned long*)dst = __nds__pkbb16(0, vsbb); // dst[0..1]
+    *(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb); // dst[2..3]
+    *(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt); // dst[4..5]
+    *(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt); // dst[6..7]
+}
+
+// float replacement
+
+inline void ndsrvp_f32_add8(const float* a, const float* b, float* c) // c[i] = a[i] + b[i] for i = 0..7
+{ // elements are written in index order, one scalar addition each
+    c[0] = a[0] + b[0];
+    c[1] = a[1] + b[1];
+    c[2] = a[2] + b[2];
+    c[3] = a[3] + b[3];
+    c[4] = a[4] + b[4];
+    c[5] = a[5] + b[5];
+    c[6] = a[6] + b[6];
+    c[7] = a[7] + b[7];
+}
+
+/*
+    [1] [8] [23]
+    [24] [8]
+*/
+
+inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
+{
+    const int mask_frac = 0x007FFFFF;
+    const int mask_sign = 0x7FFFFFFF;
+    const int mask_lead = 0x40000000;
+    const int ofs_exp = 23;
+    // The eight floats at a are loaded as raw 32-bit patterns, two per vector.
+    uint32x2_t va01 = *(uint32x2_t*)a;
+    uint32x2_t va23 = *(uint32x2_t*)(a + 2);
+    uint32x2_t va45 = *(uint32x2_t*)(a + 4);
+    uint32x2_t va67 = *(uint32x2_t*)(a + 6);
+    // Bits 23 and above of each pattern (see the [1] [8] [23] layout above).
+    uint32x2_t vaexp01 = va01 >> ofs_exp;
+    uint32x2_t vaexp23 = va23 >> ofs_exp;
+    uint32x2_t vaexp45 = va45 >> ofs_exp;
+    uint32x2_t vaexp67 = va67 >> ofs_exp;
+    // The low 23 bits are moved up by 7, bit 31 is cleared and bit 30 is set.
+    uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
+
+    int16x4_t vb[2]; // fake signed for signed multiply
+    ndsrvp_u8_u16_eswap8(b, (ushort*)vb);
+    // NOTE(review): presumably multiplies each 32-bit value by one 16-bit lane of vb - confirm the kmmwb2/kmmwt2 semantics.
+    vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]);
+    vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]);
+    vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
+    vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
+    // The leading-zero count, less 8, drives the exponent adjustment and the left shift below.
+    uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8;
+    uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
+    uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
+    uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
+
+    vaexp01 += 8 - vaclz01;
+    vaexp23 += 8 - vaclz23;
+    vaexp45 += 8 - vaclz45;
+    vaexp67 += 8 - vaclz67;
+
+    vafrac01 <<= vaclz01;
+    vafrac23 <<= vaclz23;
+    vafrac45 <<= vaclz45;
+    vafrac67 <<= vaclz67;
+    // Reassemble: adjusted upper bits shifted back to bit 23, combined with the low 23 bits of the product.
+    *(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac);
+    *(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
+    *(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
+    *(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
+}
+
 // saturate

 template<typename _Tp> static inline _Tp saturate_cast(int v)    { return _Tp(v); }
@ -94,6 +234,26 @@ template<> inline short saturate_cast<short>(double v)     { return saturate_cas
 template<> inline int saturate_cast<int>(float v)     { return (int)lrintf(v); }
 template<> inline int saturate_cast<int>(double v)     { return (int)lrint(v); }

+inline double cast_ptr_to_double(const uchar* v, int depth) { // reads the element at v as the type selected by depth and converts it to double
+    switch (depth) {
+        case CV_8U: return (double)*(const uchar*)v;
+        case CV_8S: return (double)*(const signed char*)v; // signed char: plain char may be unsigned, which would read negative values as 128..255
+        case CV_16U: return (double)*(const ushort*)v;
+        case CV_16S: return (double)*(const short*)v;
+        case CV_32S: return (double)*(const int*)v;
+        case CV_32F: return (double)*(const float*)v;
+        case CV_64F: return (double)*(const double*)v;
+        case CV_16F: return (double)*(const float*)v; // NOTE(review): reads a float although the depth is named 16F - confirm callers never pass CV_16F
+        default: return 0; // any other depth value yields 0
+    }
+}
+
+template <typename _Tp>
+inline _Tp data_at(const uchar* data, int step, int y, int x, int cn) // returns element x * cn (counted in _Tp units) of row y
+{
+    return ((_Tp*)(data + y * step))[x * cn]; // step is added to a uchar pointer, so it is a byte distance between rows
+}
+
 // align

 inline long align(size_t v, int n)
--- a/3rdparty/ndsrvp/src/filter.cpp
+++ b/3rdparty/ndsrvp/src/filter.cpp
@ -0,0 +1,321 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+class FilterData // state allocated by filterInit() and read back by filter() through the cvhalFilter2D pointer
+{
+public:
+    FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType,
+        int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y)
+        : kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType),
+        kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y)
+    {
+    }
+    // The constructor only stores its arguments; coords, coeffs, nz and padding are filled in later.
+    uchar *kernel_data; // the pointer is stored as given; the kernel bytes are not copied here
+    size_t kernel_step; // bytes between rows(height)
+    int kernel_type, src_type, dst_type, borderType;
+    int kernel_width, kernel_height;
+    int max_width, max_height;
+    double delta;
+    int anchor_x, anchor_y;
+    std::vector<uchar> coords; // (column, row) pairs of the recorded kernel entries, written by preprocess2DKernel()
+    std::vector<float> coeffs; // kernel values matching coords, written by preprocess2DKernel()
+    int nz; // not set by the constructor; assigned by preprocess2DKernel()
+    std::vector<uchar> padding; // resized by filter() to hold the border-extended source
+};
+
+static int countNonZero(const FilterData* ctx) // counts kernel entries that compare unequal to 0.0
+{
+    int i, j, nz = 0;
+    const uchar* ker_row = ctx->kernel_data;
+    for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step ) // kernel_step is the byte distance between kernel rows
+    {
+        for( j = 0; j < ctx->kernel_width; j++ )
+        {
+            if( ((float*)ker_row)[j] != 0.0 ) // each row is read as float values
+                nz++;
+        }
+    }
+    return nz;
+}
+
+static void preprocess2DKernel(FilterData* ctx) // records position and value of every non-zero kernel entry in ctx->coords / ctx->coeffs
+{
+    int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type;
+    if(nz == 0)
+        nz = 1; // (0, 0) == 0 by default
+    ndsrvp_assert( ktype == CV_32F );
+    // resize() zero-fills, so an all-zero kernel is represented by one entry at (0, 0) with coefficient 0.
+    ctx->coords.resize(nz * 2);
+    ctx->coeffs.resize(nz);
+    // Same non-zero test as countNonZero(), so at most nz entries are written below.
+    const uchar* ker_row = ctx->kernel_data;
+    for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
+    {
+        for( j = 0; j < ctx->kernel_width; j++ )
+        {
+            float val = ((float*)ker_row)[j];
+            if( val == 0.0 )
+                continue;
+            ctx->coords[k * 2] = j; // column, stored as uchar
+            ctx->coords[k * 2 + 1] = i; // row, stored as uchar
+            ctx->coeffs[k++] = val;
+        }
+    }
+    // nz, not k: for an all-zero kernel k is 0, and filter() takes &src_ptrarr[0] after resizing that vector to ctx->nz.
+    ctx->nz = nz;
+}
+
+int filterInit(cvhalFilter2D **context,
+    uchar *kernel_data, size_t kernel_step,
+    int kernel_type, int kernel_width,
+    int kernel_height, int max_width, int max_height,
+    int src_type, int dst_type, int borderType,
+    double delta, int anchor_x, int anchor_y,
+    bool allowSubmatrix, bool allowInplace)
+{ // creates a FilterData for the kernel and returns it through *context, or declines with CV_HAL_ERROR_NOT_IMPLEMENTED
+    int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type);
+    int cn = CV_MAT_CN(src_type), kdepth = kernel_type;
+
+    (void)allowSubmatrix;
+    (void)allowInplace;
+    // Only integral delta values are handled.
+    if(delta - (int)delta != 0.0)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    // Kernel coordinates are stored as uchar (FilterData::coords), so kernels larger than 256 in either dimension are declined.
+    if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth || kernel_width > 256 || kernel_height > 256)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    // Checked before allocating, so no context exists yet if the assertion fails.
+    ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth);
+
+    FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType,
+        kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y);
+
+    *context = (cvhalFilter2D*)ctx;
+
+    preprocess2DKernel(ctx);
+
+    return CV_HAL_ERROR_OK;
+}
+
+int filter(cvhalFilter2D *context,
+    const uchar *src_data, size_t src_step,
+    uchar *dst_data, size_t dst_step,
+    int width, int height,
+    int full_width, int full_height,
+    int offset_x, int offset_y)
+{
+    FilterData *ctx = (FilterData*)context;
+
+    int cn = CV_MAT_CN(ctx->src_type);
+    int cnes = CV_ELEM_SIZE(ctx->src_type);
+    int ddepth = CV_MAT_DEPTH(ctx->dst_type);
+    float delta_sat = (uchar)(ctx->delta);
+    if(ddepth == CV_8U)
+        delta_sat = (float)saturate_cast<uchar>(ctx->delta);
+    else if(ddepth == CV_16U)
+        delta_sat = (float)saturate_cast<ushort>(ctx->delta);
+
+    // fetch original image data
+    const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes;
+    int ogn_step = src_step;
+
+    // ROI fully used in the computation
+    int cal_width = width + ctx->kernel_width - 1;
+    int cal_height = height + ctx->kernel_height - 1;
+    int cal_x = offset_x - ctx->anchor_x; // negative if left border exceeded
+    int cal_y = offset_y - ctx->anchor_y; // negative if top border exceeded
+
+    // calculate source border
+    ctx->padding.resize(cal_width * cal_height * cnes);
+    uchar* pad_data = &ctx->padding[0];
+    int pad_step = cal_width * cnes;
+
+    uchar* pad_ptr;
+    const uchar* ogn_ptr;
+    std::vector<uchar> vec_zeros(cnes, 0);
+    for(int i = 0; i < cal_height; i++)
+    {
+        int y = borderInterpolate(i + cal_y, full_height, ctx->borderType);
+        if(y < 0) {
+            memset(pad_data + i * pad_step, 0, cnes * cal_width);
+            continue;
+        }
+
+        // left border
+        int j = 0;
+        int16x4_t vj = {0, 1, 2, 3};
+        vj += saturate_cast<short>(cal_x);
+        for(; j + cal_x < -4; j += 4, vj += 4)
+        {
+            int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
+            for(int k = 0; k < 4; k++) {
+                if(vx[k] < 0) // border constant return value -1
+                    ogn_ptr = &vec_zeros[0];
+                else
+                    ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
+                pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
+                memcpy(pad_ptr, ogn_ptr, cnes);
+            }
+        }
+        for(; j + cal_x < 0; j++)
+        {
+            int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cnes;
+            pad_ptr = pad_data + i * pad_step + j * cnes;
+            memcpy(pad_ptr, ogn_ptr, cnes);
+        }
+
+        // center
+        int rborder = MIN(cal_width, full_width - cal_x);
+        ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes;
+        pad_ptr = pad_data + i * pad_step + j * cnes;
+        memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j));
+
+        // right border
+        j = rborder;
+        vj = (int16x4_t){0, 1, 2, 3} + saturate_cast<short>(cal_x + rborder);
+        for(; j <= cal_width - 4; j += 4, vj += 4)
+        {
+            int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
+            for(int k = 0; k < 4; k++) {
+                if(vx[k] < 0) // border constant return value -1
+                    ogn_ptr = &vec_zeros[0];
+                else
+                    ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
+                pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
+                memcpy(pad_ptr, ogn_ptr, cnes);
+            }
+        }
+        for(; j < cal_width; j++)
+        {
+            int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cnes;
+            pad_ptr = pad_data + i * pad_step + j * cnes;
+            memcpy(pad_ptr, ogn_ptr, cnes);
+        }
+    }
+
+    // prepare the pointers
+    int i, k, count, nz = ctx->nz;
+    const uchar* ker_pts = &ctx->coords[0];
+    const float* ker_cfs = &ctx->coeffs[0];
+
+    if( ddepth == CV_8U )
+    {
+        std::vector<uchar*> src_ptrarr;
+        src_ptrarr.resize(nz);
+        uchar** src_ptrs = &src_ptrarr[0];
+        uchar* dst_row = dst_data;
+        uchar* pad_row = pad_data;
+
+        for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
+        {
+            for( k = 0; k < nz; k++ )
+                src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes;
+
+            i = 0;
+            for( ; i <= width * cnes - 8; i += 8 )
+            {
+                float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat};
+                for( k = 0; k < nz; k++ ) {
+                    float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
+                    // experimental code
+                    // ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs);
+                    // ndsrvp_f32_add8(vs0, vker_cfs, vs0);
+                    vs0[0] += vker_cfs[0] * src_ptrs[k][i];
+                    vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
+                    vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
+                    vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
+                    vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4];
+                    vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5];
+                    vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6];
+                    vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7];
+                }
+                dst_row[i] = saturate_cast<uchar>(vs0[0]);
+                dst_row[i + 1] = saturate_cast<uchar>(vs0[1]);
+                dst_row[i + 2] = saturate_cast<uchar>(vs0[2]);
+                dst_row[i + 3] = saturate_cast<uchar>(vs0[3]);
+                dst_row[i + 4] = saturate_cast<uchar>(vs0[4]);
+                dst_row[i + 5] = saturate_cast<uchar>(vs0[5]);
+                dst_row[i + 6] = saturate_cast<uchar>(vs0[6]);
+                dst_row[i + 7] = saturate_cast<uchar>(vs0[7]);
+            }
+            for( ; i < width * cnes; i++ )
+            {
+                float s0 = delta_sat;
+                for( k = 0; k < nz; k++ ) {
+                    s0 += ker_cfs[k] * src_ptrs[k][i];
+                }
+                dst_row[i] = saturate_cast<uchar>(s0);
+            }
+        }
+    }
+    else if( ddepth == CV_16U )
+    {
+        std::vector<ushort*> src_ptrarr;
+        src_ptrarr.resize(nz);
+        ushort** src_ptrs = &src_ptrarr[0];
+        uchar* dst_row = dst_data;
+        uchar* pad_row = pad_data;
+
+        for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
+        {
+            for( k = 0; k < nz; k++ )
+                src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes);
+
+            i = 0;
+            for( ; i <= width * cn - 4; i += 4 )
+            {
+                float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat};
+                for( k = 0; k < nz; k++ ) {
+                    float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
+                    vs0[0] += vker_cfs[0] * src_ptrs[k][i];
+                    vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
+                    vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
+                    vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
+                }
+                ushort* dst_row_ptr = (ushort*)dst_row;
+                dst_row_ptr[i] = saturate_cast<ushort>(vs0[0]);
+                dst_row_ptr[i + 1] = saturate_cast<ushort>(vs0[1]);
+                dst_row_ptr[i + 2] = saturate_cast<ushort>(vs0[2]);
+                dst_row_ptr[i + 3] = saturate_cast<ushort>(vs0[3]);
+            }
+            for( ; i < width * cn; i++ )
+            {
+                float s0 = delta_sat;
+                for( k = 0; k < nz; k++ ) {
+                    s0 += ker_cfs[k] * src_ptrs[k][i];
+                }
+                ((ushort*)dst_row)[i] = saturate_cast<ushort>(s0);
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Releases the FilterData object behind the opaque filter handle.
+// The handle is cast back to FilterData* and deleted; the call always
+// reports CV_HAL_ERROR_OK.
+int filterFree(cvhalFilter2D *context)
+{
+    delete (FilterData*)context;
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
--- a/3rdparty/ndsrvp/src/medianBlur.cpp
+++ b/3rdparty/ndsrvp/src/medianBlur.cpp
@ -0,0 +1,300 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// Compare-exchange primitives for the median sorting networks below.
+// After every call `a` holds the (element-wise) minimum and `b` the
+// (element-wise) maximum of the two values passed in. There is one
+// vector/scalar overload pair per element type: u8, s8, u16 and s16.
+struct operators_minmax_t {
+    // ---- unsigned 8-bit ----
+    inline void vector(uint8x8_t & a, uint8x8_t & b) const {
+        uint8x8_t lo = __nds__v_umin8(a, b);
+        uint8x8_t hi = __nds__v_umax8(a, b);
+        a = lo;
+        b = hi;
+    }
+    inline void scalar(uchar & a, uchar & b) const {
+        uchar lo = __nds__umin8(a, b);
+        uchar hi = __nds__umax8(a, b);
+        a = lo;
+        b = hi;
+    }
+
+    // ---- signed 8-bit ----
+    inline void vector(int8x8_t & a, int8x8_t & b) const {
+        int8x8_t lo = __nds__v_smin8(a, b);
+        int8x8_t hi = __nds__v_smax8(a, b);
+        a = lo;
+        b = hi;
+    }
+    inline void scalar(schar & a, schar & b) const {
+        schar lo = __nds__smin8(a, b);
+        schar hi = __nds__smax8(a, b);
+        a = lo;
+        b = hi;
+    }
+
+    // ---- unsigned 16-bit ----
+    inline void vector(uint16x4_t & a, uint16x4_t & b) const {
+        uint16x4_t lo = __nds__v_umin16(a, b);
+        uint16x4_t hi = __nds__v_umax16(a, b);
+        a = lo;
+        b = hi;
+    }
+    inline void scalar(ushort & a, ushort & b) const {
+        ushort lo = __nds__umin16(a, b);
+        ushort hi = __nds__umax16(a, b);
+        a = lo;
+        b = hi;
+    }
+
+    // ---- signed 16-bit ----
+    inline void vector(int16x4_t & a, int16x4_t & b) const {
+        int16x4_t lo = __nds__v_smin16(a, b);
+        int16x4_t hi = __nds__v_smax16(a, b);
+        a = lo;
+        b = hi;
+    }
+    inline void scalar(short & a, short & b) const {
+        short lo = __nds__smin16(a, b);
+        short hi = __nds__smax16(a, b);
+        a = lo;
+        b = hi;
+    }
+};
+
+// Median filter for 3x3 and 5x5 windows implemented with fixed sequences of
+// compare-exchange operations (operators_minmax_t).
+//
+// Template parameters:
+//   T  - element type of the image
+//   WT - "widen type"; not referenced anywhere in this body
+//   VT - 8-byte vector type used by the vectorised inner loops
+//
+// Parameters:
+//   src_data, src_step - source image and its row stride in bytes
+//   dst_data, dst_step - destination image and its row stride in bytes
+//   width, height      - image size in pixels
+//   cn                 - number of interleaved channels
+//   ksize              - window size; only 3 and 5 do any work, every other
+//                        value falls through and leaves dst untouched
+//
+// Neighbours outside the image are obtained by clamping the row/column index
+// to the nearest valid one, i.e. the border pixels are repeated.
+template<typename T, typename WT, typename VT> // type, widen type, vector type
+static void
+medianBlur_SortNet( const uchar* src_data, size_t src_step,
+                    uchar* dst_data, size_t dst_step,
+                    int width, int height, int cn, int ksize )
+{
+    const T* src = (T*)src_data;
+    T* dst = (T*)dst_data;
+    // strides converted from bytes to elements of T
+    int sstep = (int)(src_step / sizeof(T));
+    int dstep = (int)(dst_step / sizeof(T));
+    int i, j, k;
+    operators_minmax_t op;
+
+    if( ksize == 3 )
+    {
+        // Degenerate image (single row or single column): 3-tap median along
+        // the only axis that has more than one pixel.
+        if( width == 1 || height == 1 )
+        {
+            int len = width + height - 1;
+            // sdelta: element distance between neighbouring pixels along the line
+            int sdelta = height == 1 ? cn : sstep;
+            // sdelta0: extra advance after the inner loop already moved src by cn
+            int sdelta0 = height == 1 ? 0 : sstep - cn;
+            int ddelta = height == 1 ? cn : dstep;
+
+            for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
+                for( j = 0; j < cn; j++, src++ )
+                {
+                    // previous / current / next sample; the ends reuse the current one
+                    T p0 = src[i > 0 ? -sdelta : 0];
+                    T p1 = src[0];
+                    T p2 = src[i < len - 1 ? sdelta : 0];
+
+                    // three compare-exchanges leave the median in p1
+                    op.scalar(p0, p1); op.scalar(p1, p2); op.scalar(p0, p1);
+                    dst[j] = (T)p1;
+                }
+            return;
+        }
+
+        // from here on `width` counts elements per row, not pixels
+        width *= cn;
+        for( i = 0; i < height; i++, dst += dstep )
+        {
+            // rows above / at / below, clamped to the image
+            const T* row0 = src + std::max(i - 1, 0)*sstep;
+            const T* row1 = src + i*sstep;
+            const T* row2 = src + std::min(i + 1, height-1)*sstep;
+            // The row is processed in up to three stages driven by `limit`:
+            // scalar code for the first pixel (j < cn), then the vector loop,
+            // then scalar code again for whatever is left up to `width`.
+            int limit = cn;
+
+            for(j = 0;; )
+            {
+                for( ; j < limit; j++ )
+                {
+                    // left / right neighbour columns, clamped at the row ends
+                    int j0 = j >= cn ? j - cn : j;
+                    int j2 = j < width - cn ? j + cn : j;
+                    T p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
+                    T p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
+                    T p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
+
+                    // fixed compare-exchange sequence over the 9 samples;
+                    // the median ends up in p4
+                    op.scalar(p1, p2); op.scalar(p4, p5); op.scalar(p7, p8); op.scalar(p0, p1);
+                    op.scalar(p3, p4); op.scalar(p6, p7); op.scalar(p1, p2); op.scalar(p4, p5);
+                    op.scalar(p7, p8); op.scalar(p0, p3); op.scalar(p5, p8); op.scalar(p4, p7);
+                    op.scalar(p3, p6); op.scalar(p1, p4); op.scalar(p2, p5); op.scalar(p4, p7);
+                    op.scalar(p4, p2); op.scalar(p6, p4); op.scalar(p4, p2);
+                    dst[j] = (T)p4;
+                }
+
+                if( limit == width )
+                    break;
+
+                // number of T elements that fit into one 8-byte vector
+                int nlanes = 8 / sizeof(T);
+
+                // Vector stage: same network on nlanes elements at once. It only
+                // runs when cn is a multiple of nlanes and stops early enough
+                // that the load at j+cn stays inside the row.
+                for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn); j += nlanes ) // alignment
+                {
+                    VT p0 = *(VT*)(row0+j-cn), p1 = *(VT*)(row0+j), p2 = *(VT*)(row0+j+cn);
+                    VT p3 = *(VT*)(row1+j-cn), p4 = *(VT*)(row1+j), p5 = *(VT*)(row1+j+cn);
+                    VT p6 = *(VT*)(row2+j-cn), p7 = *(VT*)(row2+j), p8 = *(VT*)(row2+j+cn);
+
+                    op.vector(p1, p2); op.vector(p4, p5); op.vector(p7, p8); op.vector(p0, p1);
+                    op.vector(p3, p4); op.vector(p6, p7); op.vector(p1, p2); op.vector(p4, p5);
+                    op.vector(p7, p8); op.vector(p0, p3); op.vector(p5, p8); op.vector(p4, p7);
+                    op.vector(p3, p6); op.vector(p1, p4); op.vector(p2, p5); op.vector(p4, p7);
+                    op.vector(p4, p2); op.vector(p6, p4); op.vector(p4, p2);
+                    *(VT*)(dst+j) = p4;
+                }
+
+                // let the scalar loop finish the remainder of the row
+                limit = width;
+            }
+        }
+    }
+    else if( ksize == 5 )
+    {
+        // Degenerate image (single row or single column): 5-tap median along
+        // the only axis that has more than one pixel.
+        if( width == 1 || height == 1 )
+        {
+            int len = width + height - 1;
+            int sdelta = height == 1 ? cn : sstep;
+            int sdelta0 = height == 1 ? 0 : sstep - cn;
+            int ddelta = height == 1 ? cn : dstep;
+
+            for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
+                for( j = 0; j < cn; j++, src++ )
+                {
+                    // offsets of the two samples before and after the current
+                    // one; near the ends they collapse onto the nearest valid sample
+                    int i1 = i > 0 ? -sdelta : 0;
+                    int i0 = i > 1 ? -sdelta*2 : i1;
+                    int i3 = i < len-1 ? sdelta : 0;
+                    int i4 = i < len-2 ? sdelta*2 : i3;
+                    T p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
+
+                    // compare-exchange sequence over 5 samples; the result is taken from p2
+                    op.scalar(p0, p1); op.scalar(p3, p4); op.scalar(p2, p3); op.scalar(p3, p4); op.scalar(p0, p2);
+                    op.scalar(p2, p4); op.scalar(p1, p3); op.scalar(p1, p2);
+                    dst[j] = (T)p2;
+                }
+            return;
+        }
+
+        // from here on `width` counts elements per row, not pixels
+        width *= cn;
+        for( i = 0; i < height; i++, dst += dstep )
+        {
+            // the five source rows around row i, clamped to the image
+            const T* row[5];
+            row[0] = src + std::max(i - 2, 0)*sstep;
+            row[1] = src + std::max(i - 1, 0)*sstep;
+            row[2] = src + i*sstep;
+            row[3] = src + std::min(i + 1, height-1)*sstep;
+            row[4] = src + std::min(i + 2, height-1)*sstep;
+            // same staged scheme as the 3x3 case, but the leading scalar part
+            // covers the first two pixels (j < cn*2)
+            int limit = cn*2;
+
+            for(j = 0;; )
+            {
+                for( ; j < limit; j++ )
+                {
+                    T p[25];
+                    // neighbour columns at -2, -1, +1, +2 pixels, clamped at the row ends
+                    int j1 = j >= cn ? j - cn : j;
+                    int j0 = j >= cn*2 ? j - cn*2 : j1;
+                    int j3 = j < width - cn ? j + cn : j;
+                    int j4 = j < width - cn*2 ? j + cn*2 : j3;
+                    // gather the 5x5 window: p[row*5 + column]
+                    for( k = 0; k < 5; k++ )
+                    {
+                        const T* rowk = row[k];
+                        p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
+                        p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
+                        p[k*5+4] = rowk[j4];
+                    }
+
+                    // fixed compare-exchange sequence over the 25 samples;
+                    // the median is read from p[12]
+                    op.scalar(p[1], p[2]); op.scalar(p[0], p[1]); op.scalar(p[1], p[2]); op.scalar(p[4], p[5]); op.scalar(p[3], p[4]);
+                    op.scalar(p[4], p[5]); op.scalar(p[0], p[3]); op.scalar(p[2], p[5]); op.scalar(p[2], p[3]); op.scalar(p[1], p[4]);
+                    op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[7], p[8]); op.scalar(p[6], p[7]); op.scalar(p[7], p[8]);
+                    op.scalar(p[10], p[11]); op.scalar(p[9], p[10]); op.scalar(p[10], p[11]); op.scalar(p[6], p[9]); op.scalar(p[8], p[11]);
+                    op.scalar(p[8], p[9]); op.scalar(p[7], p[10]); op.scalar(p[7], p[8]); op.scalar(p[9], p[10]); op.scalar(p[0], p[6]);
+                    op.scalar(p[4], p[10]); op.scalar(p[4], p[6]); op.scalar(p[2], p[8]); op.scalar(p[2], p[4]); op.scalar(p[6], p[8]);
+                    op.scalar(p[1], p[7]); op.scalar(p[5], p[11]); op.scalar(p[5], p[7]); op.scalar(p[3], p[9]); op.scalar(p[3], p[5]);
+                    op.scalar(p[7], p[9]); op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[5], p[6]); op.scalar(p[7], p[8]);
+                    op.scalar(p[9], p[10]); op.scalar(p[13], p[14]); op.scalar(p[12], p[13]); op.scalar(p[13], p[14]); op.scalar(p[16], p[17]);
+                    op.scalar(p[15], p[16]); op.scalar(p[16], p[17]); op.scalar(p[12], p[15]); op.scalar(p[14], p[17]); op.scalar(p[14], p[15]);
+                    op.scalar(p[13], p[16]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]); op.scalar(p[19], p[20]); op.scalar(p[18], p[19]);
+                    op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[21], p[23]); op.scalar(p[22], p[24]);
+                    op.scalar(p[22], p[23]); op.scalar(p[18], p[21]); op.scalar(p[20], p[23]); op.scalar(p[20], p[21]); op.scalar(p[19], p[22]);
+                    op.scalar(p[22], p[24]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[12], p[18]);
+                    op.scalar(p[16], p[22]); op.scalar(p[16], p[18]); op.scalar(p[14], p[20]); op.scalar(p[20], p[24]); op.scalar(p[14], p[16]);
+                    op.scalar(p[18], p[20]); op.scalar(p[22], p[24]); op.scalar(p[13], p[19]); op.scalar(p[17], p[23]); op.scalar(p[17], p[19]);
+                    op.scalar(p[15], p[21]); op.scalar(p[15], p[17]); op.scalar(p[19], p[21]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]);
+                    op.scalar(p[17], p[18]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[0], p[12]);
+                    op.scalar(p[8], p[20]); op.scalar(p[8], p[12]); op.scalar(p[4], p[16]); op.scalar(p[16], p[24]); op.scalar(p[12], p[16]);
+                    op.scalar(p[2], p[14]); op.scalar(p[10], p[22]); op.scalar(p[10], p[14]); op.scalar(p[6], p[18]); op.scalar(p[6], p[10]);
+                    op.scalar(p[10], p[12]); op.scalar(p[1], p[13]); op.scalar(p[9], p[21]); op.scalar(p[9], p[13]); op.scalar(p[5], p[17]);
+                    op.scalar(p[13], p[17]); op.scalar(p[3], p[15]); op.scalar(p[11], p[23]); op.scalar(p[11], p[15]); op.scalar(p[7], p[19]);
+                    op.scalar(p[7], p[11]); op.scalar(p[11], p[13]); op.scalar(p[11], p[12]);
+                    dst[j] = (T)p[12];
+                }
+
+                if( limit == width )
+                    break;
+
+                // number of T elements that fit into one 8-byte vector
+                int nlanes = 8 / sizeof(T);
+
+                // Vector stage: only when cn is a multiple of nlanes, and only
+                // while the load at j+cn*2 stays inside the row. pN holds the
+                // window sample with index N (row*5 + column) for nlanes outputs.
+                for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn*2); j += nlanes )
+                {
+                    VT p0 = *(VT*)(row[0]+j-cn*2), p5 = *(VT*)(row[1]+j-cn*2), p10 = *(VT*)(row[2]+j-cn*2), p15 = *(VT*)(row[3]+j-cn*2), p20 = *(VT*)(row[4]+j-cn*2);
+                    VT p1 = *(VT*)(row[0]+j-cn*1), p6 = *(VT*)(row[1]+j-cn*1), p11 = *(VT*)(row[2]+j-cn*1), p16 = *(VT*)(row[3]+j-cn*1), p21 = *(VT*)(row[4]+j-cn*1);
+                    VT p2 = *(VT*)(row[0]+j-cn*0), p7 = *(VT*)(row[1]+j-cn*0), p12 = *(VT*)(row[2]+j-cn*0), p17 = *(VT*)(row[3]+j-cn*0), p22 = *(VT*)(row[4]+j-cn*0);
+                    VT p3 = *(VT*)(row[0]+j+cn*1), p8 = *(VT*)(row[1]+j+cn*1), p13 = *(VT*)(row[2]+j+cn*1), p18 = *(VT*)(row[3]+j+cn*1), p23 = *(VT*)(row[4]+j+cn*1);
+                    VT p4 = *(VT*)(row[0]+j+cn*2), p9 = *(VT*)(row[1]+j+cn*2), p14 = *(VT*)(row[2]+j+cn*2), p19 = *(VT*)(row[3]+j+cn*2), p24 = *(VT*)(row[4]+j+cn*2);
+
+                    op.vector(p1, p2); op.vector(p0, p1); op.vector(p1, p2); op.vector(p4, p5); op.vector(p3, p4);
+                    op.vector(p4, p5); op.vector(p0, p3); op.vector(p2, p5); op.vector(p2, p3); op.vector(p1, p4);
+                    op.vector(p1, p2); op.vector(p3, p4); op.vector(p7, p8); op.vector(p6, p7); op.vector(p7, p8);
+                    op.vector(p10, p11); op.vector(p9, p10); op.vector(p10, p11); op.vector(p6, p9); op.vector(p8, p11);
+                    op.vector(p8, p9); op.vector(p7, p10); op.vector(p7, p8); op.vector(p9, p10); op.vector(p0, p6);
+                    op.vector(p4, p10); op.vector(p4, p6); op.vector(p2, p8); op.vector(p2, p4); op.vector(p6, p8);
+                    op.vector(p1, p7); op.vector(p5, p11); op.vector(p5, p7); op.vector(p3, p9); op.vector(p3, p5);
+                    op.vector(p7, p9); op.vector(p1, p2); op.vector(p3, p4); op.vector(p5, p6); op.vector(p7, p8);
+                    op.vector(p9, p10); op.vector(p13, p14); op.vector(p12, p13); op.vector(p13, p14); op.vector(p16, p17);
+                    op.vector(p15, p16); op.vector(p16, p17); op.vector(p12, p15); op.vector(p14, p17); op.vector(p14, p15);
+                    op.vector(p13, p16); op.vector(p13, p14); op.vector(p15, p16); op.vector(p19, p20); op.vector(p18, p19);
+                    op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p21, p23); op.vector(p22, p24);
+                    op.vector(p22, p23); op.vector(p18, p21); op.vector(p20, p23); op.vector(p20, p21); op.vector(p19, p22);
+                    op.vector(p22, p24); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p12, p18);
+                    op.vector(p16, p22); op.vector(p16, p18); op.vector(p14, p20); op.vector(p20, p24); op.vector(p14, p16);
+                    op.vector(p18, p20); op.vector(p22, p24); op.vector(p13, p19); op.vector(p17, p23); op.vector(p17, p19);
+                    op.vector(p15, p21); op.vector(p15, p17); op.vector(p19, p21); op.vector(p13, p14); op.vector(p15, p16);
+                    op.vector(p17, p18); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p0, p12);
+                    op.vector(p8, p20); op.vector(p8, p12); op.vector(p4, p16); op.vector(p16, p24); op.vector(p12, p16);
+                    op.vector(p2, p14); op.vector(p10, p22); op.vector(p10, p14); op.vector(p6, p18); op.vector(p6, p10);
+                    op.vector(p10, p12); op.vector(p1, p13); op.vector(p9, p21); op.vector(p9, p13); op.vector(p5, p17);
+                    op.vector(p13, p17); op.vector(p3, p15); op.vector(p11, p23); op.vector(p11, p15); op.vector(p7, p19);
+                    op.vector(p7, p11); op.vector(p11, p13); op.vector(p11, p12);
+                    *(VT*)(dst+j) = p12;
+                }
+
+                // let the scalar loop finish the remainder of the row
+                limit = width;
+            }
+        }
+    }
+}
+
+// Median filter entry point of the NDSRVP HAL.
+//
+// Handled cases: ksize == 3, or ksize == 5 when depth > CV_8U, cn == 2 or
+// cn > 4, for depths CV_8U, CV_8S, CV_16U and CV_16S. Any other combination
+// returns CV_HAL_ERROR_NOT_IMPLEMENTED without touching dst_data.
+//
+//   src_data, src_step - source image and its row stride in bytes
+//   dst_data, dst_step - destination image and its row stride in bytes
+//   width, height      - image size in pixels
+//   depth, cn          - element depth (CV_*) and channel count
+//   ksize              - window size
+//
+// In-place calls (dst_data == src_data) are supported: the source is copied
+// into a temporary buffer first, because every output pixel needs unmodified
+// neighbours.
+int medianBlur(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int depth, int cn, int ksize)
+{
+    const bool useSortNet = ((ksize == 3) || (ksize == 5 && ( depth > CV_8U || cn == 2 || cn > 4 )));
+    if( !useSortNet )
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // Reject unsupported depths before doing any work (in particular before
+    // the potentially large in-place copy below).
+    size_t elem_size = 0;
+    switch( depth )
+    {
+    case CV_8U:
+    case CV_8S:
+        elem_size = 1;
+        break;
+    case CV_16U:
+    case CV_16S:
+        elem_size = 2;
+        break;
+    default:
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // The copy must stay alive until the filter has finished, so the buffer
+    // lives at function scope rather than inside the `if` that fills it.
+    std::vector<uchar> src_data_copy;
+    const uchar* src_data_rep = src_data;
+    if( dst_data == src_data && width > 0 && height > 0 )
+    {
+        // Copy exactly the bytes the filter reads: full strides for all rows
+        // but the last, and only the pixel payload of the last row. Copying
+        // src_step * height could run past the end of the buffer when the
+        // image is a region inside a larger allocation.
+        const size_t row_bytes = (size_t)width * (size_t)cn * elem_size;
+        const size_t copy_bytes = src_step * (size_t)(height - 1) + row_bytes;
+        src_data_copy.assign(src_data, src_data + copy_bytes);
+        src_data_rep = src_data_copy.data();
+    }
+
+    switch( depth )
+    {
+    case CV_8U:
+        medianBlur_SortNet<uchar, int, uint8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
+        break;
+    case CV_8S:
+        medianBlur_SortNet<schar, int, int8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
+        break;
+    case CV_16U:
+        medianBlur_SortNet<ushort, int, uint16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
+        break;
+    default: // CV_16S, everything else was rejected above
+        medianBlur_SortNet<short, int, int16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
+        break;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
--- a/3rdparty/openvx/hal/CMakeLists.txt
+++ b/3rdparty/openvx/hal/CMakeLists.txt
@ -4,6 +4,7 @@ target_include_directories(openvx_hal PUBLIC
  ${OPENCV_3P_OPENVX_DIR}/include
  ${CMAKE_SOURCE_DIR}/modules/core/include
  ${CMAKE_SOURCE_DIR}/modules/imgproc/include
+  ${CMAKE_SOURCE_DIR}/modules/features2d/include
  ${OPENVX_INCLUDE_DIR})
 target_link_libraries(openvx_hal PUBLIC ${OPENVX_LIBRARIES})
 set_target_properties(openvx_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
--- a/3rdparty/openvx/hal/openvx_hal.cpp
+++ b/3rdparty/openvx/hal/openvx_hal.cpp
@ -1,5 +1,7 @@
 #include "openvx_hal.hpp"
+#include "opencv2/core/hal/interface.h"
 #include "opencv2/imgproc/hal/interface.h"
+#include "opencv2/features2d/hal/interface.h"

 #define IVX_HIDE_INFO_WARNINGS
 #include "ivx.hpp"
@ -191,7 +193,7 @@ int ovx_hal_mul(const T *a, size_t astep, const T *b, size_t bstep, T *c, size_t
 #ifdef _WIN32
    const float MAGIC_SCALE = 0x0.01010102p0;
 #else
-    const float MAGIC_SCALE = 0x1.010102p-8;
+    const float MAGIC_SCALE = 0.003922; // 0x1.010102p-8;
 #endif
    try
    {
@ -1145,3 +1147,931 @@ int ovx_hal_integral(int depth, int sdepth, int, const uchar * a, size_t astep,

    return CV_HAL_ERROR_OK;
 }
+
+// Mean / standard deviation of an 8-bit single-channel image via OpenVX
+// (vxuMeanStdDev).
+//
+//   src_data, src_step - source image and its row stride in bytes; a step of 0
+//                        is replaced by `width`
+//   width, height      - image size in pixels
+//   src_type           - OpenCV type; only CV_8UC1 is handled
+//   mean_val           - optional output, element 0 receives the mean
+//   stddev_val         - optional output, element 0 receives the std deviation
+//   mask, mask_step    - masked operation is not handled (mask must be null)
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the
+// request is not handled here (so the caller can fall back), and
+// CV_HAL_ERROR_UNKNOWN when the OpenVX wrapper throws.
+int ovx_hal_meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
+                       int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step)
+{
+    (void)mask_step;
+
+    if (src_type != CV_8UC1 || mask)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (skipSmallImages<VX_KERNEL_MEAN_STDDEV>(width, height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (src_step == 0)
+    {
+        // one byte per pixel, so a packed row is `width` bytes
+        src_step = (size_t)width;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+#ifndef VX_VERSION_1_1
+        // Do not use OpenVX meanStdDev estimation for sample 1.0.1 implementation due to lack of accuracy.
+        // This must be reported as "not implemented": returning `false` here
+        // would convert to 0, which callers read as success with valid outputs.
+        if (ctx.vendorID() == VX_ID_KHRONOS)
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+#endif
+
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step), const_cast<uchar*>(src_data));
+
+        vx_float32 mean_temp, stddev_temp;
+        ivx::IVX_CHECK_STATUS(vxuMeanStdDev(ctx, ia, &mean_temp, &stddev_temp));
+
+        if (mean_val)
+        {
+            mean_val[0] = mean_temp;
+        }
+
+        if (stddev_val)
+        {
+            stddev_val[0] = stddev_temp;
+        }
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Table lookup through OpenVX (vxuTableLookup).
+//
+// Only CV_8UC1 sources with a single-channel table whose lut_channel_size is 1
+// are handled; anything else, and images rejected by skipSmallImages, yields
+// CV_HAL_ERROR_NOT_IMPLEMENTED. Exceptions thrown by the OpenVX wrapper are
+// reported through PRINT_HALERR_MSG and mapped to CV_HAL_ERROR_UNKNOWN.
+//
+//   src_data, src_step - source image and its row stride in bytes
+//   lut_data           - table contents copied into the OpenVX LUT object
+//   dst_data, dst_step - destination image and its row stride in bytes
+//   width, height      - image size in pixels
+int ovx_hal_lut(const uchar *src_data, size_t src_step, size_t src_type,
+                const uchar* lut_data, size_t lut_channel_size, size_t lut_channels,
+                uchar *dst_data, size_t dst_step, int width, int height)
+{
+    const bool supported = !(src_type != CV_8UC1 || lut_channels != 1 || lut_channel_size != 1);
+    if (!supported || skipSmallImages<VX_KERNEL_TABLE_LOOKUP>(width, height))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    try
+    {
+        ivx::Context vxContext = getOpenVXHALContext();
+
+        // wrap the caller's buffers; no pixel data is copied here
+        ivx::Image srcImage = ivx::Image::createFromHandle(
+            vxContext, VX_DF_IMAGE_U8,
+            ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+            const_cast<uchar*>(src_data));
+        ivx::Image dstImage = ivx::Image::createFromHandle(
+            vxContext, VX_DF_IMAGE_U8,
+            ivx::Image::createAddressing(width, height, 1, (vx_int32)dst_step),
+            dst_data);
+
+        ivx::LUT table = ivx::LUT::create(vxContext);
+        table.copyFrom(lut_data);
+
+        ivx::IVX_CHECK_STATUS(vxuTableLookup(vxContext, srcImage, table, dstImage));
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// MinMaxLoc kernel: images with fewer pixels than a 3840x2160 frame are skipped.
+template <> inline bool skipSmallImages<VX_KERNEL_MINMAXLOC>(int w, int h)
+{
+    const int pixels = w*h;
+    return pixels < 3840 * 2160;
+}
+
+// Global minimum / maximum (and optionally their locations) via OpenVX
+// (vxuMinMaxLoc).
+//
+//   src_data, src_step - source image and its row stride in bytes; a step of 0
+//                        is replaced by the size of one packed row
+//   width, height      - image size in pixels
+//   depth              - CV_8U or CV_16S; other depths are not handled
+//   minVal, maxVal     - optional outputs for the extreme values
+//   minIdx, maxIdx     - optional outputs; [0] receives the row (y) and [1]
+//                        the column (x) of one extreme location
+//   mask, mask_step    - masked operation is not handled (mask must be null)
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the
+// request is not handled here, and CV_HAL_ERROR_UNKNOWN when the OpenVX
+// wrapper throws (including the case where no location was reported).
+int ovx_hal_minMaxIdxMaskStep(const uchar* src_data, size_t src_step, int width, int height, int depth,
+                              double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step)
+{
+    (void)mask_step;
+
+    if ((depth != CV_8U && depth != CV_16S) || mask )
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (skipSmallImages<VX_KERNEL_MINMAXLOC>(width, height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // bytes per pixel for the two supported depths
+    const int pixelSize = depth == CV_8U ? 1 : 2;
+
+    if (src_step == 0)
+    {
+        // A packed row is width * pixelSize bytes; using plain `width` would
+        // describe only half a row for 16-bit data.
+        src_step = (size_t)width * (size_t)pixelSize;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, depth == CV_8U ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16,
+                                                     ivx::Image::createAddressing(width, height, pixelSize, (vx_int32)src_step),
+                                                     const_cast<uchar*>(src_data));
+
+        ivx::Scalar vxMinVal = ivx::Scalar::create(ctx, depth == CV_8U ? VX_TYPE_UINT8 : VX_TYPE_INT16, 0);
+        ivx::Scalar vxMaxVal = ivx::Scalar::create(ctx, depth == CV_8U ? VX_TYPE_UINT8 : VX_TYPE_INT16, 0);
+        // location outputs are only created when the caller asked for them
+        ivx::Array vxMinInd, vxMaxInd;
+        ivx::Scalar vxMinCount, vxMaxCount;
+        if (minIdx)
+        {
+            vxMinInd = ivx::Array::create(ctx, VX_TYPE_COORDINATES2D, 1);
+            vxMinCount = ivx::Scalar::create(ctx, VX_TYPE_UINT32, 0);
+        }
+        if (maxIdx)
+        {
+            vxMaxInd = ivx::Array::create(ctx, VX_TYPE_COORDINATES2D, 1);
+            vxMaxCount = ivx::Scalar::create(ctx, VX_TYPE_UINT32, 0);
+        }
+
+        ivx::IVX_CHECK_STATUS(vxuMinMaxLoc(ctx, ia, vxMinVal, vxMaxVal, vxMinInd, vxMaxInd, vxMinCount, vxMaxCount));
+
+        if (minVal)
+        {
+            *minVal = depth == CV_8U ? vxMinVal.getValue<vx_uint8>() : vxMinVal.getValue<vx_int16>();
+        }
+        if (maxVal)
+        {
+            *maxVal = depth == CV_8U ? vxMaxVal.getValue<vx_uint8>() : vxMaxVal.getValue<vx_int16>();
+        }
+        if (minIdx)
+        {
+            if(vxMinCount.getValue<vx_uint32>()<1) throw ivx::RuntimeError(VX_ERROR_INVALID_VALUE, std::string(__func__) + "(): minimum value location not found");
+            vx_coordinates2d_t loc;
+            vxMinInd.copyRangeTo(0, 1, &loc);
+            // OpenCV index order is (row, column)
+            minIdx[0] = loc.y;
+            minIdx[1] = loc.x;
+        }
+        if (maxIdx)
+        {
+            if (vxMaxCount.getValue<vx_uint32>()<1) throw ivx::RuntimeError(VX_ERROR_INVALID_VALUE, std::string(__func__) + "(): maximum value location not found");
+            vx_coordinates2d_t loc;
+            vxMaxInd.copyRangeTo(0, 1, &loc);
+            maxIdx[0] = loc.y;
+            maxIdx[1] = loc.x;
+        }
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// FAST corners kernel: images with fewer pixels than an 800x600 frame are skipped.
+template <> inline bool skipSmallImages<VX_KERNEL_FAST_CORNERS>(int w, int h)
+{
+    const int pixels = w*h;
+    return pixels < 800 * 600;
+}
+
+// FAST corner detection through OpenVX (vxuFastCorners).
+//
+//   src_data, src_step - 8-bit source image and its row stride in bytes
+//   width, height      - image size in pixels
+//   keypoints_data     - output buffer, written as an array of cvhalKeyPoint
+//   keypoints_count    - in: number of entries the buffer can take;
+//                        out: number of entries actually written
+//   threshold          - detector threshold, handed to OpenVX as a float scalar
+//   nonmax_suppression - must be false (see below)
+//   dtype              - must be CV_HAL_TYPE_9_16
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED for
+// unsupported modes or images rejected by skipSmallImages, and
+// CV_HAL_ERROR_UNKNOWN when the OpenVX wrapper throws.
+int ovx_hal_FAST(const uchar* src_data, size_t src_step, int width, int height, uchar* keypoints_data, size_t* keypoints_count,
+                 int threshold, bool nonmax_suppression, int /*cv::FastFeatureDetector::DetectorType*/ dtype)
+{
+    // Nonmax suppression is done differently in OpenCV than in OpenVX
+    // 9/16 is the only supported mode in OpenVX
+    const bool supportedMode = !nonmax_suppression && dtype == CV_HAL_TYPE_9_16;
+    if (!supportedMode || skipSmallImages<VX_KERNEL_FAST_CORNERS>(width, height))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    try
+    {
+        ivx::Context vxContext = getOpenVXHALContext();
+        ivx::Image srcImage = ivx::Image::createFromHandle(
+            vxContext, VX_DF_IMAGE_U8,
+            ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+            const_cast<uchar*>(src_data));
+
+        ivx::Scalar strengthThreshold = ivx::Scalar::create<VX_TYPE_FLOAT32>(vxContext, threshold);
+        // room for one corner per pixel
+        vx_size capacity = width * height;
+        ivx::Array cornerArray = ivx::Array::create(vxContext, VX_TYPE_KEYPOINT, capacity);
+        ivx::Scalar cornerCount = ivx::Scalar::create<VX_TYPE_SIZE>(vxContext, 0);
+
+        ivx::IVX_CHECK_STATUS(vxuFastCorners(vxContext, srcImage, strengthThreshold, (vx_bool)nonmax_suppression, cornerArray, cornerCount));
+
+        const size_t detected = cornerCount.getValue<vx_size>();
+        std::vector<vx_keypoint_t> found(detected);
+        cornerArray.copyTo(found);
+
+        // never write more entries than the caller's buffer can take
+        const size_t toWrite = std::min(detected, *keypoints_count);
+        cvhalKeyPoint* out = (cvhalKeyPoint*)keypoints_data;
+        for (size_t idx = 0; idx < toWrite; ++idx)
+        {
+            const vx_keypoint_t& corner = found[idx];
+            out[idx].x = corner.x;
+            out[idx].y = corner.y;
+            out[idx].size = 7;
+            out[idx].angle = -1;
+            //if nonmaxSuppression is false, the corner strength is undefined
+            out[idx].response = corner.strength;
+        }
+        *keypoints_count = toWrite;
+
+#ifdef VX_VERSION_1_1
+        //we should take user memory back before release
+        //(it's not done automatically according to standard)
+        srcImage.swapHandle();
+#endif
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Median 3x3 kernel: images with fewer pixels than a 1280x720 frame are skipped.
+template <> inline bool skipSmallImages<VX_KERNEL_MEDIAN_3x3>(int w, int h)
+{
+    const int pixels = w*h;
+    return pixels < 1280 * 720;
+}
+
+// OpenVX-backed median blur.
+//
+// Only single-channel CV_8U images are handled. Without VX_VERSION_1_1 the kernel size must be 3
+// (vxuMedian3x3); with VX_VERSION_1_1 other sizes are routed to vxuNonLinearFilter as long as
+// they do not exceed the context's non-linear filter dimension limit.
+//
+// src_data/src_step and dst_data/dst_step describe the input and output buffers (step in bytes),
+// width/height are the image dimensions in pixels, ksize is the kernel side length.
+//
+// Returns CV_HAL_ERROR_OK when dst_data has been filled, CV_HAL_ERROR_NOT_IMPLEMENTED when the
+// request is not handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                       int width, int height, int depth, int cn, int ksize)
+{
+    if (depth != CV_8U || cn != 1
+#ifndef VX_VERSION_1_1
+        || ksize != 3
+#endif
+        )
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // Small images are not worth the OpenVX round trip; the threshold depends on the kernel used
+    if (
+#ifdef VX_VERSION_1_1
+         ksize != 3 ? skipSmallImages<VX_KERNEL_NON_LINEAR_FILTER>(width, height) :
+#endif
+         skipSmallImages<VX_KERNEL_MEDIAN_3x3>(width, height)
+       )
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+#ifdef VX_VERSION_1_1
+        if ((vx_size)ksize > ctx.nonlinearMaxDimension())
+        {
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+        }
+#endif
+
+        // Wrap the caller's buffers without copying (1 byte per pixel, row pitch = step)
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+                                                     const_cast<uchar*>(src_data));
+
+        ivx::Image ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)(dst_step)),
+                                                     dst_data);
+
+        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
+        //since OpenVX standard says nothing about thread-safety for now
+        ivx::border_t prevBorder = ctx.immediateBorder();
+        ctx.setImmediateBorder(VX_BORDER_REPLICATE);
+#ifdef VX_VERSION_1_1
+        if (ksize == 3)
+#endif
+        {
+            ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib));
+        }
+#ifdef VX_VERSION_1_1
+        else
+        {
+            ivx::Matrix mtx;
+            if(ksize == 5)
+                mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize);
+            else
+            {
+                vx_size supportedSize;
+                ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize)));
+                if ((vx_size)ksize > supportedSize)
+                {
+                    ctx.setImmediateBorder(prevBorder);
+                    // Must not be `return false`: that converts to 0, i.e. CV_HAL_ERROR_OK,
+                    // and would report success although dst_data was never written.
+                    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+                }
+
+                // All-ones (255) mask: every element of the ksize x ksize window takes part in the median
+                std::vector<uchar> mtx_data(ksize*ksize, 255);
+                mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize);
+                mtx.copyFrom(&mtx_data[0]);
+            }
+            ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib));
+        }
+#endif
+        ctx.setImmediateBorder(prevBorder);
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Size heuristic for the 3x3 Sobel kernel: true when the image has fewer pixels than a 320x240 frame.
+template <>
+inline bool skipSmallImages<VX_KERNEL_SOBEL_3x3>(int w, int h)
+{
+    const int pixelCount = w*h;
+    return pixelCount < 320 * 240;
+}
+
+// OpenVX-backed 3x3 Sobel derivative via vxuSobel3x3().
+//
+// Accepted: single-channel CV_8U source, CV_16S destination, ksize == 3, scale == 1, delta == 0,
+// a first-order derivative along exactly one axis (dx/dy is 1/0 or 0/1), no ROI margins and
+// CV_HAL_BORDER_CONSTANT. Everything else yields CV_HAL_ERROR_NOT_IMPLEMENTED.
+// CV_HAL_ERROR_UNKNOWN is returned when an ivx exception was caught, CV_HAL_ERROR_OK otherwise.
+int ovx_hal_sobel(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type)
+{
+    const bool formatOk = (cn == 1) && (src_depth == CV_8U) && (dst_depth == CV_16S);
+    const bool kernelOk = (ksize == 3) && (scale == 1.0) && (delta == 0.0);
+    const bool orderOk  = ((dx | dy) == 1) && ((dx + dy) == 1);
+    const bool sizeOk   = (width >= ksize) && (height >= ksize);
+    if (!(formatOk && kernelOk && orderOk && sizeOk))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // ~BORDER_ISOLATED case not supported for now
+    const bool hasMargins = (margin_left != 0) || (margin_top != 0) ||
+                            (margin_right != 0) || (margin_bottom != 0);
+    if (hasMargins)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    if (skipSmallImages<VX_KERNEL_SOBEL_3x3>(width, height))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // Only the constant border is accepted; the VX_BORDER_REPLICATE mapping is left disabled,
+    // so CV_HAL_BORDER_REPLICATE is rejected together with all remaining border types.
+    if (border_type != CV_HAL_BORDER_CONSTANT)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    const vx_enum border = VX_BORDER_CONSTANT;
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+        // (a check of ksize against ctx.convolutionMaxDimension() is intentionally not performed)
+
+        // Wrap the caller's buffers: 1 byte per source pixel, 2 bytes per destination pixel
+        ivx::Image srcImg = ivx::Image::createFromHandle(
+            ctx, VX_DF_IMAGE_U8,
+            ivx::Image::createAddressing(width, height, 1, (vx_int32)(src_step)),
+            const_cast<uchar*>(src_data));
+        ivx::Image dstImg = ivx::Image::createFromHandle(
+            ctx, VX_DF_IMAGE_S16,
+            ivx::Image::createAddressing(width, height, 2, (vx_int32)dst_step),
+            dst_data);
+
+        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
+        //since OpenVX standard says nothing about thread-safety for now
+        ivx::border_t savedBorder = ctx.immediateBorder();
+        ctx.setImmediateBorder(border, (vx_uint8)(0));
+        if (dx != 0)
+        {
+            // x-derivative requested: only the first gradient output is wired
+            ivx::IVX_CHECK_STATUS(vxuSobel3x3(ctx, srcImg, dstImg, NULL));
+        }
+        else
+        {
+            // y-derivative requested: only the second gradient output is wired
+            ivx::IVX_CHECK_STATUS(vxuSobel3x3(ctx, srcImg, NULL, dstImg));
+        }
+        ctx.setImmediateBorder(savedBorder);
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Size heuristic for the Canny kernel: true when the image has fewer pixels than a 640x480 frame.
+template <>
+inline bool skipSmallImages<VX_KERNEL_CANNY_EDGE_DETECTOR>(int w, int h)
+{
+    const int pixelCount = w*h;
+    return pixelCount < 640 * 480;
+}
+
+// OpenVX-backed Canny edge detector, run as a one-node graph (vxCannyEdgeDetectorNode) with a
+// replicate border set on the node.
+//
+// Only single-channel input is accepted and both image dimensions must be larger than ksize.
+// lowThreshold/highThreshold are converted to vx_int32 for the OpenVX range threshold; L2gradient
+// selects VX_NORM_L2 instead of VX_NORM_L1.
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the request is not
+// handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_canny(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                  int width, int height, int cn, double lowThreshold, double highThreshold, int ksize, bool L2gradient)
+{
+    if (cn != 1 || width <= ksize || height <= ksize)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (skipSmallImages<VX_KERNEL_CANNY_EDGE_DETECTOR>(width, height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    try
+    {
+        // The context is obtained inside the try block (as in the other HAL entry points) so that
+        // an ivx exception raised while getting it is turned into an error code instead of escaping.
+        ivx::Context context = getOpenVXHALContext();
+
+        ivx::Image _src = ivx::Image::createFromHandle(context, VX_DF_IMAGE_U8,
+                          ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+                                                       const_cast<uchar*>(src_data));
+
+        ivx::Image _dst = ivx::Image::createFromHandle( context, VX_DF_IMAGE_U8,
+                          ivx::Image::createAddressing(width, height, 1, (vx_int32)dst_step),
+                                                       dst_data);
+
+        ivx::Threshold threshold = ivx::Threshold::createRange(context, VX_TYPE_UINT8,
+                                                               (vx_int32)lowThreshold,
+                                                               (vx_int32)highThreshold);
+
+        #if 0
+        // the code below is disabled because vxuCannyEdgeDetector()
+        // ignores context attribute VX_CONTEXT_IMMEDIATE_BORDER
+
+        // FIXME: may fail in multithread case
+        border_t prevBorder = context.immediateBorder();
+        context.setImmediateBorder(VX_BORDER_REPLICATE);
+        IVX_CHECK_STATUS( vxuCannyEdgeDetector(context, _src, threshold, ksize, (L2gradient ? VX_NORM_L2 : VX_NORM_L1), _dst) );
+        context.setImmediateBorder(prevBorder);
+        #else
+        // alternative code without vxuCannyEdgeDetector()
+        ivx::Graph graph = ivx::Graph::create(context);
+        ivx::Node node = ivx::Node(vxCannyEdgeDetectorNode(graph, _src, threshold, ksize,
+                                                           (L2gradient ? VX_NORM_L2 : VX_NORM_L1), _dst) );
+        node.setBorder(VX_BORDER_REPLICATE);
+        graph.verify();
+        graph.process();
+        #endif
+
+#ifdef VX_VERSION_1_1
+        //we should take user memory back before release
+        //(it's not done automatically according to standard)
+        _src.swapHandle();
+        _dst.swapHandle();
+#endif
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// OpenVX-backed pyrDown (HAL counterpart of the former static openvx_pyrDown() helper), run as a
+// one-node graph with VX_KERNEL_HALFSCALE_GAUSSIAN, kernel size 5 and a replicate border.
+//
+// Only single-channel CV_8U images with CV_HAL_BORDER_REPLICATE are handled: the source and
+// destination buffers are wrapped with a pixel stride of 1 byte, which is only valid for
+// one-byte pixels. The Khronos sample implementation (VX_ID_KHRONOS) is always declined.
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the request is not
+// handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_pyrdown(const uchar* src_data, size_t src_step, int src_width, int src_height,
+                    uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
+{
+    // cn must be checked up front: with cn > 1 the 1-byte pixel stride used below would
+    // mis-address the data, and cn == 2 would make matTypeToFormat() throw, turning an
+    // unsupported request into CV_HAL_ERROR_UNKNOWN.
+    if (depth != CV_8U || cn != 1 || border_type != CV_HAL_BORDER_REPLICATE)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (skipSmallImages<VX_KERNEL_HALFSCALE_GAUSSIAN>(src_width, src_height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // The only border mode which is supported by both cv::pyrDown() and OpenVX
+    // and produces predictable results
+    ivx::border_t borderMode;
+    borderMode.mode = VX_BORDER_REPLICATE;
+
+    try
+    {
+        ivx::Context context = getOpenVXHALContext();
+        if(context.vendorID() == VX_ID_KHRONOS)
+        {
+            // This implementation performs floor-like rounding
+            // (OpenCV uses floor(x+0.5)-like rounding)
+            // and ignores border mode (and loses 1px size border)
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+        }
+
+        ivx::Image srcImg = ivx::Image::createFromHandle(context, ivx::Image::matTypeToFormat(CV_8UC(cn)),
+                            ivx::Image::createAddressing(src_width, src_height, 1, (vx_int32)src_step),
+                                                         const_cast<uchar*>(src_data));
+
+        ivx::Image dstImg = ivx::Image::createFromHandle(context, ivx::Image::matTypeToFormat(CV_8UC(cn)),
+                            ivx::Image::createAddressing(dst_width, dst_height, 1, (vx_int32)dst_step),
+                                                         dst_data);
+
+        ivx::Scalar kernelSize = ivx::Scalar::create<VX_TYPE_INT32>(context, 5);
+        ivx::Graph graph = ivx::Graph::create(context);
+        ivx::Node halfNode = ivx::Node::create(graph, VX_KERNEL_HALFSCALE_GAUSSIAN, srcImg, dstImg, kernelSize);
+        halfNode.setBorder(borderMode);
+        graph.verify();
+        graph.process();
+
+#ifdef VX_VERSION_1_1
+        //we should take user memory back before release
+        //(it's not done automatically according to standard)
+        srcImg.swapHandle(); dstImg.swapHandle();
+#endif
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Size heuristic for the 3x3 box kernel: true when the image has fewer pixels than a 640x480 frame.
+template <>
+inline bool skipSmallImages<VX_KERNEL_BOX_3x3>(int w, int h)
+{
+    const int pixelCount = w*h;
+    return pixelCount < 640 * 480;
+}
+
+// OpenVX-backed normalized 3x3 box filter via vxuBox3x3().
+//
+// Accepted: single-channel CV_8U source and destination, a 3x3 kernel, anchor either negative
+// (default) or equal to 1 on each axis, normalize == true, no ROI margins, and a border type of
+// CV_HAL_BORDER_CONSTANT or CV_HAL_BORDER_REPLICATE.
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the request is not
+// handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                      int width, int height, int src_depth, int dst_depth, int cn,
+                      int margin_left, int margin_top, int margin_right, int margin_bottom,
+                      size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y,
+                      bool normalize, int border_type)
+{
+    if (src_depth != CV_8U || cn != 1 || ksize_width != 3 || ksize_height != 3 || dst_depth != CV_8U ||
+        (anchor_x >= 0 && anchor_x != 1) || (anchor_y >= 0 && anchor_y != 1) || !normalize)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // ~BORDER_ISOLATED case not supported for now
+    if (margin_left != 0 || margin_top != 0 || margin_right != 0 || margin_bottom != 0)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if(skipSmallImages<VX_KERNEL_BOX_3x3>(width, height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    vx_enum border;
+    switch (border_type)
+    {
+        case CV_HAL_BORDER_CONSTANT:
+            border = VX_BORDER_CONSTANT;
+            break;
+        case CV_HAL_BORDER_REPLICATE:
+            border = VX_BORDER_REPLICATE;
+            break;
+        default:
+            // Must not be `return false`: that converts to 0, i.e. CV_HAL_ERROR_OK,
+            // and would report success although dst_data was never written.
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+
+        // Wrap the caller's buffers without copying (1 byte per pixel, row pitch = step)
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+                                                     const_cast<uchar*>(src_data));
+
+        ivx::Image ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)dst_step),
+                                                     dst_data);
+
+        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
+        //since OpenVX standard says nothing about thread-safety for now
+        ivx::border_t prevBorder = ctx.immediateBorder();
+        ctx.setImmediateBorder(border, (vx_uint8)(0));
+        ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib));
+        ctx.setImmediateBorder(prevBorder);
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Histogram equalization through the immediate-mode vxuEqualizeHist() call.
+// Both buffers are wrapped as VX_DF_IMAGE_U8 images of width x height pixels (step in bytes).
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED for images rejected by the
+// size heuristic, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height)
+{
+    const bool tooSmall = skipSmallImages<VX_KERNEL_EQUALIZE_HISTOGRAM>(width, height);
+    if (tooSmall)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    int status = CV_HAL_ERROR_OK;
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+
+        ivx::Image inImg = ivx::Image::createFromHandle(
+            ctx, VX_DF_IMAGE_U8,
+            ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+            const_cast<uchar*>(src_data));
+
+        ivx::Image outImg = ivx::Image::createFromHandle(
+            ctx, VX_DF_IMAGE_U8,
+            ivx::Image::createAddressing(width, height, 1, (vx_int32)dst_step),
+            dst_data);
+
+        ivx::IVX_CHECK_STATUS(vxuEqualizeHist(ctx, inImg, outImg));
+
+#ifdef VX_VERSION_1_1
+        // user memory has to be taken back explicitly before the images are released
+        // (the standard does not do it automatically)
+        inImg.swapHandle();
+        outImg.swapHandle();
+#endif
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        status = CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        status = CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return status;
+}
+
+// OpenVX-backed 3x3 Gaussian blur via vxuGaussian3x3().
+//
+// vxuGaussian3x3() takes no kernel parameters, so the request is accepted only when it resolves
+// to a 3x3 kernel and each sigma is either 0 (after clamping negatives to 0) or equal to 0.8
+// within DBL_EPSILON. A zero kernel size is derived from the corresponding sigma first.
+// Further restrictions: single-channel CV_8U, image at least 3x3, no ROI margins, border type
+// CV_HAL_BORDER_CONSTANT or CV_HAL_BORDER_REPLICATE.
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the request is not
+// handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_gaussianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,  int width, int height,
+                         int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom,
+                         size_t ksize_width, size_t ksize_height, double sigmaX, double sigmaY, int border_type)
+{
+    if (sigmaY <= 0)
+        sigmaY = sigmaX;
+    // automatic detection of kernel size from sigma
+    // (the sizes are unsigned, so "not specified" can only mean 0)
+    if (ksize_width == 0 && sigmaX > 0)
+        ksize_width = (vx_int32)(sigmaX*6 + 1) | 1;
+    if (ksize_height == 0 && sigmaY > 0)
+        ksize_height = (vx_int32)(sigmaY*6 + 1) | 1;
+
+    if (depth != CV_8U || cn != 1 || width < 3 || height < 3 ||
+        ksize_width != 3 || ksize_height != 3)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    sigmaX = std::max(sigmaX, 0.);
+    sigmaY = std::max(sigmaY, 0.);
+
+    // The comparison with 0.8 has to be two-sided: a one-sided "(sigma - 0.8) < DBL_EPSILON"
+    // is true for every sigma below 0.8 and would accept kernels the fixed 3x3 filter cannot honour.
+    const bool sigmaXOk = sigmaX == 0.0 || ((sigmaX - 0.8) < DBL_EPSILON && (0.8 - sigmaX) < DBL_EPSILON);
+    const bool sigmaYOk = sigmaY == 0.0 || ((sigmaY - 0.8) < DBL_EPSILON && (0.8 - sigmaY) < DBL_EPSILON);
+    if (!sigmaXOk || !sigmaYOk)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // ~BORDER_ISOLATED case not supported for now
+    if (margin_left != 0 || margin_top != 0 || margin_right != 0 || margin_bottom != 0)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (skipSmallImages<VX_KERNEL_GAUSSIAN_3x3>(width, height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    vx_enum border;
+    switch (border_type)
+    {
+    case CV_HAL_BORDER_CONSTANT:
+        border = VX_BORDER_CONSTANT;
+        break;
+    case CV_HAL_BORDER_REPLICATE:
+        border = VX_BORDER_REPLICATE;
+        break;
+    default:
+        // Must not be `return false`: that converts to 0, i.e. CV_HAL_ERROR_OK,
+        // and would report success although dst_data was never written.
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+
+        // Wrap the caller's buffers without copying (1 byte per pixel, row pitch = step)
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)src_step),
+                                                     const_cast<uchar*>(src_data));
+
+        ivx::Image ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width, height, 1, (vx_int32)dst_step),
+                                                     dst_data);
+
+        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
+        //since OpenVX standard says nothing about thread-safety for now
+        ivx::border_t prevBorder = ctx.immediateBorder();
+        ctx.setImmediateBorder(border, (vx_uint8)(0));
+        ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib));
+        ctx.setImmediateBorder(prevBorder);
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// OpenVX-backed remap with 32-bit float maps, executed through vxuRemap().
+//
+// Accepted: CV_8UC1 source, CV_HAL_BORDER_CONSTANT border, absolute maps (the
+// CV_HAL_WARP_RELATIVE_MAP flag must not be set) and CV_HAL_INTER_LINEAR interpolation.
+// mapx/mapy may both be given (separate x and y maps), or one of them may be NULL, in which case
+// the remaining pointer is forwarded to the single-map overload of ivx::Remap::setMappings().
+// Only border_value[0] is used (cast to vx_uint8) as the constant border value.
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the request is not
+// handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                    uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                    float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
+                    int interpolation, int border_type, const double border_value[4])
+{
+
+    if (src_type != CV_8UC1 || border_type != CV_HAL_BORDER_CONSTANT || (interpolation & CV_HAL_WARP_RELATIVE_MAP))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // The size heuristic is evaluated on the source dimensions
+    if (skipSmallImages<VX_KERNEL_REMAP>(src_width, src_height))
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // Translate the HAL interpolation flag; only bilinear interpolation ends up being accepted
+    vx_interpolation_type_e inter_type;
+    switch (interpolation)
+    {
+    case CV_HAL_INTER_LINEAR:
+#if VX_VERSION > VX_VERSION_1_0
+        inter_type = VX_INTERPOLATION_BILINEAR;
+#else
+        inter_type = VX_INTERPOLATION_TYPE_BILINEAR;
+#endif
+        break;
+    // With the body below commented out, CV_HAL_INTER_NEAREST falls through to the rejecting default branch
+    case CV_HAL_INTER_NEAREST:
+/* NEAREST_NEIGHBOR mode disabled since OpenCV round half to even while OpenVX sample implementation round half up
+#if VX_VERSION > VX_VERSION_1_0
+        inter_type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
+#else
+        inter_type = VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR;
+#endif
+        if (!map1.empty())
+            for (int y = 0; y < map1.rows; ++y)
+            {
+                float* line = map1.ptr<float>(y);
+                for (int x = 0; x < map1.cols; ++x)
+                    line[x] = cvRound(line[x]);
+            }
+        if (!map2.empty())
+            for (int y = 0; y < map2.rows; ++y)
+            {
+                float* line = map2.ptr<float>(y);
+                for (int x = 0; x < map2.cols; ++x)
+                    line[x] = cvRound(line[x]);
+            }
+        break;
+*/
+    case CV_HAL_INTER_AREA://AREA interpolation mode is unsupported
+    default:
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+
+        // Wrap the caller's buffers without copying (1 byte per pixel, row pitch = step)
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(src_width, src_height, 1, (vx_int32)src_step),
+                                                     const_cast<uchar*>(src_data));
+        ivx::Image ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(dst_width, dst_height, 1, (vx_int32)dst_step),
+                                                     dst_data);
+
+        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
+        //since OpenVX standard says nothing about thread-safety for now
+        ivx::border_t prevBorder = ctx.immediateBorder();
+        ctx.setImmediateBorder(VX_BORDER_CONSTANT, (vx_uint8)(border_value[0]));
+
+        // Build the OpenVX remap table point by point from the caller's float map(s)
+        ivx::Remap map = ivx::Remap::create(ctx, src_width, src_height, dst_width, dst_height);
+        if (!mapx) map.setMappings(mapy, mapy_step);
+        else if (!mapy) map.setMappings(mapx, mapx_step);
+        else map.setMappings(mapx, mapx_step, mapy, mapy_step);
+        ivx::IVX_CHECK_STATUS(vxuRemap(ctx, ia, map, inter_type, ib));
+#ifdef VX_VERSION_1_1
+        // Take the user memory back from the wrapped images before they are released
+        ib.swapHandle();
+        ia.swapHandle();
+#endif
+        ctx.setImmediateBorder(prevBorder);
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// Enables the TOZERO / TOZERO_INV emulation below (binary threshold followed by a bitwise AND with the source)
+#define IMPL_OPENVX_TOZERO 1
+// OpenVX-backed fixed-level threshold for CV_8U data, executed through vxuThreshold().
+//
+// The image is processed as a single-channel image of width*cn columns. Supported threshold
+// types depend on the build configuration, see the switch below; CV_HAL_THRESH_TRUNC is never
+// handled.
+//
+// Returns CV_HAL_ERROR_OK on success, CV_HAL_ERROR_NOT_IMPLEMENTED when the request is not
+// handled here, CV_HAL_ERROR_UNKNOWN when an ivx exception was caught.
+int ovx_hal_threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                      int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType)
+{
+    if(depth != CV_8U)
+    {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // Output values written by the binary threshold for its "true" and "false" outcomes.
+    // NOTE: the case labels below rely on preprocessor-controlled fallthrough: whenever the body
+    // of a case is compiled out, control continues into the following label and eventually
+    // reaches the rejecting default branch.
+    int trueVal, falseVal;
+    switch (thresholdType)
+    {
+    case CV_HAL_THRESH_BINARY:
+#ifndef VX_VERSION_1_1
+        if (maxValue != 255)
+            return CV_HAL_ERROR_NOT_IMPLEMENTED;
+#endif
+        trueVal = maxValue;
+        falseVal = 0;
+        break;
+    case CV_HAL_THRESH_TOZERO:
+        // NOTE(review): if IMPL_OPENVX_TOZERO were 0, this label would fall into the
+        // BINARY_INV branch below rather than being rejected - confirm before disabling it
+#if IMPL_OPENVX_TOZERO
+        // 255/0 mask; it is ANDed with the source after the threshold call
+        trueVal = 255;
+        falseVal = 0;
+        break;
+#endif
+    case CV_HAL_THRESH_BINARY_INV:
+#ifdef VX_VERSION_1_1
+        trueVal = 0;
+        falseVal = maxValue;
+        break;
+#endif
+    case CV_HAL_THRESH_TOZERO_INV:
+#ifdef VX_VERSION_1_1
+#if IMPL_OPENVX_TOZERO
+        // inverted 0/255 mask; it is ANDed with the source after the threshold call
+        trueVal = 0;
+        falseVal = 255;
+        break;
+#endif
+#endif
+    case CV_HAL_THRESH_TRUNC:
+    default:
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    try
+    {
+        ivx::Context ctx = getOpenVXHALContext();
+
+        ivx::Threshold thh = ivx::Threshold::createBinary(ctx, VX_TYPE_UINT8, thresh);
+        thh.setValueTrue(trueVal);
+        thh.setValueFalse(falseVal);
+
+        // Wrap the caller's buffers as width*cn x height single-byte images (row pitch = step)
+        ivx::Image ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width*cn, height, 1, (vx_int32)src_step),
+                                                     const_cast<uchar*>(src_data));
+        ivx::Image ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                        ivx::Image::createAddressing(width*cn, height, 1, (vx_int32)dst_step),
+                                                     dst_data);
+
+        ivx::IVX_CHECK_STATUS(vxuThreshold(ctx, ia, thh, ib));
+#if IMPL_OPENVX_TOZERO
+        if (thresholdType == CV_HAL_THRESH_TOZERO || thresholdType == CV_HAL_THRESH_TOZERO_INV)
+        {
+            // ic is a second wrapper around dst_data: dst = mask (already in dst) AND src
+            ivx::Image ic = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+                            ivx::Image::createAddressing(width*cn, height, 1, (vx_int32)dst_step), dst_data);
+            ivx::IVX_CHECK_STATUS(vxuAnd(ctx, ib, ia, ic));
+        }
+#endif
+    }
+    catch (const ivx::RuntimeError & e)
+    {
+        PRINT_HALERR_MSG(runtime);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+    catch (const ivx::WrapperError & e)
+    {
+        PRINT_HALERR_MSG(wrapper);
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
--- a/3rdparty/openvx/hal/openvx_hal.hpp
+++ b/3rdparty/openvx/hal/openvx_hal.hpp
@ -54,6 +54,33 @@ int ovx_hal_cvtThreePlaneYUVtoBGR(const uchar * a, size_t astep, uchar * b, size
 int ovx_hal_cvtBGRtoThreePlaneYUV(const uchar * a, size_t astep, uchar * b, size_t bstep, int w, int h, int acn, bool swapBlue, int uIdx);
 int ovx_hal_cvtOnePlaneYUVtoBGR(const uchar * a, size_t astep, uchar * b, size_t bstep, int w, int h, int bcn, bool swapBlue, int uIdx, int ycn);
 int ovx_hal_integral(int depth, int sdepth, int, const uchar * a, size_t astep, uchar * b, size_t bstep, uchar * c, size_t, uchar * d, size_t, int w, int h, int cn);
+int ovx_hal_meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
+                       int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
+int ovx_hal_lut(const uchar *src_data, size_t src_step, size_t src_type, const uchar* lut_data, size_t lut_channel_size, size_t lut_channels, uchar *dst_data, size_t dst_step, int width, int height);
+int ovx_hal_minMaxIdxMaskStep(const uchar* src_data, size_t src_step, int width, int height, int depth,
+                              double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step);
+int ovx_hal_medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize);
+int ovx_hal_sobel(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, int dx, int dy, int ksize, double scale, double delta, int border_type);
+int ovx_hal_canny(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                  int width, int height, int cn, double lowThreshold, double highThreshold, int ksize, bool L2gradient);
+int ovx_hal_pyrdown(const uchar* src_data, size_t src_step, int src_width, int src_height,
+                    uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type);
+int ovx_hal_boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                      int width, int height, int src_depth, int dst_depth, int cn,
+                      int margin_left, int margin_top, int margin_right, int margin_bottom,
+                      size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type);
+int ovx_hal_equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height);
+int ovx_hal_gaussianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,  int width, int height,
+                        int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom,
+                        size_t ksize_width, size_t ksize_height, double sigmaX, double sigmaY, int border_type);
+int ovx_hal_remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                     uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                     float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
+                     int interpolation, int border_type, const double border_value[4]);
+int ovx_hal_threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                      int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType);
+int ovx_hal_FAST(const uchar* src_data, size_t src_step, int width, int height, uchar* keypoints_data, size_t* keypoints_count,
+                 int threshold, bool nonmax_suppression, int /*cv::FastFeatureDetector::DetectorType*/ dtype);

 //==================================================================================================
 // functions redefinition
@ -141,5 +168,11 @@ int ovx_hal_integral(int depth, int sdepth, int, const uchar * a, size_t astep,
 #define cv_hal_cvtOnePlaneYUVtoBGR ovx_hal_cvtOnePlaneYUVtoBGR
 #undef cv_hal_integral
 #define cv_hal_integral ovx_hal_integral
+#undef cv_hal_meanStdDev
+#define cv_hal_meanStdDev ovx_hal_meanStdDev
+#undef cv_hal_lut
+#define cv_hal_lut ovx_hal_lut
+#undef cv_hal_minMaxIdxMaskStep
+#define cv_hal_minMaxIdxMaskStep ovx_hal_minMaxIdxMaskStep

 #endif
--- a/3rdparty/openvx/include/ivx.hpp
+++ b/3rdparty/openvx/include/ivx.hpp
@ -22,7 +22,13 @@ Details: TBD
 #include <VX/vx.h>
 #include <VX/vxu.h>

-#ifndef VX_VERSION_1_1
+// For OpenVX 1.2 & 1.3
+#if (VX_VERSION > VX_VERSION_1_1)
+# include <VX/vx_compatibility.h>
+#endif
+
+
+#if (VX_VERSION == VX_VERSION_1_0)
 // 1.1 to 1.0 backward compatibility defines

 static const vx_enum VX_INTERPOLATION_BILINEAR = VX_INTERPOLATION_TYPE_BILINEAR;
@ -32,12 +38,6 @@ static const vx_enum VX_INTERPOLATION_NEAREST_NEIGHBOR = VX_INTERPOLATION_TYPE_N
 static const vx_enum VX_BORDER_CONSTANT = VX_BORDER_MODE_CONSTANT;
 static const vx_enum VX_BORDER_REPLICATE = VX_BORDER_MODE_REPLICATE;

-#else
-
-    #ifdef IVX_RENAMED_REFS
-        static const vx_enum VX_REF_ATTRIBUTE_TYPE = VX_REFERENCE_TYPE;
-    #endif
-
 #endif

 #ifndef IVX_USE_CXX98
@ -218,7 +218,7 @@ template<> struct TypeToEnum<vx_int64>    { static const vx_enum value = VX_TYPE
 template<> struct TypeToEnum<vx_uint64>   { static const vx_enum value = VX_TYPE_UINT64; };
 template<> struct TypeToEnum<vx_float32>  { static const vx_enum value = VX_TYPE_FLOAT32, imgType = VX_DF_IMAGE('F', '0', '3', '2'); };
 template<> struct TypeToEnum<vx_float64>  { static const vx_enum value = VX_TYPE_FLOAT64; };
-template<> struct TypeToEnum<vx_bool>     { static const vx_enum value = VX_TYPE_BOOL; };
+//template<> struct TypeToEnum<vx_bool>     { static const vx_enum value = VX_TYPE_BOOL; };
 template<> struct TypeToEnum<vx_keypoint_t> {static const vx_enum value = VX_TYPE_KEYPOINT; };
 // the commented types are aliases (of integral tyes) and have conflicts with the types above
 //template<> struct TypeToEnum<vx_enum>     { static const vx_enum val = VX_TYPE_ENUM; };
@ -1717,6 +1717,22 @@ static const vx_enum
 #endif
    }

+    /// Map a cv::Mat type constant to the matching OpenVX image format (fourcc);
+    /// throws WrapperError for types that have no counterpart in the table below
+    static vx_df_image matTypeToFormat(int matType)
+    {
+        struct TypeFormatPair { int matType; vx_df_image format; };
+        static const TypeFormatPair knownTypes[] =
+        {
+            { CV_8UC4,  VX_DF_IMAGE_RGBX },
+            { CV_8UC3,  VX_DF_IMAGE_RGB },
+            { CV_8UC1,  VX_DF_IMAGE_U8 },
+            { CV_16UC1, VX_DF_IMAGE_U16 },
+            { CV_16SC1, VX_DF_IMAGE_S16 },
+            { CV_32SC1, VX_DF_IMAGE_S32 },
+            { CV_32FC1, VX_DF_IMAGE('F', '0', '3', '2') }
+        };
+        const unsigned int knownCount = sizeof(knownTypes) / sizeof(knownTypes[0]);
+
+        for (unsigned int i = 0; i < knownCount; ++i)
+        {
+            if (knownTypes[i].matType == matType)
+                return knownTypes[i].format;
+        }
+        throw WrapperError(std::string(__func__)+"(): unsupported cv::Mat type");
+    }
+
 #ifdef IVX_USE_OPENCV
    /// Convert image format (fourcc) to cv::Mat type, throws WrapperError if not possible
    static int formatToMatType(vx_df_image format, vx_uint32 planeIdx = 0)
@ -1742,22 +1758,6 @@ static const vx_enum
        }
    }

-    /// Convert cv::Mat type to standard image format (fourcc), throws WrapperError if not possible
-    static vx_df_image matTypeToFormat(int matType)
-    {
-        switch (matType)
-        {
-        case CV_8UC4:  return VX_DF_IMAGE_RGBX;
-        case CV_8UC3:  return VX_DF_IMAGE_RGB;
-        case CV_8UC1:  return VX_DF_IMAGE_U8;
-        case CV_16UC1: return VX_DF_IMAGE_U16;
-        case CV_16SC1: return VX_DF_IMAGE_S16;
-        case CV_32SC1: return VX_DF_IMAGE_S32;
-        case CV_32FC1: return VX_DF_IMAGE('F', '0', '3', '2');
-        default:       throw WrapperError(std::string(__func__)+"(): unsupported cv::Mat type");
-        }
-    }
-
    /// Initialize cv::Mat shape to fit the specified image plane data
    void createMatForPlane(cv::Mat& m, vx_uint32 planeIdx)
    {
@ -3177,6 +3177,27 @@ public:
    void getMapping(vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 &src_x, vx_float32 &src_y) const
    { IVX_CHECK_STATUS(vxGetRemapPoint(ref, dst_x, dst_y, &src_x, &src_y)); }

+    /// Set every remap point from two separate coordinate planes.
+    /// map_x / map_y supply the source x / y coordinate for each destination pixel;
+    /// consecutive rows start map_x_stride / map_y_stride bytes apart (the row pointer is
+    /// advanced through a char* cast, so the strides are byte counts, not element counts).
+    /// Points are written one at a time through setMapping().
+    void setMappings(vx_float32* map_x, size_t map_x_stride, vx_float32* map_y, size_t map_y_stride)
+    {
+        for (vx_uint32 y = 0; y < dstHeight(); y++)
+        {
+            // start of row y in each plane
+            const vx_float32* map_x_line = (vx_float32*)((char*)map_x + y*map_x_stride);
+            const vx_float32* map_y_line = (vx_float32*)((char*)map_y + y*map_y_stride);
+            for (vx_uint32 x = 0; x < dstWidth(); x++)
+                setMapping(x, y, map_x_line[x], map_y_line[x]);
+        }
+    }
+
+    /// Set every remap point from a single interleaved map.
+    /// Each row of `map` holds dstWidth() consecutive (src_x, src_y) float pairs, and
+    /// consecutive rows start `map_stride` bytes apart (byte count, not element count).
+    /// Points are written one at a time through setMapping().
+    void setMappings(vx_float32* map, size_t map_stride)
+    {
+        for (vx_uint32 y = 0; y < dstHeight(); y++)
+        {
+            const vx_float32* map_line = (vx_float32*)((char*)map + y*map_stride);
+            // x is the destination column; its pair lives at floats [2*x] and [2*x+1].
+            // (Passing the float index as the column would skip odd columns and run past dstWidth().)
+            for (vx_uint32 x = 0; x < dstWidth(); x++)
+                setMapping(x, y, map_line[2*x], map_line[2*x+1]);
+        }
+    }
+
 #ifdef IVX_USE_OPENCV
    void setMappings(const cv::Mat& map_x, const cv::Mat& map_y)
    {
--- a/3rdparty/zlib-ng/CMakeLists.txt
+++ b/3rdparty/zlib-ng/CMakeLists.txt
--- a/3rdparty/zlib-ng/LICENSE.md
+++ b/3rdparty/zlib-ng/LICENSE.md
@ -1,4 +1,4 @@
-(C) 1995-2013 Jean-loup Gailly and Mark Adler
+(C) 1995-2024 Jean-loup Gailly and Mark Adler

 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
--- a/3rdparty/zlib-ng/README.md
+++ b/3rdparty/zlib-ng/README.md
@ -21,7 +21,6 @@ Features
 * Support for CPU intrinsics when available
  * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
-  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
  * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
  * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
@ -95,20 +94,21 @@ make test
 Build Options
 -------------

-| CMake                    | configure                | Description                                                                           | Default |
-|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
-| ZLIB_COMPAT              | --zlib-compat            | Compile with zlib compatible API                                                      | OFF     |
-| ZLIB_ENABLE_TESTS        |                          | Build test binaries                                                                   | ON      |
-| WITH_GZFILEOP            | --without-gzfileops      | Compile with support for gzFile related functions                                     | ON      |
-| WITH_OPTIM               | --without-optimizations  | Build with optimisations                                                              | ON      |
-| WITH_NEW_STRATEGIES      | --without-new-strategies | Use new strategies                                                                    | ON      |
-| WITH_NATIVE_INSTRUCTIONS |                          | Compiles with full instruction set supported on this host (gcc/clang -march=native)   | OFF     |
-| WITH_SANITIZER           |                          | Build with sanitizer (memory, address, undefined)                                     | OFF     |
-| WITH_GTEST               |                          | Build gtest_zlib                                                                      | ON      |
-| WITH_FUZZERS             |                          | Build test/fuzz                                                                       | OFF     |
-| WITH_BENCHMARKS          |                          | Build test/benchmarks                                                                 | OFF     |
-| WITH_MAINTAINER_WARNINGS |                          | Build with project maintainer warnings                                                | OFF     |
-| WITH_CODE_COVERAGE       |                          | Enable code coverage reporting                                                        | OFF     |
+| CMake                      | configure                | Description                                                                         | Default |
+|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------|
+| ZLIB_COMPAT                | --zlib-compat            | Compile with zlib compatible API                                                    | OFF     |
+| ZLIB_ENABLE_TESTS          |                          | Build test binaries                                                                 | ON      |
+| WITH_GZFILEOP              | --without-gzfileops      | Compile with support for gzFile related functions                                   | ON      |
+| WITH_OPTIM                 | --without-optimizations  | Build with optimisations                                                            | ON      |
+| WITH_NEW_STRATEGIES        | --without-new-strategies | Use new strategies                                                                  | ON      |
+| WITH_NATIVE_INSTRUCTIONS   |                          | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF     |
+| WITH_RUNTIME_CPU_DETECTION |                          | Compiles with runtime CPU detection                                                 | ON      |
+| WITH_SANITIZER             |                          | Build with sanitizer (memory, address, undefined)                                   | OFF     |
+| WITH_GTEST                 |                          | Build gtest_zlib                                                                    | ON      |
+| WITH_FUZZERS               |                          | Build test/fuzz                                                                     | OFF     |
+| WITH_BENCHMARKS            |                          | Build test/benchmarks                                                               | OFF     |
+| WITH_MAINTAINER_WARNINGS   |                          | Build with project maintainer warnings                                              | OFF     |
+| WITH_CODE_COVERAGE         |                          | Enable code coverage reporting                                                      | OFF     |


 Install
--- a/3rdparty/zlib-ng/adler32.c
+++ b/3rdparty/zlib-ng/adler32.c
@ -7,70 +7,24 @@
 #include "functable.h"
 #include "adler32_p.h"

-/* ========================================================================= */
-Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
-    uint32_t sum2;
-    unsigned n;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1))
-        return adler32_len_1(adler, buf, sum2);
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(buf == NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16))
-        return adler32_len_16(adler, buf, len, sum2);
-
-    /* do length NMAX blocks -- requires just one modulo operation */
-    while (len >= NMAX) {
-        len -= NMAX;
-#ifdef UNROLL_MORE
-        n = NMAX / 16;          /* NMAX is divisible by 16 */
-#else
-        n = NMAX / 8;           /* NMAX is divisible by 8 */
-#endif
-        do {
-#ifdef UNROLL_MORE
-            DO16(adler, sum2, buf);          /* 16 sums unrolled */
-            buf += 16;
-#else
-            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
-            buf += 8;
-#endif
-        } while (--n);
-        adler %= BASE;
-        sum2 %= BASE;
-    }
-
-    /* do remaining bytes (less than NMAX, still just one modulo) */
-    return adler32_len_64(adler, buf, len, sum2);
-}
-
 #ifdef ZLIB_COMPAT
 unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
-    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+    return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
 }
 #else
 uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
-    return functable.adler32(adler, buf, len);
+    return FUNCTABLE_CALL(adler32)(adler, buf, len);
 }
 #endif

 /* ========================================================================= */
 #ifdef ZLIB_COMPAT
 unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
-    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+    return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
 }
 #else
 uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
-    return functable.adler32(adler, buf, len);
+    return FUNCTABLE_CALL(adler32)(adler, buf, len);
 }
 #endif

--- a/3rdparty/zlib-ng/adler32_fold.h
+++ b/3rdparty/zlib-ng/adler32_fold.h
@ -1,11 +0,0 @@
-/* adler32_fold.h -- adler32 folding interface
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_FOLD_H_
-#define ADLER32_FOLD_H_
-
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-
-#endif
--- a/3rdparty/zlib-ng/arch/.gitignore
+++ b/3rdparty/zlib-ng/arch/.gitignore
@ -1,2 +0,0 @@
-# ignore Makefiles; they're all automatically generated
-Makefile
--- a/3rdparty/zlib-ng/arch/arm/Makefile.in
+++ b/3rdparty/zlib-ng/arch/arm/Makefile.in
@ -25,7 +25,6 @@ all: \
 	crc32_acle.o crc32_acle.lo \
 	slide_hash_neon.o slide_hash_neon.lo \
 	slide_hash_armv6.o slide_hash_armv6.lo \
-	insert_string_acle.o insert_string_acle.lo

 adler32_neon.o:
 	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
@ -69,12 +68,6 @@ slide_hash_armv6.o:
 slide_hash_armv6.lo:
 	$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c

-insert_string_acle.o:
-	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
-
-insert_string_acle.lo:
-	$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
-
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
--- a/3rdparty/zlib-ng/arch/arm/adler32_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/adler32_neon.c
@ -7,8 +7,8 @@
 */
 #ifdef ARM_NEON
 #include "neon_intrins.h"
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
+#include "zbuild.h"
+#include "adler32_p.h"

 static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
    static const uint16_t ALIGNED_(16) taps[64] = {
--- a/3rdparty/zlib-ng/arch/arm/arm_features.c
+++ b/3rdparty/zlib-ng/arch/arm/arm_features.c
@ -1,4 +1,4 @@
-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "arm_features.h"

 #if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
@ -11,6 +11,11 @@
 #  ifndef ID_AA64ISAR0_CRC32_VAL
 #    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
 #  endif
+#elif defined(__OpenBSD__) && defined(__aarch64__)
+#  include <machine/armreg.h>
+#  include <machine/cpu.h>
+#  include <sys/sysctl.h>
+#  include <sys/types.h>
 #elif defined(__APPLE__)
 #  if !defined(_DARWIN_C_SOURCE)
 #    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
@ -30,6 +35,16 @@ static int arm_has_crc32() {
 #elif defined(__FreeBSD__) && defined(__aarch64__)
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__OpenBSD__) && defined(__aarch64__)
+    int hascrc32 = 0;
+    int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+    uint64_t isar0 = 0;
+    size_t len = sizeof(isar0);
+    if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+      if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
+          hascrc32 = 1;
+    }
+    return hascrc32;
 #elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
--- a/3rdparty/zlib-ng/arch/arm/arm_features.h
+++ b/3rdparty/zlib-ng/arch/arm/arm_features.h
@ -2,8 +2,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#ifndef ARM_H_
-#define ARM_H_
+#ifndef ARM_FEATURES_H_
+#define ARM_FEATURES_H_

 struct arm_cpu_features {
    int has_simd;
@ -13,4 +13,4 @@ struct arm_cpu_features {

 void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);

-#endif /* ARM_H_ */
+#endif /* ARM_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/arm/arm_functions.h
+++ b/3rdparty/zlib-ng/arch/arm/arm_functions.h
@ -0,0 +1,65 @@
+/* arm_functions.h -- ARM implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FUNCTIONS_H_
+#define ARM_FUNCTIONS_H_
+
+#ifdef ARM_NEON
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t chunksize_neon(void);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+
+#  ifdef HAVE_BUILTIN_CTZLL
+uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
+#  endif
+void slide_hash_neon(deflate_state *s);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef ARM_ACLE
+uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
+#endif
+
+#ifdef ARM_SIMD
+void slide_hash_armv6(deflate_state *s);
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// ARM - SIMD
+#  if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_armv6
+#  endif
+// ARM - NEON
+// Redirect the native_* names to the NEON implementations when NEON is known at compile time.
+// ARM_NOCHECK_NEON is tested with defined(), like ARM_NOCHECK_SIMD above, so that defining it
+// without a value does not leave a dangling `||` operand in the condition.
+#  if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || defined(ARM_NOCHECK_NEON)
+#    undef native_adler32
+#    define native_adler32 adler32_neon
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_neon
+#    undef native_chunksize
+#    define native_chunksize chunksize_neon
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_neon
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_neon
+// the compare256/longest_match NEON variants are only declared when HAVE_BUILTIN_CTZLL is set
+#    ifdef HAVE_BUILTIN_CTZLL
+#      undef native_compare256
+#      define native_compare256 compare256_neon
+#      undef native_longest_match
+#      define native_longest_match longest_match_neon
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_neon
+#    endif
+#  endif
+// ARM - ACLE
+#  if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32)
+#    undef native_crc32
+#    define native_crc32 crc32_acle
+#  endif
+#endif
+
+#endif /* ARM_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c
@ -4,8 +4,8 @@

 #ifdef ARM_NEON
 #include "neon_intrins.h"
-#include "../../zbuild.h"
-#include "../generic/chunk_permute_table.h"
+#include "zbuild.h"
+#include "arch/generic/chunk_permute_table.h"

 typedef uint8x16_t chunk_t;

--- a/3rdparty/zlib-ng/arch/arm/compare256_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/compare256_neon.c
@ -3,8 +3,9 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
-
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
--- a/3rdparty/zlib-ng/arch/arm/crc32_acle.c
+++ b/3rdparty/zlib-ng/arch/arm/crc32_acle.c
@ -7,7 +7,7 @@

 #ifdef ARM_ACLE
 #include "acle_intrins.h"
-#include "../../zbuild.h"
+#include "zbuild.h"

 Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;
--- a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c
+++ b/3rdparty/zlib-ng/arch/arm/insert_string_acle.c
@ -1,24 +0,0 @@
-/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
- *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- */
-
-#ifdef ARM_ACLE
-#include "acle_intrins.h"
-#include "../../zbuild.h"
-#include "../../deflate.h"
-
-#define HASH_CALC(s, h, val) \
-    h = __crc32w(0, val)
-
-#define HASH_CALC_VAR       h
-#define HASH_CALC_VAR_INIT  uint32_t h = 0
-
-#define UPDATE_HASH         Z_TARGET_CRC update_hash_acle
-#define INSERT_STRING       Z_TARGET_CRC insert_string_acle
-#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
-
-#include "../../insert_string_tpl.h"
-#endif
--- a/3rdparty/zlib-ng/arch/arm/neon_intrins.h
+++ b/3rdparty/zlib-ng/arch/arm/neon_intrins.h
@ -25,6 +25,13 @@
    out.val[3] = vqsubq_u16(a.val[3], b); \
 } while (0)

+#  if defined(__clang__) && defined(__arm__) && defined(__ANDROID__)
+/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */
+#    undef ARM_NEON_HASLD4
+#    undef vld1q_u16_x4
+#    undef vld1q_u8_x4
+#    undef vst1q_u16_x4
+#  endif

 #  ifndef ARM_NEON_HASLD4

--- a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
+++ b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
@ -5,8 +5,8 @@

 #if defined(ARM_SIMD)
 #include "acle_intrins.h"
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 /* SIMD version of hash_chain rebase */
 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
--- a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
@ -10,8 +10,8 @@

 #ifdef ARM_NEON
 #include "neon_intrins.h"
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 /* SIMD version of hash_chain rebase */
 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
--- a/3rdparty/zlib-ng/arch/generic/Makefile.in
+++ b/3rdparty/zlib-ng/arch/generic/Makefile.in
@ -1,5 +1,6 @@
-# Makefile for zlib
+# Makefile for zlib-ng
 # Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
 # For conditions of distribution and use, see copyright notice in zlib.h

 CC=
@ -11,12 +12,62 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)

-all:
+all: \
+ adler32_c.o adler32_c.lo \
+ adler32_fold_c.o adler32_fold_c.lo \
+ chunkset_c.o chunkset_c.lo \
+ compare256_c.o compare256_c.lo \
+ crc32_braid_c.o crc32_braid_c.lo \
+ crc32_fold_c.o crc32_fold_c.lo \
+ slide_hash_c.o slide_hash_c.lo
+
+
+adler32_c.o: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_c.lo: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
+
+adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
+
+chunkset_c.o: $(SRCDIR)/chunkset_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+chunkset_c.lo: $(SRCDIR)/chunkset_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+compare256_c.o: $(SRCDIR)/compare256_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+compare256_c.lo: $(SRCDIR)/compare256_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
+
+crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
+
+slide_hash_c.o: $(SRCDIR)/slide_hash_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c


 mostlyclean: clean
 clean:
-	rm -f *.o *.lo *~ \
+	rm -f *.o *.lo *~
 	rm -rf objs
 	rm -f *.gcda *.gcno *.gcov

--- a/3rdparty/zlib-ng/arch/generic/adler32_c.c
+++ b/3rdparty/zlib-ng/arch/generic/adler32_c.c
@ -0,0 +1,54 @@
+/* adler32_c.c -- compute the Adler-32 checksum of a data stream (generic C implementation)
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+
+/* ========================================================================= */
+/* Generic C Adler-32: continue the checksum `adler` over the `len` bytes at `buf`.
+ *
+ * `adler` carries both running sums: the low 16 bits and the high 16 bits (sum2).
+ * A single byte (len == 1) is dispatched to adler32_len_1() before `buf` is examined;
+ * otherwise a NULL `buf` returns 1, the initial Adler-32 value. Inputs shorter than
+ * 16 bytes go to adler32_len_16(), and whatever remains after the NMAX-sized blocks
+ * goes to adler32_len_64(). The adler32_len_* helpers are not defined in this file
+ * (presumably adler32_p.h -- confirm).
+ */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    /* do length NMAX blocks -- requires just one modulo operation */
+    while (len >= NMAX) {
+        len -= NMAX;
+        /* n = number of unrolled steps needed to consume one NMAX block */
+#ifdef UNROLL_MORE
+        n = NMAX / 16;          /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8;           /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            DO16(adler, sum2, buf);          /* 16 sums unrolled */
+            buf += 16;
+#else
+            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        /* reduce both sums once per block instead of once per byte */
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    return adler32_len_64(adler, buf, len, sum2);
+}
--- a/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c
+++ b/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c
@ -5,12 +5,11 @@

 #include "zbuild.h"
 #include "functable.h"
-#include "adler32_fold.h"

 #include <limits.h>

 Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-    adler = functable.adler32(adler, src, len);
+    adler = FUNCTABLE_CALL(adler32)(adler, src, len);
    memcpy(dst, src, len);
    return adler;
 }
--- a/3rdparty/zlib-ng/arch/generic/chunkset_c.c
+++ b/3rdparty/zlib-ng/arch/generic/chunkset_c.c
--- a/3rdparty/zlib-ng/arch/generic/compare256_c.c
+++ b/3rdparty/zlib-ng/arch/generic/compare256_c.c
@ -5,6 +5,7 @@

 #include "zbuild.h"
 #include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 /* ALIGNED, byte comparison */
--- a/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c
+++ b/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c
@ -8,43 +8,9 @@
 */

 #include "zbuild.h"
-#include "zutil.h"
-#include "functable.h"
 #include "crc32_braid_p.h"
 #include "crc32_braid_tbl.h"

-/* ========================================================================= */
-
-const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
-    return (const uint32_t *)crc_table;
-}
-
-#ifdef ZLIB_COMPAT
-unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 0;
-
-    return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
-}
-#else
-uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 0;
-
-    return functable.crc32(crc, buf, len);
-}
-#endif
-
-#ifdef ZLIB_COMPAT
-unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
-    return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
-}
-#else
-uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
-    return PREFIX(crc32_z)(crc, buf, len);
-}
-#endif
-
-/* ========================================================================= */
-
 /*
  A CRC of a message is computed on N braids of words in the message, where
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
  level. Your mileage may vary.
 */

-/* ========================================================================= */
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#  define ZSWAPWORD(word) (word)
-#  define BRAID_TABLE crc_braid_table
-#elif BYTE_ORDER == BIG_ENDIAN
-#  if W == 8
-#    define ZSWAPWORD(word) ZSWAP64(word)
-#  elif W == 4
-#    define ZSWAPWORD(word) ZSWAP32(word)
-#  endif
-#  define BRAID_TABLE crc_braid_big_table
-#else
-#  error "No endian defined"
-#endif
-#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
-
 /* ========================================================================= */
 #ifdef W
 /*
@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) {

 /* ========================================================================= */
 Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
-    Z_REGISTER uint32_t c;
+    uint32_t c;

    /* Pre-condition the CRC */
    c = (~crc) & 0xffffffff;
--- a/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c
+++ b/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c
@ -3,11 +3,9 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
 #include "zbuild.h"
+#include "zutil.h"
 #include "functable.h"
-
-#include "crc32_fold.h"
-
-#include <limits.h>
+#include "crc32.h"

 Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
    crc->value = CRC32_INITIAL_VALUE;
@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
 }

 Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
-    crc->value = functable.crc32(crc->value, src, len);
+    crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
    memcpy(dst, src, len);
 }

@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui
     * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
     * init_crc is an unused argument in this context */
    Z_UNUSED(init_crc);
-    crc->value = functable.crc32(crc->value, src, len);
+    crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
 }

 Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
--- a/3rdparty/zlib-ng/arch/generic/generic_functions.h
+++ b/3rdparty/zlib-ng/arch/generic/generic_functions.h
@ -0,0 +1,106 @@
+/* generic_functions.h -- generic C implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
+Z_INTERNAL void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL void     crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
+
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+
+uint32_t chunksize_c(void);
+uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+void     inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
+
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
+#  endif
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
+#  endif
+#endif
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+void     slide_hash_c(deflate_state *s);
+
+uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+#  if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+#    ifdef HAVE_BUILTIN_CTZ
+        uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+#    endif
+#    if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+        uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+#    endif
+#  endif
+
+// Generic longest_match_slow variants.
+// NOTE(review): unlike the longest_match_unaligned_* declarations above, the _32 and _64
+// variants here are not gated on HAVE_BUILTIN_CTZ / HAVE_BUILTIN_CTZLL, while the
+// longest_match_slow_generic selection below only picks them when those builtins exist.
+// Confirm the definitions use matching guards.
+uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+#  if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
+#    ifdef UNALIGNED64_OK
+        uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+#    endif
+#  endif
+
+
+// Select generic implementation for longest_match, longest_match_slow, compare256 functions.
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+#    define longest_match_generic longest_match_unaligned_64
+#    define longest_match_slow_generic longest_match_slow_unaligned_64
+#    define compare256_generic compare256_unaligned_64
+#  elif defined(HAVE_BUILTIN_CTZ)
+#    define longest_match_generic longest_match_unaligned_32
+#    define longest_match_slow_generic longest_match_slow_unaligned_32
+#    define compare256_generic compare256_unaligned_32
+#  else
+#    define longest_match_generic longest_match_unaligned_16
+#    define longest_match_slow_generic longest_match_slow_unaligned_16
+#    define compare256_generic compare256_unaligned_16
+#  endif
+#else
+#  define longest_match_generic longest_match_c
+#  define longest_match_slow_generic longest_match_slow_c
+#  define compare256_generic compare256_c
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Generic code
+#  define native_adler32 adler32_c
+#  define native_adler32_fold_copy adler32_fold_copy_c
+#  define native_chunkmemset_safe chunkmemset_safe_c
+#  define native_chunksize chunksize_c
+#  define native_crc32 PREFIX(crc32_braid)
+#  define native_crc32_fold crc32_fold_c
+#  define native_crc32_fold_copy crc32_fold_copy_c
+#  define native_crc32_fold_final crc32_fold_final_c
+#  define native_crc32_fold_reset crc32_fold_reset_c
+#  define native_inflate_fast inflate_fast_c
+#  define native_slide_hash slide_hash_c
+#  define native_longest_match longest_match_generic
+#  define native_longest_match_slow longest_match_slow_generic
+#  define native_compare256 compare256_generic
+#endif
+
+#endif
--- a/3rdparty/zlib-ng/arch/generic/slide_hash_c.c
+++ b/3rdparty/zlib-ng/arch/generic/slide_hash_c.c
@ -1,6 +1,6 @@
 /* slide_hash.c -- slide hash table C implementation
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

--- a/3rdparty/zlib-ng/arch/power/chunkset_power8.c
+++ b/3rdparty/zlib-ng/arch/power/chunkset_power8.c
@ -4,7 +4,7 @@

 #ifdef POWER8_VSX
 #include <altivec.h>
-#include "../../zbuild.h"
+#include "zbuild.h"

 typedef vector unsigned char chunk_t;

--- a/3rdparty/zlib-ng/arch/power/compare256_power9.c
+++ b/3rdparty/zlib-ng/arch/power/compare256_power9.c
@ -5,8 +5,10 @@

 #ifdef POWER9
 #include <altivec.h>
-#include "../../zbuild.h"
-#include "../../zendian.h"
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
+#include "zendian.h"

 /* Older versions of GCC misimplemented semantics for these bit counting builtins.
 * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
--- a/3rdparty/zlib-ng/arch/power/power_features.c
+++ b/3rdparty/zlib-ng/arch/power/power_features.c
@ -1,16 +1,19 @@
 /* power_features.c - POWER feature check
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
- * Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

 #ifdef HAVE_SYS_AUXV_H
 #  include <sys/auxv.h>
 #endif
+#ifdef POWER_NEED_AUXVEC_H
+#  include <linux/auxvec.h>
+#endif
 #ifdef __FreeBSD__
 #  include <machine/cpu.h>
 #endif
-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "power_features.h"

 void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
--- a/3rdparty/zlib-ng/arch/power/power_features.h
+++ b/3rdparty/zlib-ng/arch/power/power_features.h
@ -4,8 +4,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#ifndef POWER_H_
-#define POWER_H_
+#ifndef POWER_FEATURES_H_
+#define POWER_FEATURES_H_

 struct power_cpu_features {
    int has_altivec;
@ -15,4 +15,4 @@ struct power_cpu_features {

 void Z_INTERNAL power_check_features(struct power_cpu_features *features);

-#endif /* POWER_H_ */
+#endif /* POWER_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/power/power_functions.h
+++ b/3rdparty/zlib-ng/arch/power/power_functions.h
@ -0,0 +1,67 @@
+/* power_functions.h -- POWER implementations for arch-specific functions.
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FUNCTIONS_H_
+#define POWER_FUNCTIONS_H_
+
+#ifdef PPC_VMX
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+void slide_hash_vmx(deflate_state *s);
+#endif
+
+#ifdef POWER8_VSX
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t chunksize_power8(void);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
+void slide_hash_power8(deflate_state *s);
+void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef POWER9
+uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Power - VMX
+#  if defined(PPC_VMX) && defined(__ALTIVEC__)
+#    undef native_adler32
+#    define native_adler32 adler32_vmx
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_vmx
+#  endif
+// Power8 - VSX
+#  if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
+#    undef native_adler32
+#    define native_adler32 adler32_power8
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_power8
+#    undef native_chunksize
+#    define native_chunksize chunksize_power8
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_power8
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_power8
+#  endif
+#  if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
+#    undef native_crc32
+#    define native_crc32 crc32_power8
+#  endif
+// Power9
+#  if defined(POWER9) && defined(_ARCH_PWR9)
+#    undef native_compare256
+#    define native_compare256 compare256_power9
+#    undef native_longest_match
+#    define native_longest_match longest_match_power9
+#    undef native_longest_match_slow
+#    define native_longest_match_slow longest_match_slow_power9
+#  endif
+#endif
+
+#endif /* POWER_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
+++ b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
@ -9,8 +9,8 @@
 #include <riscv_vector.h>
 #include <stdint.h>

-#include "../../zbuild.h"
-#include "../../adler32_p.h"
+#include "zbuild.h"
+#include "adler32_p.h"

 static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
    /* split Adler-32 into component sums */
--- a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
+++ b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
@ -6,7 +6,9 @@

 #ifdef RISCV_RVV

-#include "../../zbuild.h"
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 #include <riscv_vector.h>
--- a/3rdparty/zlib-ng/arch/riscv/riscv_features.c
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.c
@ -1,10 +1,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/auxv.h>
 #include <sys/utsname.h>

-#include "../../zbuild.h"
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+#  include <sys/auxv.h>
+#endif
+
+#include "zbuild.h"
 #include "riscv_features.h"

 #define ISA_V_HWCAP (1 << ('v' - 'a'))
@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea
 }

 void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
    unsigned long hw_cap = getauxval(AT_HWCAP);
+#else
+    unsigned long hw_cap = 0;
+#endif
    features->has_rvv = hw_cap & ISA_V_HWCAP;
 }

--- a/3rdparty/zlib-ng/arch/riscv/riscv_features.h
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.h
@ -6,8 +6,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#ifndef RISCV_H_
-#define RISCV_H_
+#ifndef RISCV_FEATURES_H_
+#define RISCV_FEATURES_H_

 struct riscv_cpu_features {
    int has_rvv;
@ -15,4 +15,4 @@ struct riscv_cpu_features {

 void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);

-#endif /* RISCV_H_ */
+#endif /* RISCV_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/riscv/riscv_functions.h
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_functions.h
@ -0,0 +1,49 @@
+/* riscv_functions.h -- RISCV implementations for arch-specific functions.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FUNCTIONS_H_
+#define RISCV_FUNCTIONS_H_
+
+#ifdef RISCV_RVV
+uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_rvv(void);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
+void slide_hash_rvv(deflate_state *s);
+void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// RISCV - RVV
+#  if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
+#    undef native_adler32
+#    define native_adler32 adler32_rvv
+#    undef native_adler32_fold_copy
+#    define native_adler32_fold_copy adler32_fold_copy_rvv
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_rvv
+#    undef native_chunksize
+#    define native_chunksize chunksize_rvv
+#    undef native_compare256
+#    define native_compare256 compare256_rvv
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_rvv
+#    undef native_longest_match
+#    define native_longest_match longest_match_rvv
+#    undef native_longest_match_slow
+#    define native_longest_match_slow longest_match_slow_rvv
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_rvv
+#  endif
+#endif
+
+#endif /* RISCV_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
+++ b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
@ -8,18 +8,16 @@

 #include <riscv_vector.h>

-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    size_t vl;
    while (entries > 0) {
        vl = __riscv_vsetvl_e16m4(entries);
        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
-        vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
-        vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
-        v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
-        __riscv_vse16_v_u16m4(table, v_tab, vl);
+        vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
+        __riscv_vse16_v_u16m4(table, v_diff, vl);
        table += vl, entries -= vl;
    }
 }
--- a/3rdparty/zlib-ng/arch/s390/Makefile.in
+++ b/3rdparty/zlib-ng/arch/s390/Makefile.in
@ -0,0 +1,48 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+VGFMAFLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+s390_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+s390_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+dfltcc_deflate.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_deflate.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_inflate.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+dfltcc_inflate.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+crc32-vx.o:
+	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+crc32-vx.lo:
+	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/3rdparty/zlib-ng/arch/s390/README.md
+++ b/3rdparty/zlib-ng/arch/s390/README.md
@ -0,0 +1,277 @@
+# Introduction
+
+This directory contains SystemZ deflate hardware acceleration support.
+It can be enabled using the following build commands:
+
+    $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
+    $ make
+
+or
+
+    $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
+    $ make
+
+When built like this, zlib-ng would compress using hardware on level 1,
+and using software on all other levels. Decompression will always happen
+in hardware. In order to enable hardware compression for levels 1-6
+(i.e. to make it used by default) one could add
+`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
+
+SystemZ deflate hardware acceleration is available on [IBM z15](
+https://www.ibm.com/products/z15) and newer machines under the name [
+"Integrated Accelerator for zEnterprise Data Compression"](
+https://www.ibm.com/support/z-content-solutions/compression/). The
+programming interface to it is a machine instruction called DEFLATE
+CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
+of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
+the code and the rest of this document refer to this feature simply as
+"DFLTCC".
+
+# Performance
+
+Performance figures are published [here](
+https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
+). The compression speed-up can be as high as 110x and the decompression
+speed-up can be as high as 15x.
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore care should be taken when using
+hardware compression when reproducible results are desired. In
+particular, the zlib-ng-specific `zng_deflateSetParams` call allows setting
+the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every single zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software, or, in case
+this is not possible, gracefully fail.
+
+# Code structure
+
+All SystemZ-specific code lives in `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer, and a window. Parameter blocks are stored alongside zlib states;
+buffers are forwarded from the caller; and the window - which must be
+4k-aligned and is always 64k large - is managed using the `PAD_WINDOW()`,
+`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`, `DEFLATE_ADJUST_WINDOW_SIZE()`
+and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
+
+Software and hardware window formats do not match, therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls gracefully fail.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
+in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has a different compression ratio
+than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by `DEFLATE_HOOK()` and
+`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
+window on its own, calling `updatewindow()` is suppressed using
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums, therefore, whenever it's used, software checksumming is
+suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
+macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream - the exact conditions for that are
+determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in
+`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
+categories:
+
+* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
+* Translating between software and hardware data formats, e.g.
+  `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+  `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple, however,
+various quirks in both software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+  is not the case, then it returns `0`, making `deflate()` use some
+  other function in order to compress in software. Otherwise it returns
+  `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+  when explicitly instructed to do so by the software. Furthermore,
+  whether to use fixed or dynamic Huffman tables must also be determined
+  by the software. Since looking at data in order to gather statistics
+  would negate performance benefits, the following approach is used: the
+  first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
+  block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
+  dynamic blocks.
+* Writing EOBS. Block Closing Control bit in the parameter block
+  instructs DFLTCC to write EOBS, however, certain conditions need to be
+  met: input data length must be non-zero or Continuation Flag must be
+  set. To put this in simpler terms, DFLTCC will silently refuse to
+  write EOBS if this is the only thing that it is asked to do. Since the
+  code has to be able to emit EOBS in software anyway, in order to avoid
+  tricky corner cases Block Closing Control is never used. Whether to
+  write EOBS is instead controlled by `soft_bcc` variable.
+* Triggering block post-processing. Depending on flush mode, `deflate()`
+  must perform various additional actions when a block or a stream ends.
+  `dfltcc_deflate()` informs `deflate()` about this using
+  `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
+  and Sub-Byte Boundary. Certain fields cannot be translated and must
+  persist untouched in the parameter block between calls, for example,
+  Continuation Flag or Continuation State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+  quite intertwined and pervasive. The general idea here is that the
+  code must not do anything in software - whether explicitly by e.g.
+  calling `send_eobs()`, or implicitly - by returning to `deflate()`
+  with certain return and `*result` values, when Continuation Flag is
+  set.
+* Ending streams. When a new block is started and flush mode is
+  `Z_FINISH`, Block Header Final parameter block bit is used to mark
+  this block as final. However, sometimes an empty final block is
+  needed, and, unfortunately, just like with EOBS, DFLTCC will silently
+  refuse to do this. The general idea of DFLTCC implementation is to
+  rely as much as possible on the existing code. Here in order to do
+  this, the code pretends that it does not support DFLTCC, which makes
+  `deflate()` call a software compression function, which writes an
+  empty final block. Whether this is required is controlled by
+  `need_empty_block` variable.
+* Error handling. This is simply converting
+  Operation-Ending-Supplemental Code to string. Errors can only happen
+  due to things like memory corruption, and therefore they don't affect
+  the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of deflate block header) and it's responsible for the following:
+
+* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
+  Unfortunately, there is no way to ask DFLTCC to stop decompressing on
+  block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+  `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `whave` and History Length or `wnext` and
+  History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+  and is controlled by `last` state field.
+* Error handling. Like deflate, error handling comprises
+  Operation-Ending-Supplemental Code to string conversion. Unlike
+  deflate, errors may happen due to bad inputs, therefore they are
+  propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given the complexity of the DFLTCC machine instruction, it is not clear whether
+QEMU TCG will ever support it. At the time of writing, one has to have
+access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
+DFLTCC is a non-privileged instruction, neither special VM/LPAR
+configuration nor root access is required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
+testing. There is no official IBM Z GitHub Actions runner, so we build
+one inspired by `anup-kodlekere/gaplib`.
+Future updates to actions-runner might need an updated patch. The .net
+version number patch has been separated into a separate file to avoid a
+need for constantly changing the patch.
+
+## Configuring the builder.
+
+### Install prerequisites.
+
+```
+sudo dnf install podman
+```
+
+### Add actions-runner service.
+
+```
+sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+```
+
+### Create a config file (needs a GitHub personal access token).
+
+```
+# Create file /etc/actions-runner
+repo=<owner>/<name>
+access_token=<ghp_***>
+```
+
+Access token should have the repo scope, consult
+https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
+for details.
+
+### Autostart actions-runner.
+
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+## Rebuilding the container
+
+In order to update the `gaplib-actions-runner` podman container, e.g. to get the
+latest OS security fixes, follow these steps:
+```
+# Stop actions-runner service
+sudo systemctl stop actions-runner
+
+# Delete old container
+sudo podman container rm gaplib-actions-runner
+
+# Delete old image
+sudo podman image rm localhost/zlib-ng/actions-runner
+
+# Build image
+sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .
+
+# Build container
+sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner
+
+# Start actions-runner service
+sudo systemctl start actions-runner
+```
--- a/3rdparty/zlib-ng/arch/s390/crc32-vx.c
+++ b/3rdparty/zlib-ng/arch/s390/crc32-vx.c
@ -0,0 +1,222 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
+    /*
+     * The CRC-32 constant block contains reduction constants to fold and
+     * process particular chunks of the input data stream in parallel.
+     *
+     * For the CRC-32 variants, the constants are precomputed according to
+     * these definitions:
+     *
+     *      R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+     *      R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+     *      R3 = [(x128+32 mod P'(x) << 32)]'   << 1
+     *      R4 = [(x128-32 mod P'(x) << 32)]'   << 1
+     *      R5 = [(x64 mod P'(x) << 32)]'       << 1
+     *      R6 = [(x32 mod P'(x) << 32)]'       << 1
+     *
+     *      The bitreflected Barrett reduction constant, u', is defined as
+     *      the bit reversal of floor(x**64 / P(x)).
+     *
+     *      where P(x) is the polynomial in the normal domain and the P'(x) is the
+     *      polynomial in the reversed (bitreflected) domain.
+     *
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+     *
+     *      P(x)  = 0x04C11DB7
+     *      P'(x) = 0xEDB88320
+     */
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* byte-reversal (LE<->BE) mask */
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};                                     /* R2, R1 */
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};                                     /* R4, R3 */
+    const uv2di r5 = {0, 0x163CD6124};                                                 /* R5 */
+    const uv2di ru_poly = {0, 0x1F7011641};                                            /* u' */
+    const uv2di crc_poly = {0, 0x1DB710641};                                           /* P'(x) << 1 */
+
+    /*
+     * Load the initial CRC value.
+     *
+     * The CRC value is loaded into the rightmost word of the
+     * vector register and is later XORed with the LSB portion
+     * of the loaded input data.
+     */
+    uv2di v0 = {0, 0};
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+    /* Load a 64-byte data chunk and XOR with CRC */
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+    v1 ^= v0;
+    buf += 64;
+    len -= 64;  /* the first 64 bytes are consumed unconditionally, so len must be >= 64 on entry */
+
+    while (len >= 64) {
+        /* Load the next 64-byte data chunk */
+        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+        /*
+         * Perform a GF(2) multiplication of the doublewords in V1 with
+         * the R1 and R2 reduction constants in V0.  The intermediate result
+         * is then folded (accumulated) with the next data chunk in PART1 and
+         * stored in V1. Repeat this step for the register contents
+         * in V2, V3, and V4 respectively.
+         */
+        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
+     * and R4 and accumulate the next 128-bit chunk until a single 128-bit
+     * value remains.
+     */
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+    while (len >= 16) {  /* a final remainder of fewer than 16 bytes is never read by this function */
+        /* Load next data chunk */
+        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+        /* Fold next data chunk */
+        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Set up a vector register for byte shifts.  The shift value must
+     * be loaded in bits 1-4 in byte element 7 of a vector register.
+     * Shift by 8 bytes: 0x40
+     * Shift by 4 bytes: 0x20
+     */
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+    /*
+     * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+     * to move R4 into the rightmost doubleword and set the leftmost
+     * doubleword to 0x1.
+     */
+    v0 = vec_srb(r4r3, (uv2di)v9);
+    v0[0] = 1;
+
+    /*
+     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
+     * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
+     * multiplied by 0x1 and is then XORed with rightmost product.
+     * Implicitly, the intermediate leftmost product becomes padded.
+     */
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+    /*
+     * Now do the final 32-bit fold by multiplying the rightmost word
+     * in V1 with R5 and XOR the result with the remaining bits in V1.
+     *
+     * To achieve this by a single VGFMAG, right shift V1 by a word
+     * and store the result in V2 which is then accumulated.  Use the
+     * vector unpack instruction to load the rightmost half of the
+     * doubleword into the rightmost doubleword element of V1; the other
+     * half is loaded in the leftmost doubleword.
+     * The vector register with CONST_R5 contains the R5 constant in the
+     * rightmost doubleword and the leftmost doubleword is zero to ignore
+     * the leftmost product of V1.
+     */
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
+    v2 = vec_srb(v1, (uv2di)v9);
+    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+    /*
+     * Apply a Barrett reduction to compute the final 32-bit CRC value.
+     *
+     * The input values to the Barrett reduction are the degree-63 polynomial
+     * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+     * constant u.  The Barrett reduction result is the CRC value of R(x) mod
+     * P(x).
+     *
+     * The Barrett reduction algorithm is defined as:
+     *
+     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+     *    3. C(x)  = R(x) XOR T2(x) mod x^32
+     *
+     *  Note: The leftmost doubleword of vector register containing
+     *  CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+     *  is zero and does not contribute to the final result.
+     */
+
+    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+    v2 = vec_unpackl((uv4si)v1);
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+    /*
+     * Compute the GF(2) product of the CRC polynomial with T1(x) in
+     * V2 and XOR the intermediate result, T2(x), with the value in V1.
+     * The final result is stored in word element 2 of V2.
+     */
+    v2 = vec_unpackl((uv4si)v2);
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+    return ((uv4si)v2)[2];
+}
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
+    size_t prealign, aligned, remaining;  /* byte counts: software head, vector body, software tail */
+
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)  /* too short to leave VX_MIN_LEN bytes after worst-case alignment */
+        return PREFIX(crc32_braid)(crc, buf, len);
+
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {  /* buf is not VX_ALIGNMENT-aligned: checksum the head with crc32_braid */
+        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+        len -= prealign;
+        crc = PREFIX(crc32_braid)(crc, buf, prealign);
+        buf += prealign;
+    }
+    aligned = len & ~VX_ALIGN_MASK;   /* len rounded down to a multiple of VX_ALIGNMENT */
+    remaining = len & VX_ALIGN_MASK;  /* leftover bytes, fewer than VX_ALIGNMENT */
+
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;  /* CRC is inverted going into and coming out of the vector kernel */
+
+    if (remaining)
+        crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
+
+    return crc;
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_common.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_common.h
@ -0,0 +1,119 @@
+#ifndef DFLTCC_COMMON_H
+#define DFLTCC_COMMON_H
+
+#include "zutil.h"
+
+/*
+   Parameter Block for Query Available Functions.
+
+   dfltcc_reset_state() fills this block through a DFLTCC_QAF call. fns and
+   fmts are bitmaps tested with is_bit_set(), where bit 0 is the most
+   significant bit of byte 0. dfltcc_detail.h asserts that the structure is
+   DFLTCC_SIZEOF_QAF (32) bytes long.
+ */
+struct dfltcc_qaf_param {
+    char fns[16];                      /* Function bitmap, indexed by DFLTCC_GDHT, DFLTCC_CMPR, DFLTCC_XPND */
+    char reserved1[8];
+    char fmts[2];                      /* Format bitmap, indexed by DFLTCC_FMT0 */
+    char reserved2[6];
+} ALIGNED_(8);
+
+/*
+   Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
+
+   The layout is fixed: dfltcc_detail.h asserts that csb starts at byte 384
+   (DFLTCC_SIZEOF_GDHT_V0) and that the whole structure is 1536 bytes
+   (DFLTCC_SIZEOF_CMPR_XPND_V0), so fields must not be added, removed, resized
+   or reordered. The numeric suffixes of the reserved fields appear to be
+   their bit offsets within the block (to be confirmed against the hardware
+   documentation).
+ */
+struct dfltcc_param_v0 {
+    uint16_t pbvn;                     /* Parameter-Block-Version Number */
+    uint8_t mvn;                       /* Model-Version Number */
+    uint8_t ribm;                      /* Reserved for IBM use */
+    uint32_t reserved32 : 31;
+    uint32_t cf : 1;                   /* Continuation Flag */
+    uint8_t reserved64[8];
+    uint32_t nt : 1;                   /* New Task */
+    uint32_t reserved129 : 1;
+    uint32_t cvt : 1;                  /* Check Value Type */
+    uint32_t reserved131 : 1;
+    uint32_t htt : 1;                  /* Huffman-Table Type */
+    uint32_t bcf : 1;                  /* Block-Continuation Flag */
+    uint32_t bcc : 1;                  /* Block Closing Control */
+    uint32_t bhf : 1;                  /* Block Header Final */
+    uint32_t reserved136 : 1;
+    uint32_t reserved137 : 1;
+    uint32_t dhtgc : 1;                /* DHT Generation Control */
+    uint32_t reserved139 : 5;
+    uint32_t reserved144 : 5;
+    uint32_t sbb : 3;                  /* Sub-Byte Boundary */
+    uint8_t oesc;                      /* Operation-Ending-Supplemental Code */
+    uint32_t reserved160 : 12;
+    uint32_t ifs : 4;                  /* Incomplete-Function Status */
+    uint16_t ifl;                      /* Incomplete-Function Length */
+    uint8_t reserved192[8];
+    uint8_t reserved256[8];
+    uint8_t reserved320[4];
+    uint16_t hl;                       /* History Length */
+    uint32_t reserved368 : 1;
+    uint16_t ho : 15;                  /* History Offset */
+    uint32_t cv;                       /* Check Value */
+    uint32_t eobs : 15;                /* End-of-block Symbol */
+    uint32_t reserved431: 1;
+    uint8_t eobl : 4;                  /* End-of-block Length */
+    uint32_t reserved436 : 12;
+    uint32_t reserved448 : 4;
+    uint16_t cdhtl : 12;               /* Compressed-Dynamic-Huffman Table
+                                          Length */
+    uint8_t reserved464[6];
+    uint8_t cdht[288];                 /* Compressed-Dynamic-Huffman Table */
+    uint8_t reserved[24];
+    uint8_t ribm2[8];                  /* Reserved for IBM use */
+    uint8_t csb[1152];                 /* Continuation-State Buffer */
+} ALIGNED_(8);
+
+/*
+   Extension of inflate_state and deflate_state.
+
+   Bundles the DFLTCC parameter block, the result of the Query Available
+   Functions call, and the buffer into which oesc_msg() formats the text that
+   strm->msg is pointed at.
+ */
+struct dfltcc_state {
+    struct dfltcc_param_v0 param;      /* Parameter block. */
+    struct dfltcc_qaf_param af;        /* Available functions. */
+    char msg[64];                      /* Buffer for strm->msg */
+};
+
+/*
+   Deflate-side DFLTCC state: the common state plus the tuning parameters
+   that dfltcc_reset_deflate_state() initializes from the DFLTCC_* macros.
+ */
+typedef struct {
+    struct dfltcc_state common;
+    uint16_t level_mask;               /* Levels on which to use DFLTCC */
+    uint32_t block_size;               /* New block each X bytes */
+    size_t block_threshold;            /* New block after total_in > X */
+    uint32_t dht_threshold;            /* New block only if avail_in >= X */
+} arch_deflate_state;
+
+/*
+   Inflate-side DFLTCC state; it consists of the common state only.
+ */
+typedef struct {
+    struct dfltcc_state common;
+} arch_inflate_state;
+
+/*
+   History buffer size.
+
+   HB_BITS is the base-2 logarithm of the size and is the only window_bits
+   value for which dfltcc_can_deflate_with_params() allows DFLTCC; HB_SIZE is
+   the size in bytes (32768).
+ */
+#define HB_BITS 15
+#define HB_SIZE (1 << HB_BITS)
+
+/*
+   Sizes of deflate block parts.
+
+   These constants are the terms of DEFLATE_BOUND_COMPLEN(). Judging by their
+   use there, the *_BITS values are bit counts and DFLTCC_MAX_HCLENS,
+   DFLTCC_MAX_HLITS and DFLTCC_MAX_HDISTS are element counts that get
+   multiplied by a per-element bit count.
+ */
+#define DFLTCC_BLOCK_HEADER_BITS 3
+#define DFLTCC_HLITS_COUNT_BITS 5
+#define DFLTCC_HDISTS_COUNT_BITS 5
+#define DFLTCC_HCLENS_COUNT_BITS 4
+#define DFLTCC_MAX_HCLENS 19
+#define DFLTCC_HCLEN_BITS 3
+#define DFLTCC_MAX_HLITS 286
+#define DFLTCC_MAX_HDISTS 30
+#define DFLTCC_MAX_HLIT_HDIST_BITS 7
+#define DFLTCC_MAX_SYMBOL_BITS 16
+#define DFLTCC_MAX_EOBS_BITS 15
+#define DFLTCC_MAX_PADDING_BITS 7
+
+/*
+   Size bound in bytes for source_len input bytes: the worst-case bit counts
+   of the block parts above are added up, with DFLTCC_MAX_SYMBOL_BITS bits
+   charged per input byte, and the sum is converted from bits to bytes by the
+   final shift. The arithmetic is carried out in the type of source_len after
+   the usual promotions, so very large lengths can overflow.
+ */
+#define DEFLATE_BOUND_COMPLEN(source_len) \
+    ((DFLTCC_BLOCK_HEADER_BITS + \
+      DFLTCC_HLITS_COUNT_BITS + \
+      DFLTCC_HDISTS_COUNT_BITS + \
+      DFLTCC_HCLENS_COUNT_BITS + \
+      DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
+      (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
+      (source_len) * DFLTCC_MAX_SYMBOL_BITS + \
+      DFLTCC_MAX_EOBS_BITS + \
+      DFLTCC_MAX_PADDING_BITS) >> 3)
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
@ -0,0 +1,383 @@
+/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
+
+/*
+   Use the following commands to build zlib-ng with DFLTCC compression support:
+
+        $ ./configure --with-dfltcc-deflate
+   or
+
+        $ cmake -DWITH_DFLTCC_DEFLATE=1 .
+
+   and then
+
+        $ make
+*/
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "trees_emit.h"
+#include "dfltcc_deflate.h"
+#include "dfltcc_detail.h"
+
+/*
+   Reset the DFLTCC part of the deflate state: re-initialize the common state
+   and restore the tuning parameters to their build-time defaults.
+*/
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
+    arch_deflate_state *arch = &((deflate_state *)strm->state)->arch;
+
+    dfltcc_reset_state(&arch->common);
+
+    /* Tuning parameters come from the DFLTCC_* macros */
+    arch->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
+    arch->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
+    arch->block_size = DFLTCC_BLOCK_SIZE;
+    arch->level_mask = DFLTCC_LEVEL_MASK;
+}
+
+/*
+   Return 1 if DFLTCC can compress with the given settings, 0 otherwise.
+   Both the requested settings and the functions reported as available for
+   this stream are taken into account.
+*/
+static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
+                                                 int reproducible) {
+    arch_deflate_state *arch = &((deflate_state *)strm->state)->arch;
+    const struct dfltcc_qaf_param *avail = &arch->common.af;
+
+    /* Settings that rule DFLTCC out: level not enabled in the mask, a window
+     * size other than HB_BITS, an unsupported strategy, or reproducible mode.
+     */
+    if ((arch->level_mask & (1 << level)) == 0 ||
+            window_bits != HB_BITS ||
+            (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY) ||
+            reproducible)
+        return 0;
+
+    /* All of GDHT, CMPR and format 0 must be reported as available */
+    return is_bit_set(avail->fns, DFLTCC_GDHT) &&
+           is_bit_set(avail->fns, DFLTCC_CMPR) &&
+           is_bit_set(avail->fmts, DFLTCC_FMT0);
+}
+
+/*
+   Return 1 if DFLTCC can compress with the settings currently stored in the
+   deflate state of strm, 0 otherwise.
+*/
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
+    deflate_state *s = (deflate_state *)strm->state;
+
+    return dfltcc_can_deflate_with_params(strm,
+                                          s->level,
+                                          s->w_bits,
+                                          s->strategy,
+                                          s->reproducible);
+}
+
+/*
+   Invoke the DFLTCC_GDHT function on the input currently available in strm.
+   The remaining length is tracked in a local copy, so strm->avail_in is left
+   as it was.
+*/
+static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
+    deflate_state *s = (deflate_state *)strm->state;
+    size_t in_len = strm->avail_in;
+
+    dfltcc(DFLTCC_GDHT, &s->arch.common.param, NULL, NULL, &strm->next_in, &in_len, NULL);
+}
+
+/*
+   Invoke the DFLTCC_CMPR function with a circular history buffer
+   (state->window) and update next_in/next_out, avail_in/avail_out and
+   total_in/total_out of strm according to the progress made. Returns the
+   condition code of the call.
+*/
+static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
+    deflate_state *s = (deflate_state *)strm->state;
+    size_t in_left = strm->avail_in;
+    size_t out_left = strm->avail_out;
+    dfltcc_cc rc;
+
+    rc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR, &s->arch.common.param,
+                &strm->next_out, &out_left,
+                &strm->next_in, &in_left,
+                s->window);
+
+    /* Input side: account for the consumed bytes */
+    strm->total_in += (strm->avail_in - in_left);
+    strm->avail_in = in_left;
+
+    /* Output side: account for the produced bytes */
+    strm->total_out += (strm->avail_out - out_left);
+    strm->avail_out = out_left;
+
+    return rc;
+}
+
+/*
+   Write the End-of-block Symbol described by param (eobs/eobl) to the output
+   bit buffer in software and flush whatever fits into the output of strm.
+   Any bytes that could not be flushed are moved to the start of pending_buf.
+*/
+static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    /* The top eobl bits of the 15-bit eobs field are bit-reversed and emitted as an eobl-bit code */
+    send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
+    PREFIX(flush_pending)(strm);
+    if (state->pending != 0) {
+        /* The remaining data is located in pending_out[0:pending]. If someone
+         * calls put_byte() - this might happen in deflate() - the byte will be
+         * placed into pending_buf[pending], which is incorrect. Move the
+         * remaining data to the beginning of pending_buf so that put_byte() is
+         * usable again.
+         */
+        memmove(state->pending_buf, state->pending_out, state->pending);
+        state->pending_out = state->pending_buf;
+    }
+#ifdef ZLIB_DEBUG
+    /* Debug-only bookkeeping of the number of bits written */
+    state->compressed_len += param->eobl;
+#endif
+}
+
+/*
+   Compress the data of strm with DFLTCC.
+
+   Returns 0 when the remaining work is left to the caller: DFLTCC cannot be
+   used with the current settings, or a Z_FINISH request still needs a
+   trailing empty block. Otherwise a block_state is stored in *result and 1 is
+   returned. The function loops (label "again") until either all input is
+   consumed or the output buffer is full.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
+    deflate_state *state = (deflate_state *)strm->state;
+    arch_deflate_state *dfltcc_state = &state->arch;
+    struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
+    uInt masked_avail_in;
+    dfltcc_cc cc;
+    int need_empty_block;
+    int soft_bcc;
+    int no_flush;
+
+    if (!PREFIX(dfltcc_can_deflate)(strm)) {
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        return 0;
+    }
+
+again:
+    /* Per-iteration state: nothing is masked and no software block closing is planned yet */
+    masked_avail_in = 0;
+    soft_bcc = 0;
+    no_flush = flush == Z_NO_FLUSH;
+
+    /* No input data. Return, except when Continuation Flag is set, which means
+     * that DFLTCC has buffered some output in the parameter block and needs to
+     * be called again in order to flush it.
+     */
+    if (strm->avail_in == 0 && !param->cf) {
+        /* A block is still open, and the hardware does not support closing
+         * blocks without adding data. Thus, close it manually.
+         */
+        if (!no_flush && param->bcf) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+        }
+        /* Let one of deflate_* functions write a trailing empty block. */
+        if (flush == Z_FINISH)
+            return 0;
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        /* Trigger block post-processing if necessary. */
+        *result = no_flush ? need_more : block_done;
+        return 1;
+    }
+
+    /* There is an open non-BFINAL block, we are not going to close it just
+     * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
+     * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
+     * DHT in order to adapt to a possibly changed input data distribution.
+     */
+    if (param->bcf && no_flush &&
+            strm->total_in > dfltcc_state->block_threshold &&
+            strm->avail_in >= dfltcc_state->dht_threshold) {
+        if (param->cf) {
+            /* We need to flush the DFLTCC buffer before writing the
+             * End-of-block Symbol. Mask the input data and proceed as usual.
+             */
+            masked_avail_in += strm->avail_in;
+            strm->avail_in = 0;
+            no_flush = 0;
+        } else {
+            /* DFLTCC buffer is empty, so we can manually write the
+             * End-of-block Symbol right away.
+             */
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        }
+    }
+
+    /* No space for compressed data. If we proceed, dfltcc_cmpr() will return
+     * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
+     * set BCF=1, which is wrong. Avoid complications and return early.
+     */
+    if (strm->avail_out == 0) {
+        *result = need_more;
+        return 1;
+    }
+
+    /* The caller gave us too much data. Pass only one block worth of
+     * uncompressed data to DFLTCC and mask the rest, so that on the next
+     * iteration we start a new block.
+     */
+    if (no_flush && strm->avail_in > dfltcc_state->block_size) {
+        masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
+        strm->avail_in = dfltcc_state->block_size;
+    }
+
+    /* When we have an open non-BFINAL deflate block and caller indicates that
+     * the stream is ending, we need to close an open deflate block and open a
+     * BFINAL one.
+     */
+    need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
+
+    /* Translate stream to parameter block */
+    param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
+    if (!no_flush)
+        /* We need to close a block. Always do this in software - when there is
+         * no input data, the hardware will not honor BCC. */
+        soft_bcc = 1;
+    if (flush == Z_FINISH && !param->bcf)
+        /* We are about to open a BFINAL block, set Block Header Final bit
+         * until the stream ends.
+         */
+        param->bhf = 1;
+    /* DFLTCC-CMPR will write to next_out, so make sure that buffers with
+     * higher precedence are empty.
+     */
+    Assert(state->pending == 0, "There must be no pending bytes");
+    Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
+    /* Hand the pending bits (fewer than 8) over through the first output byte */
+    param->sbb = (unsigned int)state->bi_valid;
+    if (param->sbb > 0)
+        *strm->next_out = (unsigned char)state->bi_buf;
+    /* Honor history and check value */
+    param->nt = 0;
+    if (state->wrap == 1)
+        param->cv = strm->adler;
+    else if (state->wrap == 2)
+        param->cv = ZSWAP32(state->crc_fold.value);
+
+    /* When opening a block, choose a Huffman-Table Type */
+    if (!param->bcf) {
+        if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
+            param->htt = HTT_FIXED;
+        else {
+            param->htt = HTT_DYNAMIC;
+            dfltcc_gdht(strm);
+        }
+    }
+
+    /* Deflate */
+    do {
+        cc = dfltcc_cmpr(strm);
+        if (strm->avail_in < 4096 && masked_avail_in > 0)
+            /* We are about to call DFLTCC with a small input buffer, which is
+             * inefficient. Since there is masked data, there will be at least
+             * one more DFLTCC call, so skip the current one and make the next
+             * one handle more data.
+             */
+            break;
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
+    state->bi_valid = param->sbb;
+    if (state->bi_valid == 0)
+        state->bi_buf = 0; /* Avoid accessing next_out */
+    else
+        state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
+    if (state->wrap == 1)
+        strm->adler = param->cv;
+    else if (state->wrap == 2)
+        state->crc_fold.value = ZSWAP32(param->cv);
+
+    /* Unmask the input data */
+    strm->avail_in += masked_avail_in;
+    masked_avail_in = 0;
+
+    /* If we encounter an error, it means there is a bug in DFLTCC call */
+    Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
+
+    /* Update Block-Continuation Flag. It will be used to check whether to call
+     * GDHT the next time.
+     */
+    if (cc == DFLTCC_CC_OK) {
+        if (soft_bcc) {
+            /* Close the block in software and schedule the next block boundary */
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        } else
+            param->bcf = 1;
+        if (flush == Z_FINISH) {
+            if (need_empty_block)
+                /* Make the current deflate() call also close the stream */
+                return 0;
+            else {
+                bi_windup(state);
+                *result = finish_done;
+            }
+        } else {
+            if (flush == Z_FULL_FLUSH)
+                param->hl = 0; /* Clear history */
+            *result = flush == Z_NO_FLUSH ? need_more : block_done;
+        }
+    } else {
+        /* The call did not complete: the block stays open and more input or output space is needed */
+        param->bcf = 1;
+        *result = need_more;
+    }
+    if (strm->avail_in != 0 && strm->avail_out != 0)
+        goto again; /* deflate() must use all input or all output */
+    return 1;
+}
+
+/*
+   Switching between hardware and software compression.
+
+   DFLTCC does not support all zlib settings, e.g. generation of non-compressed
+   blocks or alternative window sizes. When such settings are applied on the
+   fly with deflateParams, we need to convert between hardware and software
+   window formats.
+*/
+/*
+   Return 1 if the stream shows signs of earlier use: input was already
+   consumed, the New Task flag has been cleared, or history is present.
+   Return 0 otherwise.
+*/
+static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
+    const struct dfltcc_param_v0 *pb = &((deflate_state *)strm->state)->arch.common.param;
+
+    if (strm->total_in > 0)
+        return 1;
+    if (pb->nt == 0)
+        return 1;
+    return pb->hl > 0;
+}
+
+/*
+   Hook for parameter changes. If the new level/strategy would switch between
+   DFLTCC and non-DFLTCC operation after the stream has already been used,
+   *flush is set to Z_FULL_FLUSH so that the old data is discarded rather
+   than converted between window formats. Always returns Z_OK.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
+    deflate_state *s = (deflate_state *)strm->state;
+    int usable_before = PREFIX(dfltcc_can_deflate)(strm);
+    int usable_after = dfltcc_can_deflate_with_params(strm, level, s->w_bits, strategy, s->reproducible);
+
+    /* Nothing to do when the mode stays the same or the stream was not used
+     * yet; otherwise request a full flush.
+     */
+    if (usable_after != usable_before && dfltcc_was_deflate_used(strm))
+        *flush = Z_FULL_FLUSH;
+
+    return Z_OK;
+}
+
+/*
+   Return 1 if deflation is complete for the given flush mode and 0 if more
+   work remains.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
+    deflate_state *s = (deflate_state *)strm->state;
+    const struct dfltcc_param_v0 *pb = &s->arch.common.param;
+
+    /* A Z_FULL_FLUSH that ran out of output space may have closed the block
+     * without resetting the compression state, so it cannot be considered
+     * done yet.
+     */
+    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
+        return 0;
+
+    /* Without DFLTCC there is nothing further to wait for */
+    if (!PREFIX(dfltcc_can_deflate)(strm))
+        return 1;
+
+    /* With DFLTCC, deflation is done only when no data is buffered
+     * (Continuation Flag clear) and no block is open (Block-Continuation Flag
+     * clear).
+     */
+    return !pb->cf && !pb->bcf;
+}
+
+/*
+   Return 1 only if the requested reproducible setting differs from the
+   current one and the stream has not been used yet; 0 otherwise.
+*/
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
+    deflate_state *s = (deflate_state *)strm->state;
+
+    if (reproducible == s->reproducible)
+        return 0;
+    return !dfltcc_was_deflate_used(strm);
+}
+
+/*
+   Preloading history.
+*/
+/*
+   Append the dictionary to the DFLTCC history kept in state->window and
+   adjust strstart/block_start. Always returns Z_OK.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length) {
+    deflate_state *s = (deflate_state *)strm->state;
+
+    append_history(&s->arch.common.param, s->window, dictionary, dict_length);
+
+    /* A non-zero strstart makes the zlib header carry FDICT */
+    s->strstart = 1;
+    /* Keep block_start in step with strstart for deflate_stored */
+    s->block_start = s->strstart;
+
+    return Z_OK;
+}
+
+/*
+   Copy the current history into dictionary and store its length in
+   *dict_length. Either output pointer may be NULL, in which case that output
+   is skipped. Always returns Z_OK.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
+    deflate_state *s = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *pb = &s->arch.common.param;
+
+    if (dictionary != NULL)
+        get_history(pb, s->window, dictionary);
+
+    if (dict_length != NULL)
+        *dict_length = pb->hl;
+
+    return Z_OK;
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
@ -0,0 +1,58 @@
+#ifndef DFLTCC_DEFLATE_H
+#define DFLTCC_DEFLATE_H
+
+#include "deflate.h"
+#include "dfltcc_common.h"
+
+/* DFLTCC compression entry points; the DEFLATE_* macros in this header expand to calls of these functions. */
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
+
+/*
+   When DFLTCC can be used for strm, return from the enclosing function with
+   the result of dfltcc_deflate_set_dictionary(); otherwise fall through.
+ */
+#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/*
+   When DFLTCC can be used for strm, return from the enclosing function with
+   the result of dfltcc_deflate_get_dictionary(); otherwise fall through.
+ */
+#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* Resets the DFLTCC part of the deflate state. */
+#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
+
+/*
+   Call dfltcc_deflate_params(), which may overwrite *hook_flush, and return
+   from the enclosing function if it reports Z_STREAM_ERROR.
+ */
+#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
+    do { \
+        int err; \
+\
+        err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
+        if (err == Z_STREAM_ERROR) \
+            return err; \
+    } while (0)
+
+/* Tells whether deflation is complete for a given flush mode. */
+#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
+
+/*
+   Replace complen with DEFLATE_BOUND_COMPLEN(source_len) when
+   deflateStateCheck(strm) is non-zero or DFLTCC can be used for strm.
+ */
+#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
+    do { \
+        if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
+            (complen) = DEFLATE_BOUND_COMPLEN(source_len); \
+    } while (0)
+
+/* Non-zero when DFLTCC can be used for strm. */
+#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
+
+/* The DFLTCC compression routine. */
+#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
+
+/* Non-zero when DFLTCC cannot be used for strm; dfltcc_deflate() exchanges the check value with the parameter block itself. */
+#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
+
+/* Tells whether the reproducible setting may be changed. */
+#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
+
+/* Window size of at least HB_SIZE bytes. */
+#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
@ -0,0 +1,275 @@
+#include "zbuild.h"
+#include <stdio.h>
+
+#ifdef HAVE_SYS_SDT_H
+#include <sys/sdt.h>
+#endif
+
+/*
+   Tuning parameters.
+
+   Every value may be overridden by defining the macro before this header is
+   included. dfltcc_reset_deflate_state() copies the first four into
+   arch_deflate_state; dfltcc_reset_state() stores DFLTCC_RIBM in the
+   parameter block.
+ */
+/* Bit N set means that DFLTCC is used for compression level N; the default enables level 1 only. */
+#ifndef DFLTCC_LEVEL_MASK
+#define DFLTCC_LEVEL_MASK 0x2
+#endif
+/* Initial value of block_size. */
+#ifndef DFLTCC_BLOCK_SIZE
+#define DFLTCC_BLOCK_SIZE 1048576
+#endif
+/* Initial value of block_threshold. */
+#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
+#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
+#endif
+/* Initial value of dht_threshold. */
+#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
+#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
+#endif
+/* Value stored in the ribm field of the parameter block. */
+#ifndef DFLTCC_RIBM
+#define DFLTCC_RIBM 0
+#endif
+
+/*
+   Compile-time assertion: declares an unused static char array whose size is
+   1 when c holds and -1 (a compile error) when it does not. msg is pasted
+   into the array name and therefore has to be a valid identifier suffix.
+ */
+#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
+
+/* Size in bytes of the Query Available Functions parameter block. */
+#define DFLTCC_SIZEOF_QAF 32
+static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
+
+/*
+   Test bit n of the bitmap bits. Bits are numbered from the most significant
+   bit of byte 0. Returns non-zero (the masked byte value, not necessarily 1)
+   if the bit is set and 0 otherwise.
+ */
+static inline int is_bit_set(const char *bits, int n) {
+    const int byte_index = n / 8;
+    const int mask = 1 << (7 - (n % 8));
+
+    return bits[byte_index] & mask;
+}
+
+/*
+   Clear bit n of the bitmap bits, using the same numbering as is_bit_set():
+   bit 0 is the most significant bit of byte 0.
+ */
+static inline void clear_bit(char *bits, int n) {
+    const int byte_index = n / 8;
+    const int mask = 1 << (7 - (n % 8));
+
+    bits[byte_index] &= ~mask;
+}
+
+/* Number of the facility bit that is_dfltcc_enabled() tests. */
+#define DFLTCC_FACILITY 151
+
+/*
+   Return non-zero if facility bit DFLTCC_FACILITY is set in the facility
+   list stored by the STFLE instruction, 0 otherwise.
+ */
+static inline int is_dfltcc_enabled(void) {
+    /* Just enough doublewords to contain bit DFLTCC_FACILITY */
+    uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
+    Z_REGISTER uint8_t r0 __asm__("r0");
+
+    memset(facilities, 0, sizeof(facilities));
+    /* r0 carries the number of doublewords in facilities[] minus one */
+    r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
+    /* STFLE is supported since z9-109 and only in z/Architecture mode. When
+     * compiling with -m31, gcc defaults to ESA mode, however, since the kernel
+     * is 64-bit, it's always z/Architecture mode at runtime.
+     */
+    __asm__ volatile(
+#ifndef __clang__
+                     ".machinemode push\n"
+                     ".machinemode zarch\n"
+#endif
+                     "stfle %[facilities]\n"
+#ifndef __clang__
+                     ".machinemode pop\n"
+#endif
+                     : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
+    /* The facility list uses the same MSB-first bit numbering as is_bit_set() */
+    return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
+}
+
+/* Bit number tested in the fmts bitmap of the Query Available Functions block. */
+#define DFLTCC_FMT0 0
+
+/* Values of the cvt (Check Value Type) field of the parameter block. */
+#define CVT_CRC32 0
+#define CVT_ADLER32 1
+/* Values of the htt (Huffman-Table Type) field of the parameter block. */
+#define HTT_FIXED 0
+#define HTT_DYNAMIC 1
+
+/*
+   Parameter block sizes in bytes: the part in front of csb, which is what
+   dfltcc() instruments for DFLTCC_GDHT, and the whole block, which is what it
+   instruments for DFLTCC_CMPR and DFLTCC_XPND. The assertions pin the layout
+   of struct dfltcc_param_v0 to these sizes.
+ */
+#define DFLTCC_SIZEOF_GDHT_V0 384
+#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
+static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
+static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
+
+/*
+   Turn an Operation-Ending-Supplemental Code into a message. Returns NULL
+   for code 0 (successful completion); otherwise formats the message into buf
+   and returns buf. buf must be large enough for the formatted text; the only
+   buffer visible here is the 64-byte msg array of struct dfltcc_state.
+ */
+static inline z_const char *oesc_msg(char *buf, int oesc) {
+    if (oesc != 0x00) {
+        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
+        return buf;
+    }
+
+    /* Successful completion */
+    return NULL;
+}
+
+/*
+   C wrapper for the DEFLATE CONVERSION CALL instruction.
+
+   dfltcc_cc is the condition code returned by dfltcc(). The value 2 carries
+   two names; presumably which one applies depends on the function that was
+   executed (to be confirmed against the hardware documentation).
+ */
+typedef enum {
+    DFLTCC_CC_OK = 0,
+    DFLTCC_CC_OP1_TOO_SHORT = 1,
+    DFLTCC_CC_OP2_TOO_SHORT = 2,
+    DFLTCC_CC_OP2_CORRUPT = 2,
+    DFLTCC_CC_AGAIN = 3,
+} dfltcc_cc;
+
+/*
+   Function codes passed to dfltcc() as fn. DFLTCC_GDHT, DFLTCC_CMPR and
+   DFLTCC_XPND are also used as bit numbers in the fns bitmap of the Query
+   Available Functions block.
+ */
+#define DFLTCC_QAF 0
+#define DFLTCC_GDHT 1
+#define DFLTCC_CMPR 2
+#define DFLTCC_XPND 4
+/* Flag ORed into the function code by callers that pass a circular history buffer. */
+#define HBT_CIRCULAR (1 << 7)
+/* Mask that strips HBT_CIRCULAR and leaves the function code. */
+#define DFLTCC_FN_MASK ((1 << 7) - 1)
+
+/*
+   Split the history described by param into the part that starts at
+   param->ho and runs at most to the end of the circular buffer (*hl_high)
+   and the part that wrapped around to offset 0 (*hl_low).
+ */
+static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
+    size_t high_part = MIN(param->hl, HB_SIZE - param->ho);
+
+    *hl_high = high_part;
+    *hl_low = param->hl - high_part;
+}
+
+/*
+   Report an upcoming read/write access to both fragments of the circular
+   history buffer to the instrumentation.
+ */
+static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t high_len, low_len;
+
+    get_history_lengths(param, &high_len, &low_len);
+
+    /* Fragment starting at the history offset */
+    instrument_read_write(hist + param->ho, high_len);
+    /* Wrapped-around fragment at the start of the buffer */
+    instrument_read_write(hist, low_len);
+}
+
+/*
+   Tell MSan that both fragments of the circular history buffer have been
+   written.
+ */
+static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t high_len, low_len;
+
+    get_history_lengths(param, &high_len, &low_len);
+
+    /* Fragment starting at the history offset */
+    __msan_unpoison(hist + param->ho, high_len);
+    /* Wrapped-around fragment at the start of the buffer */
+    __msan_unpoison(hist, low_len);
+}
+
+/*
+   Execute one DFLTCC function.
+
+   fn is the function code, optionally ORed with HBT_CIRCULAR, and param is
+   the parameter block. op1/len1 describe the buffer the function writes to
+   and op2/len2 the buffer it reads from. Each of the four may be NULL, in
+   which case a NULL pointer or zero length is handed to the instruction.
+   hist is the history buffer. On return, every non-NULL pointer and length
+   is updated with the value the instruction left in the corresponding
+   register. Returns the condition code of the instruction (0..3).
+ */
+static inline dfltcc_cc dfltcc(int fn, void *param,
+                               unsigned char **op1, size_t *len1,
+                               z_const unsigned char **op2, size_t *len2, void *hist) {
+    unsigned char *t2 = op1 ? *op1 : NULL;
+    unsigned char *orig_t2 = t2;       /* Start of the output, kept for the MSan unpoisoning below */
+    size_t t3 = len1 ? *len1 : 0;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
+    size_t t5 = len2 ? *len2 : 0;
+    /* Variables pinned to the registers named in their declarations */
+    Z_REGISTER int r0 __asm__("r0");
+    Z_REGISTER void *r1 __asm__("r1");
+    Z_REGISTER unsigned char *r2 __asm__("r2");
+    Z_REGISTER size_t r3 __asm__("r3");
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4");
+    Z_REGISTER size_t r5 __asm__("r5");
+    int cc;
+
+    /* Insert pre-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        instrument_write(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
+        instrument_read(t4, t5);
+        break;
+    case DFLTCC_CMPR:
+    case DFLTCC_XPND:
+        instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        instrument_read(t4, t5);
+        instrument_write(t2, t3);
+        instrument_read_write_hist(param, hist);
+        break;
+    }
+
+    /* Load the pinned registers: function code, parameter block, then the
+     * address/length pairs for output (r2/r3) and input (r4/r5).
+     */
+    r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
+    __asm__ volatile(
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     "ipm %[cc]\n"
+                     : [r2] "+r" (r2)
+                     , [r3] "+r" (r3)
+                     , [r4] "+r" (r4)
+                     , [r5] "+r" (r5)
+                     , [cc] "=r" (cc)
+                     : [r0] "r" (r0)
+                     , [r1] "r" (r1)
+                     , [hist] "r" (hist)
+#ifdef HAVE_SYS_SDT_H
+                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
+#endif
+                     : "cc", "memory");
+    /* Read the updated addresses and lengths back from the registers */
+    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
+
+    /* Insert post-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        __msan_unpoison(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
+        break;
+    case DFLTCC_CMPR:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        /* One extra byte is unpoisoned when the output ends inside a byte (sbb != 0) */
+        __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
+        msan_unpoison_hist(param, hist);
+        break;
+    case DFLTCC_XPND:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2);
+        msan_unpoison_hist(param, hist);
+        break;
+    }
+
+    /* Hand the updated values back to the caller */
+    if (op1)
+        *op1 = t2;
+    if (len1)
+        *len1 = t3;
+    if (op2)
+        *op2 = t4;
+    if (len2)
+        *len2 = t5;
+    /* IPM stored the condition code in bits 28-29 of cc */
+    return (cc >> 28) & 3;
+}
+
+/*
+   Round p up to the next multiple of size and cast the result back to the
+   type of p. The mask arithmetic is only correct when size is a power of two.
+ */
+#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
+
+/*
+   Reset the common DFLTCC state: record which functions are available (all
+   zero when the DFLTCC facility is not enabled) and start from a zeroed
+   parameter block with the New Task flag set.
+ */
+static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
+    struct dfltcc_param_v0 *pb = &dfltcc_state->param;
+    struct dfltcc_qaf_param *avail = &dfltcc_state->af;
+
+    /* Available functions: the query result is produced in the parameter
+     * block and then copied over.
+     */
+    if (!is_dfltcc_enabled()) {
+        memset(avail, 0, sizeof(*avail));
+    } else {
+        dfltcc(DFLTCC_QAF, pb, NULL, NULL, NULL, NULL, NULL);
+        memmove(avail, pb, sizeof(*avail));
+    }
+
+    /* Parameter block */
+    memset(pb, 0, sizeof(*pb));
+    pb->ribm = DFLTCC_RIBM;
+    pb->nt = 1;
+}
+
+/*
+   Copy a state object together with its extension: size rounded up to a
+   multiple of 8, plus extension_size bytes, are copied from src to dst.
+ */
+static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
+    const uInt total = ALIGN_UP(size, 8) + extension_size;
+
+    memcpy(dst, src, total);
+}
+
+/*
+   Append count bytes from buf to the circular history buffer and update
+   param->hl / param->ho accordingly. At most the last HB_SIZE bytes of buf
+   are used; when the history overflows, its oldest bytes are discarded.
+ */
+static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
+                                  const unsigned char *buf, uInt count) {
+    size_t write_pos;
+    size_t room;
+    size_t total;
+
+    /* Only the trailing HB_SIZE (32K) bytes can ever be kept */
+    if (count > HB_SIZE) {
+        buf += count - HB_SIZE;
+        count = HB_SIZE;
+    }
+
+    /* The new data goes right behind the existing history */
+    write_pos = (param->ho + param->hl) % HB_SIZE;
+    room = HB_SIZE - write_pos;
+    if (count <= room) {
+        /* Fits before the end of the buffer - a single copy is enough */
+        memcpy(history + write_pos, buf, count);
+    } else {
+        /* Wraps around - fill up to the end, then continue at the start */
+        memcpy(history + write_pos, buf, room);
+        memcpy(history, buf + room, count - room);
+    }
+
+    total = param->hl + count;
+    if (total > HB_SIZE) {
+        /* Overflow - advance the offset past the discarded bytes */
+        param->ho = (param->ho + (total - HB_SIZE)) % HB_SIZE;
+        param->hl = HB_SIZE;
+    } else {
+        /* Everything fits - the history simply grows */
+        param->hl = total;
+    }
+}
+
+/*
+   Copy the circular history into buf in linear order: first the fragment
+   that starts at param->ho, then the fragment that starts at offset 0.
+   param->hl bytes are written in total.
+ */
+static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
+                               unsigned char *buf) {
+    size_t high_len, low_len;
+
+    get_history_lengths(param, &high_len, &low_len);
+
+    memcpy(buf, history + param->ho, high_len);
+    memcpy(buf + high_len, history, low_len);
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c
@ -0,0 +1,191 @@
+/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
+
+/*
+   Use the following commands to build zlib-ng with DFLTCC decompression support:
+
+        $ ./configure --with-dfltcc-inflate
+   or
+
+        $ cmake -DWITH_DFLTCC_INFLATE=1 .
+
+   and then
+
+        $ make
+*/
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "dfltcc_inflate.h"
+#include "dfltcc_detail.h"
+
+/* Resets the DFLTCC state embedded in the inflate state of `strm`. */
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
+    dfltcc_reset_state(&((struct inflate_state *)strm->state)->arch.common);
+}
+
+/*
+   Returns 1 when the facility bits recorded in the stream's DFLTCC state
+   include both the DFLTCC_XPND function and the DFLTCC_FMT0 format,
+   otherwise 0.
+ */
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
+    struct dfltcc_state *hw = &((struct inflate_state *)strm->state)->arch.common;
+
+    /* The expand function must be available ... */
+    if (!is_bit_set(hw->af.fns, DFLTCC_XPND))
+        return 0;
+    /* ... and so must format 0 */
+    return is_bit_set(hw->af.fmts, DFLTCC_FMT0) ? 1 : 0;
+}
+
+/*
+   Issues a single DFLTCC_XPND call (with HBT_CIRCULAR) for `strm`.
+   The available byte counts are handed over as size_t temporaries and
+   written back to the stream afterwards. Returns the condition code
+   reported by dfltcc().
+ */
+static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    size_t in_left = strm->avail_in;
+    size_t out_left = strm->avail_out;
+    dfltcc_cc result;
+
+    result = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
+                    &state->arch.common.param,
+                    &strm->next_out, &out_left,
+                    &strm->next_in, &in_left,
+                    state->window);
+
+    /* Publish the updated byte counts */
+    strm->avail_in = in_left;
+    strm->avail_out = out_left;
+    return result;
+}
+
+/*
+   Runs DFLTCC_XPND on the current input/output of `strm` and mirrors the
+   outcome back into the inflate state. Invoked through INFLATE_TYPEDO_HOOK.
+
+   strm   stream whose state is a struct inflate_state
+   flush  flush mode; Z_BLOCK and Z_TREES are not handled by DFLTCC
+   ret    only written (with Z_STREAM_ERROR) when DFLTCC cannot be disabled
+          for a Z_BLOCK/Z_TREES request
+
+   Returns DFLTCC_INFLATE_SOFTWARE when the caller should proceed in
+   software, DFLTCC_INFLATE_BREAK when it should stop, and
+   DFLTCC_INFLATE_CONTINUE when it should keep processing with the
+   state->mode set here.
+ */
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+    dfltcc_cc cc;
+
+    if (flush == Z_BLOCK || flush == Z_TREES) {
+        /* DFLTCC does not support stopping on block boundaries */
+        if (PREFIX(dfltcc_inflate_disable)(strm)) {
+            /* Switching to software is impossible - fail the call */
+            *ret = Z_STREAM_ERROR;
+            return DFLTCC_INFLATE_BREAK;
+        } else
+            return DFLTCC_INFLATE_SOFTWARE;
+    }
+
+    /* state->last is set below once dfltcc reports DFLTCC_CC_OK */
+    if (state->last) {
+        if (state->bits != 0) {
+            /* Presumably skips the partially consumed input byte - TODO confirm */
+            strm->next_in++;
+            strm->avail_in--;
+            state->bits = 0;
+        }
+        state->mode = CHECK;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+
+    /* No input left and param->cf is clear - nothing to do for now */
+    if (strm->avail_in == 0 && !param->cf)
+        return DFLTCC_INFLATE_BREAK;
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0)
+        state->wsize = 1U << state->wbits;
+
+    /* Translate stream to parameter block */
+    /* CRC-32 is selected only when both (wrap & 4) and flags are set */
+    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
+    param->sbb = state->bits;
+    if (param->hl)
+        param->nt = 0; /* Honor history for the first block */
+    if (state->wrap & 4)
+        /* With flags set the check value is byte-swapped for the parameter block */
+        param->cv = state->flags ? ZSWAP32(state->check) : state->check;
+
+    /* Inflate; repeat for as long as DFLTCC_CC_AGAIN is reported */
+    do {
+        cc = dfltcc_xpnd(strm);
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
+    state->last = cc == DFLTCC_CC_OK;
+    state->bits = param->sbb;
+    if (state->wrap & 4)
+        /* Inverse of the swap applied before the call */
+        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
+    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
+        /* Report an error if stream is corrupted */
+        state->mode = BAD;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+    state->mode = TYPEDO;
+    /* Break if operands are exhausted, otherwise continue looping */
+    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
+        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
+}
+
+/*
+   Returns 1 when the `nt` field of the stream's DFLTCC parameter block
+   is zero, and 0 otherwise.
+ */
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
+    const struct inflate_state *state = (const struct inflate_state *)strm->state;
+
+    return state->arch.common.param.nt == 0;
+}
+
+/*
+   Rotates the range [start, end) in place so that the byte at `pivot`
+   ends up at `start`.
+   Swap-based algorithm modelled after
+   https://cplusplus.com/reference/algorithm/rotate/
+ */
+static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
+    unsigned char *first = start;
+    unsigned char *next = pivot;
+
+    while (first != next) {
+        unsigned char saved = *first;
+
+        *first++ = *next;
+        *next++ = saved;
+
+        if (next == end)
+            /* Source ran off the end - restart from the current pivot */
+            next = pivot;
+        else if (first == pivot)
+            /* Destination reached the pivot - the pivot moves forward */
+            pivot = next;
+    }
+}
+
+/*
+   Switches the stream from DFLTCC to software decompression.
+   Returns 0 on success (including when DFLTCC was not usable to begin
+   with) and 1 when DFLTCC has already been used on this stream.
+ */
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *hw = &state->arch.common;
+    unsigned char *window = state->window;
+
+    if (PREFIX(dfltcc_can_inflate)(strm)) {
+        /* Once DFLTCC has decompressed some data there is not enough
+         * information to resume in software, so the request must fail.
+         */
+        if (PREFIX(dfltcc_was_inflate_used)(strm))
+            return 1;
+
+        /* Clear the facility bits so that dfltcc_can_inflate() reports 0 from now on */
+        memset(&hw->af, 0, sizeof(hw->af));
+        /* Bring the window from the hardware layout into the software one */
+        rotate(window, window + hw->param.ho, window + HB_SIZE);
+        state->whave = state->wnext = MIN(hw->param.hl, state->wsize);
+    }
+    return 0;
+}
+
+/*
+   Preloads history: appends `dict_length` bytes of `dictionary` to the
+   window via append_history() and marks the dictionary as present.
+   Always returns Z_OK.
+*/
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    /* The window size is derived from wbits on first use */
+    if (!state->wsize)
+        state->wsize = 1U << state->wbits;
+
+    append_history(&state->arch.common.param, state->window, dictionary, dict_length);
+    state->havedict = 1;
+    return Z_OK;
+}
+
+/*
+   Retrieves the current history.
+   When both `dictionary` and the window are non-NULL, the history is copied
+   into `dictionary` via get_history(). When `dict_length` is non-NULL it
+   receives the history length (`hl`). Always returns Z_OK.
+ */
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt *dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *pb = &state->arch.common.param;
+
+    if (dictionary != NULL && state->window != NULL)
+        get_history(pb, state->window, dictionary);
+
+    if (dict_length != NULL)
+        *dict_length = pb->hl;
+
+    return Z_OK;
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h
@ -0,0 +1,67 @@
+#ifndef DFLTCC_INFLATE_H
+#define DFLTCC_INFLATE_H
+
+#include "dfltcc_common.h"
+
+/* Resets the DFLTCC state embedded in the inflate state of strm. */
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
+/* Returns non-zero when the facility bits stored for strm include DFLTCC_XPND and DFLTCC_FMT0. */
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
+/* Result of dfltcc_inflate; see INFLATE_TYPEDO_HOOK for how each value is acted upon. */
+typedef enum {
+    DFLTCC_INFLATE_CONTINUE, /* the hook executes `break` */
+    DFLTCC_INFLATE_BREAK,    /* the hook jumps to inf_leave */
+    DFLTCC_INFLATE_SOFTWARE, /* the hook falls through to the code that follows it */
+} dfltcc_inflate_action;
+/* Runs DFLTCC_XPND on strm; *ret is set to Z_STREAM_ERROR if a Z_BLOCK/Z_TREES request cannot fall back to software. */
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
+/* Returns non-zero when the nt field of the parameter block is zero. */
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
+/* Switches strm to software inflate; returns 1 if DFLTCC was already used, otherwise 0. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
+/* Appends dict_length bytes of dictionary to the history window; returns Z_OK. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length);
+/* Copies the history into dictionary (if non-NULL) and stores its length in *dict_length (if non-NULL); returns Z_OK. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt* dict_length);
+
+/* Reset hook: expands to the name of the DFLTCC state reset function. */
+#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
+
+/* Prime hook: makes the enclosing function return Z_STREAM_ERROR when DFLTCC
+ * cannot be disabled. The bits and value arguments are not used.
+ */
+#define INFLATE_PRIME_HOOK(strm, bits, value) \
+    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
+
+/* TYPEDO hook: when DFLTCC is usable, runs dfltcc_inflate between RESTORE()
+ * and LOAD(), then either executes `break` (CONTINUE), jumps to inf_leave
+ * (BREAK) or falls through (SOFTWARE). Relies on `ret`, RESTORE, LOAD and the
+ * inf_leave label being available at the expansion site. Not wrapped in
+ * do/while so that `break` applies to the enclosing statement.
+ */
+#define INFLATE_TYPEDO_HOOK(strm, flush) \
+    if (PREFIX(dfltcc_can_inflate)((strm))) { \
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+
+/* Non-zero when DFLTCC is not usable for strm. */
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+/* Non-zero when DFLTCC is not usable for strm. */
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+/* Mark hook: makes the enclosing function return -(1L << 16) once DFLTCC has been used. */
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+
+/* Sync point hook: makes the enclosing function return Z_STREAM_ERROR once DFLTCC has been used. */
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
+/* Set-dictionary hook: when DFLTCC is usable, the enclosing function returns the DFLTCC result directly. */
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* Get-dictionary hook: when DFLTCC is usable, the enclosing function returns the DFLTCC result directly. */
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* Yields a window size of at least HB_SIZE. Note: evaluates to MAX(n, HB_SIZE), so n may be expanded more than once. */
+#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/s390_features.c
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.c
@ -0,0 +1,14 @@
+#include "zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS HWCAP_S390_VX
+#endif
+
+/*
+   Fills in `features` from the AT_HWCAP auxiliary vector entry:
+   has_vx is set from the HWCAP_S390_VXRS bit.
+ */
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    unsigned long hwcap = getauxval(AT_HWCAP);
+
+    features->has_vx = hwcap & HWCAP_S390_VXRS;
+}
--- a/Show More
+++ b/Show More