Compare commits


60 Commits

Author SHA1 Message Date
Alexander Smorkalov
01f9bdae4c
Merge pull request #27723 from fanchenkong1:fastconv-wasm-scalar-opt
DNN: Add large scalar fastconv kernel for WASM/no-SIMD build
2025-10-31 22:50:41 +03:00
Alexander Smorkalov
5c73895cba
Merge pull request #27949 from cudawarped:fix_cudart_dependency
[cuda] Add cudart target to all modules which use CUDA
2025-10-31 22:43:45 +03:00
Alexander Smorkalov
3516c57c81
Merge pull request #27944 from DDmytro:ecc-sanity-update
perf(video): add sanity checks and update ECC baselines for grayscale and color inputs.
2025-10-30 22:16:27 +03:00
Alexander Smorkalov
0c10aecacd
Merge pull request #27945 from MaximSmolskiy:add_comments_for_undistortPoints
Add comments for undistortPoints
2025-10-30 16:11:11 +03:00
Alexander Smorkalov
c69e20a495
Merge pull request #27948 from vrabaud:c_headers
Get code to compile without FFmpeg's libavdevice
2025-10-30 16:10:13 +03:00
cudawarped
6adb985679 [cuda] Add cudart to all modules which use it 2025-10-28 17:26:13 +02:00
Alexander Smorkalov
d7f6201219
Merge pull request #27939 from Kumataro:fix27938
openexr: suppress -Wnontrivial-memcall warnings
2025-10-28 13:31:03 +03:00
Vincent Rabaud
3a4a88c116 Get code to compile without FFmpeg's libavdevice 2025-10-28 10:46:47 +01:00
MaximSmolskiy
2faff661c6 Add comments for undistortPoints 2025-10-28 03:13:16 +03:00
Dmytro Dadyka
37497fa2e3 perf(video): add sanity checks and update ECC baselines for grayscale and color inputs 2025-10-28 01:35:26 +02:00
Kumataro
f836f67203 openexr: suppress -Wnontrivial-memcall warnings 2025-10-26 10:01:40 +09:00
Alexander Smorkalov
52393156c8
Merge pull request #27524 from DDmytro:multichannel_ecc
Extend findTransformECC and computeECC to support multichannel input (1 or 3 channels)
2025-10-24 11:24:15 +03:00
Stefan Dragnev
12b64b0e2a
Merge pull request #27927 from tailsu:tailsu/tiff-error-handler
tiff: use a per-TIFF error handler #27927

libtiff 4.5 introduced [per-TIFF error handlers](https://libtiff.gitlab.io/libtiff/functions/TIFFOpenOptions.html). This PR removes the global OpenCV error handlers and uses per-handle error handlers instead. This reduces the risks associated with modifying global state, e.g. another library also setting the global error handlers and OpenCV clobbering them.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
2025-10-23 13:38:10 +03:00
Alexander Smorkalov
09c893477e
Merge pull request #27443 from ritamelo06:gapi-feature
feat: G-API: Custom stream sources in Python (#27276)
2025-10-21 12:59:06 +03:00
Jay Pol
47d71530a7
Merge pull request #27866 from JayPol559:4.x
Fix HDR tutorial result mismatch by adding gamma note #27866

This PR fixes #22219 by clarifying the gamma correction value in the HDR tutorial.

The function cv.createTonemap() has a default gamma value of 1.0. To match the tutorial example results, gamma should be explicitly set to 2.2. This note has been added to the Tonemap HDR image section of the tutorial.
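
For reference, a minimal sketch of the corrected call (synthetic HDR data; in the tutorial the input comes from MergeDebevec):

```
import cv2 as cv
import numpy as np

hdr = np.random.rand(240, 320, 3).astype(np.float32) * 4.0  # synthetic 32-bit float HDR data
tonemap = cv.createTonemap(gamma=2.2)  # default gamma is 1.0; 2.2 matches the tutorial figures
ldr = np.clip(tonemap.process(hdr), 0, 1)  # clip, since mapped values may fall outside [0..1]
```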
2025-10-21 12:30:58 +03:00
Alexander Smorkalov
543e9ed010
Merge pull request #27926 from MaximSmolskiy:add_comments_for_fisheye_undistortPoints
Add comments for fisheye::undistortPoints
2025-10-21 08:16:38 +03:00
MaximSmolskiy
b65d66e797 Add comments for fisheye::undistortPoints 2025-10-21 04:56:22 +03:00
Alexander Smorkalov
50cd7b8b4b
Merge pull request #27923 from cudawarped:move_throw_no_cuda_to_stub
[core][cuda] Move throw_no_cuda to an independent stub
2025-10-20 19:54:28 +03:00
Alexander Smorkalov
53a96cdc0b
Merge pull request #27922 from Kumataro:fix27921
imgcodecs: GDAL: show GDAL version at OpenCV configuration
2025-10-20 16:18:58 +03:00
Alexander Smorkalov
fa276bba3d
Merge pull request #27919 from Kumataro:fix27830
imgcodecs: Workaround for image flipping bug in older GDAL FITS drivers
2025-10-20 15:18:25 +03:00
cudawarped
ff216e8796 [core][cuda] Move throw_no_cuda to an independent stub so it is not included in the same file that requires cudart 2025-10-20 13:10:23 +03:00
Kumataro
346308124a imgcodecs: GDAL: show GDAL version at OpenCV configuration 2025-10-20 18:37:04 +09:00
Alexander Smorkalov
11422da957
Merge pull request #27920 from MaximSmolskiy:support_qr_decomposition_for_stereoCalibrate
Support QR decomposition for stereoCalibrate
2025-10-20 09:49:12 +03:00
Alexander Smorkalov
70d69d07c8
Merge pull request #27914 from MaximSmolskiy:refactor_minEnclosingCircle
Refactor minEnclosingCircle
2025-10-20 08:40:28 +03:00
MaximSmolskiy
455720aae8 Support QR decomposition for stereoCalibrate 2025-10-19 19:34:13 +03:00
Kumataro
75f738b3d1 imgcodecs: Workaround for image flipping bug in older GDAL FITS drivers 2025-10-19 15:38:00 +09:00
Alexander Smorkalov
cb8f398a87
Merge pull request #27915 from MaximSmolskiy:fix_ml_kdtree_findNearest
Fix ml::KDTree::findNearest
2025-10-17 12:01:12 +03:00
Dmitry Kurtaev
f1a99760ad
Merge pull request #27841 from dkurt:ffmpeg_camera
Open camera device by index through FFmpeg #27841

### Pull Request Readiness Checklist

resolves https://github.com/opencv/opencv/issues/26812

Example of an explicit backend option (similar to `-f v4l2` on the command line):
```
export OPENCV_FFMPEG_CAPTURE_OPTIONS="f;v4l2"
```
see https://trac.ffmpeg.org/wiki/Capture/Webcam for available options
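
A hedged usage sketch (assumes an FFmpeg-enabled OpenCV build and a V4L2 webcam at index 0):

```
import os
import cv2 as cv

# Request the v4l2 input format, mirroring `ffmpeg -f v4l2` on the command line.
os.environ["OPENCV_FFMPEG_CAPTURE_OPTIONS"] = "f;v4l2"

cap = cv.VideoCapture(0, cv.CAP_FFMPEG)  # open camera index 0 through the FFmpeg backend
ok, frame = cap.read()
print("frame captured:", ok)
cap.release()
```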

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
2025-10-17 10:23:24 +03:00
MaximSmolskiy
98a12515fe Fix ml::KDTree::findNearest 2025-10-16 22:58:55 +03:00
MaximSmolskiy
e45d07e95b Refactor minEnclosingCircle 2025-10-16 22:18:39 +03:00
Alexander Smorkalov
c75cd1047b
Merge pull request #27911 from Kumataro:refix26899
core: fix lut_data data type
2025-10-16 15:55:40 +03:00
s-trinh
f5014c179f
Merge pull request #27736 from s-trinh:use_USAC_P3P_in_solvePnP
Update Gao P3P with Ding P3P #27736

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake

---

The current Gao P3P implementation does not cover all the degenerate cases, **see last line** in: 6d889ee74c/modules/calib3d/src/p3p.cpp (L211-L221)

See also:
- https://github.com/opencv/opencv/issues/4854
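
For context, a minimal sketch of the user-facing call this solver change affects (synthetic data; SOLVEPNP_P3P requires exactly four point pairs):

```
import cv2 as cv
import numpy as np

obj = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0]], np.float32)  # exactly 4 object points
img = np.array([[320, 240], [420, 240], [320, 340], [420, 340]], np.float32)
K = np.array([[800, 0, 320], [0, 800, 240], [0, 0, 1]], np.float64)  # synthetic intrinsics
ok, rvec, tvec = cv.solvePnP(obj, img, K, None, flags=cv.SOLVEPNP_P3P)
```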

---

<details>

<summary>OBSOLETE</summary>

To fix this, the USAC P3P from OpenCV 5 is used instead: 7e6da007cd/modules/3d/src/usac/pnp_solver.cpp (L282)

---

## Some results

### Old P3P vs new

In the following video, I have tried to highlight the viewpoints which cause issues:

https://github.com/user-attachments/assets/97bec6a6-4043-4509-b50e-a9856d6423bd

| | Old P3P    | New P3P |
| -------- | ------- | ------- |
| Mean (ms)  | 0.045701 | 0.024816 |
| Median (ms) | 0.025146 | 0.023193 |
| Std (ms)    | 0.028953    | 0.006124 |

### New P3P vs AP3P

https://github.com/user-attachments/assets/eaeb21dc-3ffd-4b6c-9902-4352f824aa45

The AP3P method is superior both in terms of accuracy and computation time:

| | New P3P    | AP3P |
| -------- | ------- | ------- |
| Mean (ms)  | 0.043750 | 0.023442 |
| Median (ms) | 0.023193 | 0.021484 |
| Std (ms)    | 0.039920 | 0.005265 |

### New P3P vs AP3P (range test)

https://github.com/user-attachments/assets/572e7b7a-2966-4bed-8e0c-b93d863987dc

The implemented P3P method does not work well when the tag is small, at long range.

| | New P3P    | AP3P |
| -------- | ------- | ------- |
| Mean (ms)  | 0.031351 | 0.025189 |
| Median (ms) | 0.022217 | 0.020996 |
| Std (ms)    | 0.024920 | 0.009633 |

---

- I have tried to simplify the P3P code; I hope I did not break the implementation
- calculations are performed using the double type for simplicity.
- code such as the following is redundant, no longer needed, and should be replaced by `cv::Rodrigues`:

6d889ee74c/modules/calib3d/src/usac/pnp_solver.cpp (L395)

</details>
2025-10-16 15:21:15 +03:00
Kumataro
ec630504a7 core: fix lut_data data type 2025-10-16 21:19:12 +09:00
Alexander Smorkalov
6c009c2a14
Merge pull request #27903 from victorget:dev/ipp_hal_threading
Fixed parallel invocation of warp perspective for better performance
2025-10-16 14:55:15 +03:00
Alexander Smorkalov
703f0bb0f2
Merge pull request #27906 from asmorkalov:as/cairosvg
Use cairosvg for pattern rendering to get rid of double resize. #27906

Depends on https://github.com/opencv/ci-gha-workflow/pull/269

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2025-10-16 12:06:35 +03:00
Kumataro
d0d9bd20ed
Merge pull request #27890 from Kumataro:fix26899
core: support 16 bit LUT #27890

Close https://github.com/opencv/opencv/issues/26899
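
A hedged sketch of what the extension enables (assumption: `cv.LUT` now accepts a CV_16U source with a 65536-entry table, per the linked issue; the exact table shape is not confirmed here):

```
import cv2 as cv
import numpy as np

src = np.random.randint(0, 65536, (240, 320)).astype(np.uint16)  # 16-bit input image
lut = (65535 - np.arange(65536)).astype(np.uint16)  # assumed 65536-entry inversion table
dst = cv.LUT(src, lut)  # previously limited to 8-bit sources
```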

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
2025-10-16 12:03:02 +03:00
Alexander Smorkalov
c88b3cb11f
Merge pull request #27910 from MaximSmolskiy:add_corner_cases_tests_for_minEnclosingCircle
Add corner cases tests for minEnclosingCircle
2025-10-16 10:36:42 +03:00
Alexander Smorkalov
1bf1b91a58
Merge pull request #27908 from badshah400:4.x
Fixed linking for HighGUI against Qt 6.9 and newer
2025-10-16 09:27:56 +03:00
Alexander Smorkalov
00493e603a
Merge pull request #27909 from DasoTD:fix-ambiguous-rect-assignment
Fix ambiguous operator error in Rect assignment for C++ modules
2025-10-16 09:23:48 +03:00
MaximSmolskiy
b4d3488b02 Add corner cases tests for minEnclosingCircle 2025-10-15 22:27:27 +03:00
xybuild
da69f6748e Fix ambiguous operator error in Rect assignment for C++ modules 2025-10-15 19:15:01 +01:00
Atri Bhattacharya
e7728bb27d Fixed linking for HighGUI against Qt 6.9 and newer
Use `link_libraries` instead of `add_definitions` to link against Qt 6.9
and newer to avoid un-expanded generator expressions from Qt CMake files
being appended to linker flags when building the HighGUI module.

The actual bug is likely in how Qt CMake files end up with these
un-expanded generator expressions in the first place — see discussion in
https://bugreports.qt.io/browse/QTBUG-134774 — but the recommended way
to link against the library is to use `link_libraries` anyway, so this
fix should do the trick.

Fixes issue #27223.
2025-10-15 21:49:04 +05:30
Alexander Smorkalov
563ef8ff97
Merge pull request #27904 from MaximSmolskiy:fix_minEnclosingCircle
Fix minEnclosingCircle
2025-10-15 11:00:18 +03:00
Maxim Smolskiy
8f0373816a
Merge pull request #27900 from MaximSmolskiy:refactor-minEnclosingCircle-tests
Refactor minEnclosingCircle tests #27900

### Pull Request Readiness Checklist

Separate input points for tests

Before this change, each test's input points depended on the previous ones, and it was not obvious which input points a specific test checked.
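
Illustrative only (Python, not the actual C++ test code): each case now carries its own self-contained input points.

```
import cv2 as cv
import numpy as np

cases = [
    np.array([[0, 0], [10, 0]], np.float32),  # two points: diameter endpoints
    np.array([[0, 0], [10, 0], [5, 10]], np.float32),  # three points
]
for pts in cases:
    (cx, cy), r = cv.minEnclosingCircle(pts)
    print((cx, cy), r)
```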

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2025-10-15 09:48:49 +03:00
MaximSmolskiy
1b09d7390f Fix minEnclosingCircle 2025-10-15 01:34:14 +03:00
Victor Getmanskiy
9e515caeac - Fixed incorrect implementation of the multithreaded call for warp perspective in the IPP HAL.
- Changed the splitting logic to improve performance for all warp-related functions in the IPP HAL in multithreaded mode.
- Removed the IPP 7.0 preprocessor condition from warp in the IPP HAL, since that version is unsupported.
- Added a preprocessor condition to enforce IPP warp calls for custom builds.
2025-10-14 06:21:31 -07:00
Dave Merchant
0ee9c27966
Merge pull request #27810 from D00E:known-foreground-mask
Optional Known Foreground Mask for Background Subtractors #27810

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

### Description
This adds an optional foreground input mask parameter to the MOG2 and KNN background subtractors, in line with issue https://github.com/opencv/opencv/issues/26476

4 tests are added under test_bgfg2.cpp:
2 for each subtractor type (1 with shadow detection and 1 without).
A demo shows the feature with only 3 parameters and with a 4th optional foreground mask for both core subtractor types.

Note: To patch contrib inheritance of the background subtraction class, an empty apply method that throws a not-implemented error is added to the contrib subclasses. This is done to keep the overloaded apply function pure virtual. A contrib PR is to be made and linked shortly.
Contrib Repo Paired Pull Request: https://github.com/opencv/opencv_contrib/pull/4017
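
A hedged sketch of the intended usage (the exact Python signature is an assumption; the PR only states that apply() gains a 4th optional known-foreground-mask argument):

```
import cv2 as cv
import numpy as np

bg = cv.createBackgroundSubtractorMOG2()
frame = np.zeros((240, 320, 3), np.uint8)
known_fg = np.zeros((240, 320), np.uint8)
known_fg[100:140, 150:200] = 255  # pixels known to be foreground
fgmask = bg.apply(frame, None, -1, known_fg)  # hypothetical 4th optional argument
```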
2025-10-14 09:56:06 +03:00
Alexander Smorkalov
0a25225b76
Merge pull request #27897 from asmorkalov:as/alernative_win_arm_neon_check
Enabled fp16 conversions, but disabled NEON FP16 arithmetic on Windows for ARM for now #27897

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2025-10-13 19:40:11 +03:00
Alexander Smorkalov
af49e0cc59
Merge pull request #27898 from asmorkalov:as/gapi_warning_fix_more
More warning fixes in G-API on Windows
2025-10-13 18:35:06 +03:00
Alexander Smorkalov
ca34212b68 More warning fixes in G-API on Windows. 2025-10-13 16:50:55 +03:00
Alexander Smorkalov
bcb2f759b6
Merge pull request #27895 from asmorkalov:as/unreachible_gapi
Removed unreachable code reported by MS Visual Studio on Windows
2025-10-13 14:32:11 +03:00
Alexander Smorkalov
34aee6cb28 Removed unreachable code reported by MS Visual Studio on Windows. 2025-10-13 12:13:37 +03:00
Alexander Smorkalov
24640ec65f
Merge pull request #27893 from asmorkalov:as/no_return_fix
Fixed -Wreturn-type warning in Highgui
2025-10-13 11:41:36 +03:00
Alexander Smorkalov
029c081719 Fixed -Wreturn-type warning in Highgui. 2025-10-13 10:35:00 +03:00
Maxim Smolskiy
514d362ad8
Merge pull request #27876 from MaximSmolskiy:fix_charuco_board_pattern_in_generate_pattern.py
Fix charuco_board_pattern in generate_pattern.py #27876

### Pull Request Readiness Checklist

Fix #27871 

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2025-10-13 09:09:56 +03:00
Alexander Smorkalov
a74374d1ed
Merge pull request #27889 from Xingchen1224:4.x
fix: Refactor tuple creation in nlm.cu for fixing nvcc build…
2025-10-11 17:05:46 +03:00
xcm4
f74d481724 fix: Refactor tuple creation in NLM CUDA kernel for fixing nvcc build error 2025-10-11 14:22:27 +08:00
Fanchen Kong
bfdd7d5a10 Add large scalar kernel for fastconv 2025-08-28 15:25:06 +08:00
Dmytro Dadyka
80b7ef85dc [video][ECC] Add multichannel support and extend ECC tests and docs 2025-07-10 01:25:30 +03:00
Rita Melo
439b6af379 feat: G-API: Custom stream sources in Python (#27276)
Implemented:

- A C++ proxy class PythonCustomStreamSource that implements the
  IStreamSource interface. This class acts as a bridge between
  G-API’s internal streaming engine and user-defined Python
  objects. Internally, it stores a reference to a Python object
  (PyObject*) and is responsible for: calling the Python object’s
  pull() method to retrieve the next frame, calling the descr_of()
  method to obtain the frame format description, acquiring and
  releasing the Python GIL as needed, converting the returned
  numpy.ndarray into cv::Mat, and handling any exceptions or
  conversion errors with proper diagnostics.

- A Python-facing factory function, cv.gapi.wip.make_py_src(),
  which takes a Python object as an argument and wraps it into a
  cv::Ptr<IStreamSource>. Internally, this function constructs a
  PythonCustomStreamSource instance and passes the Python object to
  it. This design allows Python users to define any class that
  implements two methods: pull() and descr_of(). No subclassing or
  special decorators are required on the Python side. The user
  simply needs to implement the expected interface.
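
A hedged sketch of the expected user-side shape (only make_py_src(), pull(), and descr_of() come from the description above; the return conventions and the descriptor construction are assumptions):

```
import cv2 as cv
import numpy as np

class CountdownSource:
    """Any object implementing pull() and descr_of(); no subclassing required."""
    def __init__(self, n=10):
        self.left = n
    def pull(self):
        if self.left == 0:
            return False, None  # assumed end-of-stream convention
        self.left -= 1
        return True, np.zeros((480, 640, 3), np.uint8)  # frame as numpy.ndarray
    def descr_of(self):
        return cv.GMatDesc(cv.CV_8U, 3, (640, 480))  # assumed descriptor construction

source = cv.gapi.wip.make_py_src(CountdownSource())  # wraps into cv::Ptr<IStreamSource>
```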

Co-authored-by: Leonor Francisco <leonor.francisco@tecnico.ulisboa.pt>
2025-06-13 14:11:20 +01:00
76 changed files with 2645 additions and 1732 deletions

View File

@ -110,6 +110,7 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused -Wsign-compare -Wundef -W
-Wreorder
-Wunused-result
-Wimplicit-const-int-float-conversion # clang
-Wno-nontrivial-memcall # for clang 21
)
if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess)

View File

@ -207,12 +207,36 @@ class PatternMaker:
square = SVG("rect", x=x_pos+ch_ar_border, y=y_pos+ch_ar_border, width=self.aruco_marker_size,
height=self.aruco_marker_size, fill="black", stroke="none")
self.g.append(square)
# BUG: https://github.com/opencv/opencv/issues/27871
# The loop below merges white squares horizontally and vertically to exclude a visible grid on the final pattern
for x_ in range(len(img_mark[0])):
for y_ in range(len(img_mark)):
if (img_mark[y_][x_] != 0):
square = SVG("rect", x=x_pos+ch_ar_border+(x_)*side, y=y_pos+ch_ar_border+(y_)*side, width=side,
height=side, fill="white", stroke="white", stroke_width = spacing*0.01)
self.g.append(square)
y_ = 0
while y_ < len(img_mark):
y_start = y_
while y_ < len(img_mark) and img_mark[y_][x_] != 0:
y_ += 1
if y_ > y_start:
rect = SVG("rect", x=x_pos+ch_ar_border+(x_)*side, y=y_pos+ch_ar_border+(y_start)*side, width=side,
height=(y_ - y_start)*side, fill="white", stroke="none")
self.g.append(rect)
y_ += 1
for y_ in range(len(img_mark)):
x_ = 0
while x_ < len(img_mark[0]):
x_start = x_
while x_ < len(img_mark[0]) and img_mark[y_][x_] != 0:
x_ += 1
if x_ > x_start:
rect = SVG("rect", x=x_pos+ch_ar_border+(x_start)*side, y=y_pos+ch_ar_border+(y_)*side, width=(x_-x_start)*side,
height=side, fill="white", stroke="none")
self.g.append(rect)
x_ += 1
def save(self):
c = canvas(self.g, width="%d%s" % (self.width, self.units), height="%d%s" % (self.height, self.units),

View File

@ -11,10 +11,9 @@ class aruco_objdetect_test(NewOpenCVTests):
def test_aruco_dicts(self):
try:
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM
import cairosvg
except:
raise self.skipTest("libraies svglib and reportlab not found")
raise self.skipTest("cairosvg library was not found")
else:
cols = 3
rows = 5
@ -49,8 +48,7 @@ class aruco_objdetect_test(NewOpenCVTests):
os.path.join(basedir, aruco_type_str[aruco_type_i]+'.json.gz'), 0)
pm.make_charuco_board()
pm.save()
drawing = svg2rlg(filesvg)
renderPM.drawToFile(drawing, filepng, fmt='PNG', dpi=72)
cairosvg.svg2png(url=filesvg, write_to=filepng, background_color="white")
from_svg_img = cv.imread(filepng)
_charucoCorners, _charuco_ids_svg, marker_corners_svg, marker_ids_svg = charuco_detector.detectBoard(from_svg_img)
_charucoCorners, _charuco_ids_cv, marker_corners_cv, marker_ids_cv = charuco_detector.detectBoard(from_cv_img)
@ -72,10 +70,9 @@ class aruco_objdetect_test(NewOpenCVTests):
def test_aruco_marker_sizes(self):
try:
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM
import cairosvg
except:
raise self.skipTest("libraies svglib and reportlab not found")
raise self.skipTest("cairosvg library was not found")
else:
cols = 3
rows = 5
@ -106,8 +103,7 @@ class aruco_objdetect_test(NewOpenCVTests):
board_height, "charuco_checkboard", marker_size, os.path.join(basedir, aruco_type_str+'.json.gz'), 0)
pm.make_charuco_board()
pm.save()
drawing = svg2rlg(filesvg)
renderPM.drawToFile(drawing, filepng, fmt='PNG', dpi=72)
cairosvg.svg2png(url=filesvg, write_to=filepng, background_color="white")
from_svg_img = cv.imread(filepng)
#test

View File

@ -385,6 +385,16 @@ if(WITH_GDAL)
else()
set(HAVE_GDAL YES)
ocv_include_directories(${GDAL_INCLUDE_DIR})
# GDAL_VERSION requires CMake 3.14 or later; if not found, GDAL_RELEASE_NAME is used instead.
# See https://cmake.org/cmake/help/latest/module/FindGDAL.html
if(NOT GDAL_VERSION)
if(EXISTS "${GDAL_INCLUDE_DIR}/gdal_version.h")
file(STRINGS "${GDAL_INCLUDE_DIR}/gdal_version.h" gdal_version_str REGEX "^#[\t ]+define[\t ]+GDAL_RELEASE_NAME[\t ]*")
string(REGEX REPLACE "^#[\t ]+define[\t ]+GDAL_RELEASE_NAME[\t ]+\"([^ \\n]*)\"" "\\1" GDAL_VERSION "${gdal_version_str}")
unset(gdal_version_str)
endif()
endif()
endif()
endif()

View File

@ -1,6 +1,8 @@
#include <stdio.h>
#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) || (defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM64EC))
#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__))/* || (defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM64EC)) */
// Windows + ARM64 case disabled: https://github.com/opencv/opencv/issues/25052
#include "arm_neon.h"
int test()
{

View File

@ -1,6 +1,7 @@
#include <stdio.h>
#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) || (defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM64EC))
#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) /* || (defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM64EC)) */
// Windows + ARM64 case disabled: https://github.com/opencv/opencv/issues/25052
#include "arm_neon.h"
float16x8_t vld1q_as_f16(const float* src)
@ -36,7 +37,7 @@ void test()
vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d);
}
#else
#error "FP16 is not supported"
#error "NEON FP16 is not supported"
#endif
int main()

Binary file not shown. (Image size: 59 KiB before, 42 KiB after.)

View File

@ -83,10 +83,15 @@ We map the 32-bit float HDR data into the range [0..1].
Actually, in some cases the values can be larger than 1 or lower than 0, so notice
we will later have to clip the data in order to avoid overflow.
@note: The function `cv.createTonemap()` uses a default gamma value of 1.0.
Set it explicitly to 2.2 to match standard display brightness and ensure consistent tone mapping results.
@code{.py}
# Tonemap HDR image
# Tonemap HDR images using gamma correction (set gamma=2.2 for standard display brightness)
tonemap1 = cv.createTonemap(gamma=2.2)
res_debevec = tonemap1.process(hdr_debevec.copy())
res_robertson = tonemap1.process(hdr_robertson.copy())
@endcode
### 4. Merge exposures using Mertens fusion
@ -125,6 +130,8 @@ You can see the different results but consider that each algorithm have addition
extra parameters that you should fit to get your desired outcome. Best practice is
to try the different methods and see which one performs best for your scene.
The results below were generated with a gamma value of 2.2 during tonemapping.
### Debevec:
![image](images/ldr_debevec.jpg)

View File

@ -8,22 +8,19 @@
#include <opencv2/core/base.hpp>
#include "ipp_utils.hpp"
#ifdef HAVE_IPP_IW
#if IPP_VERSION_X100 >= 810
#if defined(HAVE_IPP_IW)
int ipp_hal_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width,
int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]);
#undef cv_hal_warpAffine
#define cv_hal_warpAffine ipp_hal_warpAffine
#endif
#if IPP_VERSION_X100 >= 810
int ipp_hal_warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width,
int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]);
#undef cv_hal_warpPerspective
#define cv_hal_warpPerspective ipp_hal_warpPerspective
#endif
int ipp_hal_remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
@ -32,5 +29,6 @@ int ipp_hal_remap32f(int src_type, const uchar *src_data, size_t src_step, int s
#undef cv_hal_remap32f
#define cv_hal_remap32f ipp_hal_remap32f
#endif //IPP_VERSION_X100 >= 810
#endif //__IPP_HAL_IMGPROC_HPP__

View File

@ -73,11 +73,37 @@ static inline int ippiSuggestThreadsNum(size_t width, size_t height, size_t elem
return 1;
}
static inline int ippiSuggestRowThreadsNum(size_t width, size_t height, size_t elemSize, size_t payloadSize)
{
int num_threads = cv::getNumThreads();
if(num_threads > 1)
{
long rowThreads = static_cast<long>(height);
// row-based range shall not allow to split rows
num_threads = (rowThreads < num_threads) ? rowThreads : num_threads;
long rows_per_thread = (rowThreads + num_threads - 1) / num_threads;
size_t item_size = width * elemSize; // row size in bytes
if(static_cast<size_t>(item_size * rows_per_thread) < payloadSize)
{
long items_per_thread = IPP_MAX(1L, static_cast<long>(payloadSize / item_size ));
num_threads = static_cast<int>((height + items_per_thread - 1L) / items_per_thread);
}
}
return num_threads;
}
#ifdef HAVE_IPP_IW
static inline int ippiSuggestThreadsNum(const ::ipp::IwiImage &image, double multiplier)
{
return ippiSuggestThreadsNum(image.m_size.width, image.m_size.height, image.m_typeSize*image.m_channels, multiplier);
}
static inline int ippiSuggestRowThreadsNum(const ::ipp::IwiImage &image, size_t payloadSize)
{
return ippiSuggestRowThreadsNum(image.m_size.width, image.m_size.height, image.m_typeSize*image.m_channels, payloadSize);
}
#endif
#endif //__PRECOMP_IPP_HPP__

View File

@ -4,24 +4,28 @@
#include "ipp_hal_imgproc.hpp"
#if IPP_VERSION_X100 >= 810 // integrated IPP warping/remap ABI is available since IPP v8.1
#include <opencv2/core.hpp>
#include <opencv2/core/base.hpp>
#include "precomp_ipp.hpp"
#ifdef HAVE_IPP_IW
#include "iw++/iw.hpp"
#endif
// Uncomment to enforce IPP calls for all configurations supported by IPP
// #define IPP_CALLS_ENFORCED
#define IPP_WARPAFFINE_PARALLEL 1
#define CV_IPP_SAFE_CALL(pFunc, pFlag, ...) if (pFunc(__VA_ARGS__) != ippStsNoErr) {*pFlag = false; return;}
#define CV_TYPE(src_type) (src_type & (CV_DEPTH_MAX - 1))
#ifdef HAVE_IPP_IW
// Warp affine section
#include "iw++/iw.hpp"
class ipp_warpAffineParallel: public cv::ParallelLoopBody
{
public:
ipp_warpAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, IppiInterpolationType _inter, double (&_coeffs)[2][3], ::ipp::IwiBorderType _borderType, IwTransDirection _iwTransDirection, bool *_ok):m_src(src), m_dst(dst)
{
pOk = _ok;
ok = _ok;
inter = _inter;
borderType = _borderType;
@ -31,15 +35,14 @@ public:
for( int j = 0; j < 3; j++ )
coeffs[i][j] = _coeffs[i][j];
*pOk = true;
*ok = true;
}
~ipp_warpAffineParallel() {}
virtual void operator() (const cv::Range& range) const CV_OVERRIDE
{
//CV_INSTRUMENT_REGION_IPP();
if(*pOk == false)
if(*ok == false)
return;
try
@ -49,9 +52,10 @@ public:
}
catch(const ::ipp::IwException &)
{
*pOk = false;
*ok = false;
return;
}
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
}
private:
::ipp::IwiImage &m_src;
@ -62,20 +66,29 @@ private:
::ipp::IwiBorderType borderType;
IwTransDirection iwTransDirection;
bool *pOk;
bool *ok;
const ipp_warpAffineParallel& operator= (const ipp_warpAffineParallel&);
};
#if (IPP_VERSION_X100 >= 700)
int ipp_hal_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width,
int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4])
{
//CV_INSTRUMENT_REGION_IPP();
IppiInterpolationType ippInter = ippiGetInterpolation(interpolation);
IppiInterpolationType ippInter = ippiGetInterpolation(interpolation);
if((int)ippInter < 0 || interpolation > 2)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
#if defined(IPP_CALLS_ENFORCED)
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][3]={{{1, 1, 0}, {0, 0, 0}, {1, 1, 0}, {1, 1, 0}}, //8U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8S
{{1, 1, 0}, {0, 0, 0}, {1, 1, 0}, {1, 1, 0}}, //16U
{{1, 1, 0}, {0, 0, 0}, {1, 1, 0}, {1, 1, 0}}, //16S
{{1, 1, 0}, {0, 0, 0}, {1, 1, 0}, {1, 1, 0}}, //32S
{{1, 1, 0}, {0, 0, 0}, {1, 1, 0}, {1, 1, 0}}, //32F
{{1, 1, 0}, {0, 0, 0}, {1, 1, 0}, {1, 1, 0}}}; //64F
#else // IPP_CALLS_ENFORCED is not defined, results are strictly aligned to OpenCV implementation
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][3]={{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8S
@ -84,6 +97,7 @@ int ipp_hal_warpAffine(int src_type, const uchar *src_data, size_t src_step, int
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32F
{{1, 0, 0}, {0, 0, 0}, {1, 0, 0}, {1, 0, 0}}}; //64F
#endif
if(impl[CV_TYPE(src_type)][CV_MAT_CN(src_type)-1][interpolation] == 0)
{
@ -107,21 +121,24 @@ int ipp_hal_warpAffine(int src_type, const uchar *src_data, size_t src_step, int
for( int j = 0; j < 3; j++ )
coeffs[i][j] = M[i*3 + j];
const int threads = ippiSuggestThreadsNum(iwDst, 2);
int min_payload = 1 << 16; // 64KB shall be minimal per thread to maximize scalability for warping functions
const int threads = ippiSuggestRowThreadsNum(iwDst, min_payload);
if(IPP_WARPAFFINE_PARALLEL && threads > 1)
if (threads > 1)
{
bool ok = true;
bool ok = true;
cv::Range range(0, (int)iwDst.m_size.height);
ipp_warpAffineParallel invoker(iwSrc, iwDst, ippInter, coeffs, ippBorder, iwTransDirection, &ok);
if(!ok)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
parallel_for_(range, invoker, threads*4);
parallel_for_(range, invoker, threads);
if(!ok)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
} else {
}
else
{
CV_INSTRUMENT_FUN_IPP(::ipp::iwiWarpAffine, iwSrc, iwDst, coeffs, iwTransDirection, ippInter, ::ipp::IwiWarpAffineParams(), ippBorder);
}
}
@ -132,8 +149,10 @@ int ipp_hal_warpAffine(int src_type, const uchar *src_data, size_t src_step, int
return CV_HAL_ERROR_OK;
}
#endif
#endif
#endif // HAVE_IPP_IW
// End of Warp affine section
typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
@ -194,18 +213,46 @@ static bool IPPSet(const double value[4], void *dataPointer, int step, IppiSize
return false;
}
#if (IPP_VERSION_X100 >= 810)
// Warp perspective section
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const Ipp8u*, int, Ipp8u*, int,IppiPoint, IppiSize, const IppiWarpSpec*,Ipp8u*);
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveInitFunc)(IppiSize, IppiRect, IppiSize, IppDataType,const double [3][3], IppiWarpDirection, int, IppiBorderType, const Ipp64f *, int, IppiWarpSpec*);
class IPPWarpPerspectiveInvoker :
public cv::ParallelLoopBody
{
// Mem object to simplify IPP memory lifetime control
struct IPPWarpPerspectiveMem
{
IppiWarpSpec* pSpec = nullptr;
Ipp8u* pBuffer = nullptr;
IPPWarpPerspectiveMem() = default;
IPPWarpPerspectiveMem (const IPPWarpPerspectiveMem&) = delete;
IPPWarpPerspectiveMem& operator= (const IPPWarpPerspectiveMem&) = delete;
void AllocateSpec(int size)
{
pSpec = (IppiWarpSpec*)ippMalloc_L(size);
}
void AllocateBuffer(int size)
{
pBuffer = (Ipp8u*)ippMalloc_L(size);
}
~IPPWarpPerspectiveMem()
{
if (nullptr != pSpec) ippFree(pSpec);
if (nullptr != pBuffer) ippFree(pBuffer);
}
};
public:
IPPWarpPerspectiveInvoker(int _src_type, cv::Mat &_src, size_t _src_step, cv::Mat &_dst, size_t _dst_step, IppiInterpolationType _interpolation,
double (&_coeffs)[3][3], int &_borderType, const double _borderValue[4], ippiWarpPerspectiveFunc _func,
double (&_coeffs)[3][3], int &_borderType, const double _borderValue[4], ippiWarpPerspectiveFunc _func, ippiWarpPerspectiveInitFunc _initFunc,
bool *_ok) :
ParallelLoopBody(), src_type(_src_type), src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), inter(_interpolation), coeffs(_coeffs),
borderType(_borderType), func(_func), ok(_ok)
borderType(_borderType), func(_func), initFunc(_initFunc), ok(_ok)
{
memcpy(this->borderValue, _borderValue, sizeof(this->borderValue));
*ok = true;
@ -214,9 +261,13 @@ public:
virtual void operator() (const cv::Range& range) const CV_OVERRIDE
{
//CV_INSTRUMENT_REGION_IPP();
IppiWarpSpec* pSpec = 0;
int specSize = 0, initSize = 0, bufSize = 0; Ipp8u* pBuffer = 0;
IppiPoint dstRoiOffset = {0, 0};
if (*ok == false)
return;
IPPWarpPerspectiveMem mem;
int specSize = 0, initSize = 0, bufSize = 0;
IppiWarpDirection direction = ippWarpBackward; //fixed for IPP
const Ipp32u numChannels = CV_MAT_CN(src_type);
@ -225,48 +276,34 @@ public:
IppiRect srcroi = {0, 0, src.cols, src.rows};
/* Spec and init buffer sizes */
IppStatus status = ippiWarpPerspectiveGetSize(srcsize, srcroi, dstsize, ippiGetDataType(src_type), coeffs, inter, ippWarpBackward, ippiGetBorderType(borderType), &specSize, &initSize);
CV_IPP_SAFE_CALL(ippiWarpPerspectiveGetSize, ok, srcsize, srcroi, dstsize, ippiGetDataType(src_type), coeffs, inter, ippWarpBackward, ippiGetBorderType(borderType), &specSize, &initSize);
pSpec = (IppiWarpSpec*)ippMalloc_L(specSize);
mem.AllocateSpec(specSize);
if (inter == ippLinear)
CV_IPP_SAFE_CALL(initFunc, ok, srcsize, srcroi, dstsize, ippiGetDataType(src_type), coeffs, direction, numChannels, ippiGetBorderType(borderType),
borderValue, 0, mem.pSpec);
CV_IPP_SAFE_CALL(ippiWarpGetBufferSize, ok, mem.pSpec, dstsize, &bufSize);
mem.AllocateBuffer(bufSize);
IppiPoint dstRoiOffset = {0, range.start};
IppiSize dstRoiSize = {dst.cols, range.size()};
auto* pDst = dst.ptr(range.start);
if (borderType == cv::BorderTypes::BORDER_CONSTANT &&
!IPPSet(borderValue, pDst, (int)dst_step, dstRoiSize, src.channels(), src.depth()))
{
status = ippiWarpPerspectiveLinearInit(srcsize, srcroi, dstsize, ippiGetDataType(src_type), coeffs, direction, numChannels, ippiGetBorderType(borderType),
borderValue, 0, pSpec);
} else
{
status = ippiWarpPerspectiveNearestInit(srcsize, srcroi, dstsize, ippiGetDataType(src_type), coeffs, direction, numChannels, ippiGetBorderType(borderType),
borderValue, 0, pSpec);
}
status = ippiWarpGetBufferSize(pSpec, dstsize, &bufSize);
pBuffer = (Ipp8u*)ippMalloc_L(bufSize);
IppiSize dstRoiSize = dstsize;
int cnn = src.channels();
if( borderType == cv::BorderTypes::BORDER_CONSTANT )
{
IppiSize setSize = {dst.cols, range.end - range.start};
void *dataPointer = dst.ptr(range.start);
if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
{
*ok = false;
ippsFree(pBuffer);
ippsFree(pSpec);
return;
}
}
status = CV_INSTRUMENT_FUN_IPP(func, src.ptr(), (int)src_step, dst.ptr(), (int)dst_step, dstRoiOffset, dstRoiSize, pSpec, pBuffer);
if (status != ippStsNoErr)
*ok = false;
else
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
return;
}
ippsFree(pBuffer);
ippsFree(pSpec);
if (ippStsNoErr != CV_INSTRUMENT_FUN_IPP(func, src.ptr(), (int)src_step, pDst, (int)dst_step, dstRoiOffset, dstRoiSize, mem.pSpec, mem.pBuffer))
{
*ok = false;
return;
}
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
}
private:
int src_type;
@ -279,8 +316,8 @@ private:
int borderType;
double borderValue[4];
ippiWarpPerspectiveFunc func;
ippiWarpPerspectiveInitFunc initFunc;
bool *ok;
const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
};
@ -288,87 +325,114 @@ int ipp_hal_warpPerspective(int src_type, const uchar *src_data, size_t src_step
int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4])
{
//CV_INSTRUMENT_REGION_IPP();
ippiWarpPerspectiveFunc ippFunc = 0;
if (src_height <= 1 || src_width <= 1)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
ippiWarpPerspectiveFunc ippFunc = nullptr;
ippiWarpPerspectiveInitFunc ippInitFunc = nullptr;
if (interpolation == cv::InterpolationFlags::INTER_NEAREST)
{
ippFunc = src_type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_8u_C1R :
src_type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_8u_C3R :
src_type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_8u_C4R :
src_type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16u_C1R :
src_type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16u_C3R :
src_type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16u_C4R :
src_type == CV_16SC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16s_C1R :
src_type == CV_16SC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16s_C3R :
src_type == CV_16SC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16s_C4R :
src_type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_32f_C1R :
src_type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_32f_C3R :
src_type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_32f_C4R : 0;
ippInitFunc = ippiWarpPerspectiveNearestInit;
ippFunc =
src_type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_8u_C1R :
src_type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_8u_C3R :
src_type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_8u_C4R :
src_type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16u_C1R :
src_type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16u_C3R :
src_type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16u_C4R :
src_type == CV_16SC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16s_C1R :
src_type == CV_16SC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16s_C3R :
src_type == CV_16SC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_16s_C4R :
src_type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_32f_C1R :
src_type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_32f_C3R :
src_type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveNearest_32f_C4R : nullptr;
}
else if (interpolation == cv::InterpolationFlags::INTER_LINEAR)
{
ippFunc = src_type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_8u_C1R :
src_type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_8u_C3R :
src_type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_8u_C4R :
src_type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16u_C1R :
src_type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16u_C3R :
src_type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16u_C4R :
src_type == CV_16SC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16s_C1R :
src_type == CV_16SC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16s_C3R :
src_type == CV_16SC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16s_C4R :
src_type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_32f_C1R :
src_type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_32f_C3R :
src_type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_32f_C4R : 0;
ippInitFunc = ippiWarpPerspectiveLinearInit;
ippFunc =
src_type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_8u_C1R :
src_type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_8u_C3R :
src_type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_8u_C4R :
src_type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16u_C1R :
src_type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16u_C3R :
src_type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16u_C4R :
src_type == CV_16SC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16s_C1R :
src_type == CV_16SC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16s_C3R :
src_type == CV_16SC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_16s_C4R :
src_type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_32f_C1R :
src_type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_32f_C3R :
src_type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveLinear_32f_C4R : nullptr;
}
else
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
if(src_height == 1 || src_width == 1) return CV_HAL_ERROR_NOT_IMPLEMENTED;
int mode =
interpolation == cv::InterpolationFlags::INTER_NEAREST ? IPPI_INTER_NN :
interpolation == cv::InterpolationFlags::INTER_LINEAR ? IPPI_INTER_LINEAR : 0;
if (mode == 0 || ippFunc == 0)
if (ippFunc == nullptr)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
#if defined(IPP_CALLS_ENFORCED)
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][2]={{{0, 0}, {1, 1}, {0, 0}, {0, 0}}, //8U
{{1, 1}, {1, 1}, {1, 1}, {1, 1}}, //8S
{{0, 0}, {1, 1}, {0, 1}, {0, 1}}, //16U
{{1, 1}, {1, 1}, {1, 1}, {1, 1}}, //16S
{{1, 1}, {1, 1}, {1, 0}, {1, 1}}, //32S
{{1, 0}, {1, 0}, {0, 0}, {1, 0}}, //32F
{{1, 1}, {1, 1}, {1, 1}, {1, 1}}}; //64F
char impl[CV_DEPTH_MAX][4][2]={{{1, 1}, {0, 0}, {1, 1}, {1, 1}}, //8U
{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //8S
{{1, 1}, {0, 0}, {1, 1}, {1, 1}}, //16U
{{1, 1}, {0, 0}, {1, 1}, {1, 1}}, //16S
{{1, 1}, {0, 0}, {1, 1}, {1, 1}}, //32S
{{1, 1}, {0, 0}, {1, 1}, {1, 1}}, //32F
{{0, 0}, {0, 0}, {0, 0}, {0, 0}}}; //64F
#else // IPP_CALLS_ENFORCED is not defined, results are strictly aligned to OpenCV implementation
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][2]={{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //8U
{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //8S
{{0, 0}, {0, 0}, {0, 1}, {0, 1}}, //16U
{{1, 1}, {0, 0}, {1, 1}, {1, 1}}, //16S
{{1, 1}, {0, 0}, {1, 0}, {1, 1}}, //32S
{{1, 0}, {0, 0}, {0, 0}, {1, 0}}, //32F
{{0, 0}, {0, 0}, {0, 0}, {0, 0}}}; //64F
#endif
const char type_size[CV_DEPTH_MAX] = {1,1,2,2,4,4,8};
if(impl[CV_TYPE(src_type)][CV_MAT_CN(src_type)-1][interpolation] == 0)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
double coeffs[3][3];
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 3; j++ )
coeffs[i][j] = M[i*3 + j];
bool ok;
bool ok = true;
cv::Range range(0, dst_height);
cv::Mat src(cv::Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
cv::Mat dst(cv::Size(dst_width, dst_height), src_type, dst_data, dst_step);
IppiInterpolationType ippInter = ippiGetInterpolation(interpolation);
IPPWarpPerspectiveInvoker invoker(src_type, src, src_step, dst, dst_step, ippInter, coeffs, borderType, borderValue, ippFunc, &ok);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
IppiInterpolationType ippInter = ippiGetInterpolation(interpolation);
if( ok )
int min_payload = 1 << 16; // 64KB shall be minimal per thread to maximize scalability for warping functions
int num_threads = ippiSuggestRowThreadsNum(dst_width, dst_height, type_size[CV_TYPE(src_type)]*CV_MAT_CN(src_type), min_payload);
IPPWarpPerspectiveInvoker invoker(src_type, src, src_step, dst, dst_step, ippInter, coeffs, borderType, borderValue, ippFunc, ippInitFunc, &ok);
(num_threads > 1) ? parallel_for_(range, invoker, num_threads) : invoker(range);
if (ok)
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
CV_IMPL_ADD(CV_IMPL_IPP | CV_IMPL_MT);
return CV_HAL_ERROR_OK;
}
return CV_HAL_ERROR_OK;
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
#endif
// End of Warp perspective section
// Remap section
typedef IppStatus(CV_STDCALL *ippiRemap)(const void *pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
const Ipp32f *pxMap, int xMapStep, const Ipp32f *pyMap, int yMapStep,
@ -391,6 +455,10 @@ public:
virtual void operator()(const cv::Range &range) const
{
//CV_INSTRUMENT_REGION_IPP();
if (*ok == false)
return;
IppiRect srcRoiRect = {0, 0, src_width, src_height};
uchar *dst_roi_data = dst + range.start * dst_step;
IppiSize dstRoiSize = ippiSize(dst_width, range.size());
@ -403,14 +471,15 @@ public:
return;
}
if (CV_INSTRUMENT_FUN_IPP(ippFunc, src, {src_width, src_height}, (int)src_step, srcRoiRect,
if (ippStsNoErr != CV_INSTRUMENT_FUN_IPP(ippFunc, src, {src_width, src_height}, (int)src_step, srcRoiRect,
mapx, (int)mapx_step, mapy, (int)mapy_step,
dst_roi_data, (int)dst_step, dstRoiSize, ippInterpolation) < 0)
*ok = false;
else
dst_roi_data, (int)dst_step, dstRoiSize, ippInterpolation))
{
CV_IMPL_ADD(CV_IMPL_IPP | CV_IMPL_MT);
*ok = false;
return;
}
CV_IMPL_ADD(CV_IMPL_IPP | CV_IMPL_MT);
}
private:
@ -436,53 +505,74 @@ int ipp_hal_remap32f(int src_type, const uchar *src_data, size_t src_step, int s
float *mapx, size_t mapx_step, float *mapy, size_t mapy_step,
int interpolation, int border_type, const double border_value[4])
{
if ((interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC || interpolation == cv::INTER_NEAREST) &&
(border_type == cv::BORDER_CONSTANT || border_type == cv::BORDER_TRANSPARENT))
if (!((interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC || interpolation == cv::INTER_NEAREST) &&
(border_type == cv::BORDER_CONSTANT || border_type == cv::BORDER_TRANSPARENT)))
{
int ippInterpolation =
interpolation == cv::INTER_NEAREST ? IPPI_INTER_NN : interpolation == cv::INTER_LINEAR ? IPPI_INTER_LINEAR
: IPPI_INTER_CUBIC;
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][3]={{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //16U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //16S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32F
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}; //64F
int ippInterpolation = ippiGetInterpolation(interpolation);
if (impl[CV_TYPE(src_type)][CV_MAT_CN(src_type) - 1][interpolation] == 0)
#if defined(IPP_CALLS_ENFORCED)
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][3] = {{{1, 1, 1}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, //8U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8S
{{1, 1, 1}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, //16U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //16S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32S
{{1, 1, 1}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, //32F
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}; //64F
#else // IPP_CALLS_ENFORCED is not defined, results are strictly aligned to OpenCV implementation
/* C1 C2 C3 C4 */
char impl[CV_DEPTH_MAX][4][3] = {{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //8S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //16U
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //16S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32S
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, //32F
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}; //64F
#endif
const char type_size[CV_DEPTH_MAX] = {1,1,2,2,4,4,8};
if (impl[CV_TYPE(src_type)][CV_MAT_CN(src_type) - 1][interpolation] == 0)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
ippiRemap ippFunc =
src_type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
src_type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
src_type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
src_type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
src_type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
src_type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
src_type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
src_type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
src_type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
if (ippFunc)
{
bool ok = true;
IPPRemapInvoker invoker(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width,
mapx, mapx_step, mapy, mapy_step, ippFunc, ippInterpolation, border_type, border_value, &ok);
cv::Range range(0, dst_height);
int min_payload = 1 << 16; // 64KB shall be minimal per thread to maximize scalability for warping functions
int num_threads = ippiSuggestRowThreadsNum(dst_width, dst_height, type_size[CV_TYPE(src_type)]*CV_MAT_CN(src_type), min_payload);
cv::parallel_for_(range, invoker, num_threads);
if (ok)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
ippiRemap ippFunc =
src_type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R : src_type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R
: src_type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R
: src_type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R
: src_type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R
: src_type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R
: src_type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R
: src_type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R
: src_type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R
: 0;
if (ippFunc)
{
bool ok;
IPPRemapInvoker invoker(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width,
mapx, mapx_step, mapy, mapy_step, ippFunc, ippInterpolation, border_type, border_value, &ok);
cv::Range range(0, dst_height);
cv::parallel_for_(range, invoker, dst_width * dst_height / (double)(1 << 16));
if (ok)
{
CV_IMPL_ADD(CV_IMPL_IPP | CV_IMPL_MT);
return CV_HAL_ERROR_OK;
}
CV_IMPL_ADD(CV_IMPL_IPP | CV_IMPL_MT);
return CV_HAL_ERROR_OK;
}
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
// End of Remap section
#endif // IPP_VERSION_X100 >= 810

View File

@ -1,3 +1,11 @@
@inproceedings{ding2023revisiting,
title={Revisiting the P3P Problem},
author={Ding, Yaqing and Yang, Jian and Larsson, Viktor and Olsson, Carl and {\AA}str{\"o}m, Kalle},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={4872--4880},
year={2023},
url={https://openaccess.thecvf.com/content/CVPR2023/papers/Ding_Revisiting_the_P3P_Problem_CVPR_2023_paper.pdf}
}
@article{lepetit2009epnp,
title={Epnp: An accurate o (n) solution to the pnp problem},
author={Lepetit, Vincent and Moreno-Noguer, Francesc and Fua, Pascal},
@ -6,18 +14,8 @@
number={2},
pages={155--166},
year={2009},
publisher={Springer}
}
@article{gao2003complete,
title={Complete solution classification for the perspective-three-point problem},
author={Gao, Xiao-Shan and Hou, Xiao-Rong and Tang, Jianliang and Cheng, Hang-Fei},
journal={Pattern Analysis and Machine Intelligence, IEEE Transactions on},
volume={25},
number={8},
pages={930--943},
year={2003},
publisher={IEEE}
publisher={Springer},
url={https://www.tugraz.at/fileadmin/user_upload/Institute/ICG/Images/team_lepetit/publications/lepetit_ijcv08.pdf}
}
@inproceedings{hesch2011direct,
@ -26,7 +24,8 @@
booktitle={Computer Vision (ICCV), 2011 IEEE International Conference on},
pages={383--390},
year={2011},
organization={IEEE}
organization={IEEE},
url={https://www-users.cse.umn.edu/~stergios/papers/ICCV-11-DLS-PnP.pdf}
}
@article{penate2013exhaustive,
@ -37,16 +36,8 @@
number={10},
pages={2387--2400},
year={2013},
publisher={IEEE}
}
@inproceedings{Terzakis2020SQPnP,
title={A Consistently Fast and Globally Optimal Solution to the Perspective-n-Point Problem},
author={George Terzakis and Manolis Lourakis},
booktitle={European Conference on Computer Vision},
pages={478--494},
year={2020},
publisher={Springer International Publishing}
publisher={IEEE},
url={https://www.researchgate.net/publication/235402233_Exhaustive_Linearization_for_Robust_Camera_Pose_and_Focal_Length_Estimation}
}
@inproceedings{strobl2011iccv,
@ -61,3 +52,13 @@
url={https://elib.dlr.de/71888/1/strobl_2011iccv.pdf},
doi={10.1109/ICCVW.2011.6130369}
}
@inproceedings{Terzakis2020SQPnP,
title={A Consistently Fast and Globally Optimal Solution to the Perspective-n-Point Problem},
author={George Terzakis and Manolis Lourakis},
booktitle={European Conference on Computer Vision},
pages={478--494},
year={2020},
publisher={Springer International Publishing},
url={https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123460460.pdf}
}

View File

@ -104,8 +104,8 @@ this case the function finds such a pose that minimizes reprojection error, that
of squared distances between the observed projections "imagePoints" and the projected (using
cv::projectPoints ) "objectPoints". Initial solution for non-planar "objectPoints" needs at least 6 points and uses the DLT algorithm.
Initial solution for planar "objectPoints" needs at least 4 points and uses pose from homography decomposition.
- cv::SOLVEPNP_P3P Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang
"Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete).
- cv::SOLVEPNP_P3P Method is based on the paper of Ding, Y., Yang, J., Larsson, V., Olsson, C., & Åström, K.
"Revisiting the P3P Problem" (@cite ding2023revisiting).
In this case the function requires exactly four object and image points.
- cv::SOLVEPNP_AP3P Method is based on the paper of T. Ke, S. Roumeliotis
"An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).

View File

@ -565,7 +565,7 @@ enum SolvePnPMethod {
//!< Initial solution for non-planar "objectPoints" needs at least 6 points and uses the DLT algorithm. \n
//!< Initial solution for planar "objectPoints" needs at least 4 points and uses pose from homography decomposition.
SOLVEPNP_EPNP = 1, //!< EPnP: Efficient Perspective-n-Point Camera Pose Estimation @cite lepetit2009epnp
SOLVEPNP_P3P = 2, //!< Complete Solution Classification for the Perspective-Three-Point Problem @cite gao2003complete
SOLVEPNP_P3P = 2, //!< Revisiting the P3P Problem @cite ding2023revisiting
SOLVEPNP_DLS = 3, //!< **Broken implementation. Using this flag will fallback to EPnP.** \n
//!< A Direct Least-Squares (DLS) Method for PnP @cite hesch2011direct
SOLVEPNP_UPNP = 4, //!< **Broken implementation. Using this flag will fallback to EPnP.** \n
@ -1154,8 +1154,8 @@ assumed.
the model coordinate system to the camera coordinate system. A P3P problem has up to 4 solutions.
@param tvecs Output translation vectors.
@param flags Method for solving a P3P problem:
- @ref SOLVEPNP_P3P Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang
"Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete).
- @ref SOLVEPNP_P3P Method is based on the paper of Ding, Y., Yang, J., Larsson, V., Olsson, C., & Åström, K.
"Revisiting the P3P Problem" (@cite ding2023revisiting).
- @ref SOLVEPNP_AP3P Method is based on the paper of T. Ke and S. Roumeliotis.
"An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).

View File

@ -755,6 +755,9 @@ static double stereoCalibrateImpl(
if(flags & CALIB_USE_LU) {
solver.solveMethod = DECOMP_LU;
}
else if(flags & CALIB_USE_QR) {
solver.solveMethod = DECOMP_QR;
}
if( recomputeIntrinsics )
{
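With this change, a caller can select the QR-based solver through the existing CALIB_USE_QR flag; a hedged call sketch (the helper name and data flow are illustrative, not from the source):
#include <opencv2/calib3d.hpp>
#include <vector>

// point lists are assumed to come from a detector such as findChessboardCorners,
// intrinsics from prior monocular calibration
static double stereoExtrinsicsWithQR(const std::vector<std::vector<cv::Point3f> >& objectPoints,
                                     const std::vector<std::vector<cv::Point2f> >& imagePoints1,
                                     const std::vector<std::vector<cv::Point2f> >& imagePoints2,
                                     cv::Mat K1, cv::Mat D1, cv::Mat K2, cv::Mat D2,
                                     cv::Size imageSize)
{
    cv::Mat R, T, E, F;
    return cv::stereoCalibrate(objectPoints, imagePoints1, imagePoints2,
                               K1, D1, K2, D2, imageSize, R, T, E, F,
                               cv::CALIB_FIX_INTRINSIC | cv::CALIB_USE_QR);
}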

View File

@ -429,8 +429,15 @@ void cv::fisheye::undistortPoints( InputArray distorted, OutputArray undistorted
for(size_t i = 0; i < n; i++ )
{
Vec2d pi = sdepth == CV_32F ? (Vec2d)srcf[i] : srcd[i]; // image point
// u = fx * x' + cx (alpha = 0), v = fy * y' + cy =>
// x' = (u - cx) / fx, y' = (v - cy) / fy
Vec2d pw((pi[0] - c[0])/f[0], (pi[1] - c[1])/f[1]); // world point
// x' = (theta_d / r) * a, y' = (theta_d / r) * b =>
// x'^2 + y'^2 = theta_d^2 * (a^2 + b^2) / r^2 =>
// (r^2 = a^2 + b^2)
// x'^2 + y'^2 = theta_d^2 =>
// theta_d = sqrt(x'^2 + y'^2)
double theta_d = sqrt(pw[0]*pw[0] + pw[1]*pw[1]);
// the current camera model is only valid up to a 180° FOV
@ -449,9 +456,14 @@ void cv::fisheye::undistortPoints( InputArray distorted, OutputArray undistorted
for (int j = 0; j < maxCount; j++)
{
// theta_d = theta * (1 + k1 * theta^2 + k2 * theta^4 + k3 * theta^6 + k4 * theta^8) =>
// f(theta) := theta * (1 + k1 * theta^2 + k2 * theta^4 + k3 * theta^6 + k4 * theta^8) - theta_d = 0
// Newton's method: new_theta = theta - theta_fix, theta_fix := f(theta) / f'(theta)
// f'(theta) = (theta * (1 + k1 * theta^2 + k2 * theta^4 + k3 * theta^6 + k4 * theta^8) - theta_d)' =
// (theta + k1 * theta^3 + k2 * theta^5 + k3 * theta^7 + k4 * theta^9 - theta_d)' =
// 1 + 3 * k1 * theta^2 + 5 * k2 * theta^4 + 7 * k3 * theta^6 + 9 * k4 * theta^8
double theta2 = theta*theta, theta4 = theta2*theta2, theta6 = theta4*theta2, theta8 = theta6*theta2;
double k0_theta2 = k[0] * theta2, k1_theta4 = k[1] * theta4, k2_theta6 = k[2] * theta6, k3_theta8 = k[3] * theta8;
/* new_theta = theta - theta_fix, theta_fix = f0(theta) / f0'(theta) */
double theta_fix = (theta * (1 + k0_theta2 + k1_theta4 + k2_theta6 + k3_theta8) - theta_d) /
(1 + 3*k0_theta2 + 5*k1_theta4 + 7*k2_theta6 + 9*k3_theta8);
theta = theta - theta_fix;
@ -463,6 +475,10 @@ void cv::fisheye::undistortPoints( InputArray distorted, OutputArray undistorted
}
}
// x' = (theta_d / r) * a, y' = (theta_d / r) * b =>
// a = x' * r / theta_d, b = y' * r / theta_d =>
// (theta = atan(r) => r = tan(theta), scale := r / theta_d = tan(theta) / theta_d)
// a = x' * scale, b = y' * scale
scale = std::tan(theta) / theta_d;
}
else
@ -477,6 +493,7 @@ void cv::fisheye::undistortPoints( InputArray distorted, OutputArray undistorted
if ((converged || !isEps) && !theta_flipped)
{
// a = x' * scale, b = y' * scale
Vec2d pu = pw * scale; //undistorted point
Vec2d fi;
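The Newton update documented in the comments above can be read as a standalone routine; a minimal sketch (the helper name and iteration budget are illustrative, not OpenCV API):
#include <cmath>

static double solveThetaNewton(double theta_d, const double k[4],
                               int maxIter = 10, double eps = 1e-8)
{
    double theta = theta_d; // theta_d is a reasonable initial guess for mild distortion
    for (int i = 0; i < maxIter; i++)
    {
        double t2 = theta*theta, t4 = t2*t2, t6 = t4*t2, t8 = t6*t2;
        double f  = theta * (1 + k[0]*t2 + k[1]*t4 + k[2]*t6 + k[3]*t8) - theta_d;
        double df = 1 + 3*k[0]*t2 + 5*k[1]*t4 + 7*k[2]*t6 + 9*k[3]*t8;
        double step = f / df;  // theta_fix = f(theta) / f'(theta)
        theta -= step;
        if (std::fabs(step) < eps)
            break;
    }
    return theta;
}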

View File

@ -2,465 +2,386 @@
#include <cmath>
#include <iostream>
#include "polynom_solver.h"
#include "p3p.h"
void p3p::init_inverse_parameters()
using namespace cv;
// Copyright (c) 2020, Viktor Larsson
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * Neither the name of the copyright holder nor the
// names of its contributors may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Author: Yaqing Ding
// Mark Shachkov
// https://github.com/PoseLib/PoseLib/blob/79fe59ada3122c50383ac06e043a5e04072c6711/PoseLib/solvers/p3p.cc
namespace yaqding
{
inv_fx = 1. / fx;
inv_fy = 1. / fy;
cx_fx = cx / fx;
cy_fy = cy / fy;
}
p3p::p3p(cv::Mat cameraMatrix)
{
if (cameraMatrix.depth() == CV_32F)
init_camera_parameters<float>(cameraMatrix);
else
init_camera_parameters<double>(cameraMatrix);
init_inverse_parameters();
}
p3p::p3p(double _fx, double _fy, double _cx, double _cy)
{
fx = _fx;
fy = _fy;
cx = _cx;
cy = _cy;
init_inverse_parameters();
}
bool p3p::solve(cv::Mat& R, cv::Mat& tvec, const cv::Mat& opoints, const cv::Mat& ipoints)
{
CV_INSTRUMENT_REGION();
double rotation_matrix[3][3] = {}, translation[3] = {};
std::vector<double> points;
if (opoints.depth() == ipoints.depth())
{
if (opoints.depth() == CV_32F)
extract_points<cv::Point3f,cv::Point2f>(opoints, ipoints, points);
else
extract_points<cv::Point3d,cv::Point2d>(opoints, ipoints, points);
}
else if (opoints.depth() == CV_32F)
extract_points<cv::Point3f,cv::Point2d>(opoints, ipoints, points);
else
extract_points<cv::Point3d,cv::Point2f>(opoints, ipoints, points);
bool result = solve(rotation_matrix, translation,
points[0], points[1], points[2], points[3], points[4],
points[5], points[6], points[7], points[8], points[9],
points[10], points[11], points[12], points[13], points[14],
points[15], points[16], points[17], points[18], points[19]);
cv::Mat(3, 1, CV_64F, translation).copyTo(tvec);
cv::Mat(3, 3, CV_64F, rotation_matrix).copyTo(R);
return result;
}
int p3p::solve(std::vector<cv::Mat>& Rs, std::vector<cv::Mat>& tvecs, const cv::Mat& opoints, const cv::Mat& ipoints)
{
CV_INSTRUMENT_REGION();
double rotation_matrix[4][3][3] = {}, translation[4][3] = {};
std::vector<double> points;
if (opoints.depth() == ipoints.depth())
{
if (opoints.depth() == CV_32F)
extract_points<cv::Point3f,cv::Point2f>(opoints, ipoints, points);
else
extract_points<cv::Point3d,cv::Point2d>(opoints, ipoints, points);
}
else if (opoints.depth() == CV_32F)
extract_points<cv::Point3f,cv::Point2d>(opoints, ipoints, points);
else
extract_points<cv::Point3d,cv::Point2f>(opoints, ipoints, points);
const bool p4p = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F)) == 4;
int solutions = solve(rotation_matrix, translation,
points[0], points[1], points[2], points[3], points[4],
points[5], points[6], points[7], points[8], points[9],
points[10], points[11], points[12], points[13], points[14],
points[15], points[16], points[17], points[18], points[19],
p4p);
for (int i = 0; i < solutions; i++) {
cv::Mat R, tvec;
cv::Mat(3, 1, CV_64F, translation[i]).copyTo(tvec);
cv::Mat(3, 3, CV_64F, rotation_matrix[i]).copyTo(R);
Rs.push_back(R);
tvecs.push_back(tvec);
}
return solutions;
}
bool p3p::solve(double R[3][3], double t[3],
double mu0, double mv0, double X0, double Y0, double Z0,
double mu1, double mv1, double X1, double Y1, double Z1,
double mu2, double mv2, double X2, double Y2, double Z2,
double mu3, double mv3, double X3, double Y3, double Z3)
{
double Rs[4][3][3] = {}, ts[4][3] = {};
const bool p4p = true;
int n = solve(Rs, ts, mu0, mv0, X0, Y0, Z0, mu1, mv1, X1, Y1, Z1, mu2, mv2, X2, Y2, Z2, mu3, mv3, X3, Y3, Z3, p4p);
if (n == 0)
return false;
for(int i = 0; i < 3; i++) {
for(int j = 0; j < 3; j++)
R[i][j] = Rs[0][i][j];
t[i] = ts[0][i];
}
return true;
}
int p3p::solve(double R[4][3][3], double t[4][3],
double mu0, double mv0, double X0, double Y0, double Z0,
double mu1, double mv1, double X1, double Y1, double Z1,
double mu2, double mv2, double X2, double Y2, double Z2,
double mu3, double mv3, double X3, double Y3, double Z3,
bool p4p)
{
double mk0, mk1, mk2;
double norm;
mu0 = inv_fx * mu0 - cx_fx;
mv0 = inv_fy * mv0 - cy_fy;
norm = sqrt(mu0 * mu0 + mv0 * mv0 + 1);
mk0 = 1. / norm; mu0 *= mk0; mv0 *= mk0;
mu1 = inv_fx * mu1 - cx_fx;
mv1 = inv_fy * mv1 - cy_fy;
norm = sqrt(mu1 * mu1 + mv1 * mv1 + 1);
mk1 = 1. / norm; mu1 *= mk1; mv1 *= mk1;
mu2 = inv_fx * mu2 - cx_fx;
mv2 = inv_fy * mv2 - cy_fy;
norm = sqrt(mu2 * mu2 + mv2 * mv2 + 1);
mk2 = 1. / norm; mu2 *= mk2; mv2 *= mk2;
mu3 = inv_fx * mu3 - cx_fx;
mv3 = inv_fy * mv3 - cy_fy;
double distances[3];
distances[0] = sqrt( (X1 - X2) * (X1 - X2) + (Y1 - Y2) * (Y1 - Y2) + (Z1 - Z2) * (Z1 - Z2) );
distances[1] = sqrt( (X0 - X2) * (X0 - X2) + (Y0 - Y2) * (Y0 - Y2) + (Z0 - Z2) * (Z0 - Z2) );
distances[2] = sqrt( (X0 - X1) * (X0 - X1) + (Y0 - Y1) * (Y0 - Y1) + (Z0 - Z1) * (Z0 - Z1) );
// Calculate angles
double cosines[3];
cosines[0] = mu1 * mu2 + mv1 * mv2 + mk1 * mk2;
cosines[1] = mu0 * mu2 + mv0 * mv2 + mk0 * mk2;
cosines[2] = mu0 * mu1 + mv0 * mv1 + mk0 * mk1;
double lengths[4][3] = {};
int n = solve_for_lengths(lengths, distances, cosines);
int nb_solutions = 0;
double reproj_errors[4];
for(int i = 0; i < n; i++) {
double M_orig[3][3];
M_orig[0][0] = lengths[i][0] * mu0;
M_orig[0][1] = lengths[i][0] * mv0;
M_orig[0][2] = lengths[i][0] * mk0;
M_orig[1][0] = lengths[i][1] * mu1;
M_orig[1][1] = lengths[i][1] * mv1;
M_orig[1][2] = lengths[i][1] * mk1;
M_orig[2][0] = lengths[i][2] * mu2;
M_orig[2][1] = lengths[i][2] * mv2;
M_orig[2][2] = lengths[i][2] * mk2;
if (!align(M_orig, X0, Y0, Z0, X1, Y1, Z1, X2, Y2, Z2, R[nb_solutions], t[nb_solutions]))
continue;
if (p4p) {
double X3p = R[nb_solutions][0][0] * X3 + R[nb_solutions][0][1] * Y3 + R[nb_solutions][0][2] * Z3 + t[nb_solutions][0];
double Y3p = R[nb_solutions][1][0] * X3 + R[nb_solutions][1][1] * Y3 + R[nb_solutions][1][2] * Z3 + t[nb_solutions][1];
double Z3p = R[nb_solutions][2][0] * X3 + R[nb_solutions][2][1] * Y3 + R[nb_solutions][2][2] * Z3 + t[nb_solutions][2];
double mu3p = X3p / Z3p;
double mv3p = Y3p / Z3p;
reproj_errors[nb_solutions] = (mu3p - mu3) * (mu3p - mu3) + (mv3p - mv3) * (mv3p - mv3);
}
nb_solutions++;
}
if (p4p) {
//sort the solutions
for (int i = 1; i < nb_solutions; i++) {
for (int j = i; j > 0 && reproj_errors[j-1] > reproj_errors[j]; j--) {
std::swap(reproj_errors[j], reproj_errors[j-1]);
std::swap(R[j], R[j-1]);
std::swap(t[j], t[j-1]);
}
}
}
return nb_solutions;
}
/// Given 3D distances between three points and cosines of 3 angles at the apex, calculates
/// the lengths of the line segments connecting projection center (P) and the three 3D points (A, B, C).
/// Returned distances are for |PA|, |PB|, |PC| respectively.
/// Only solutions from the main branch are returned.
/// Reference : X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
/// IEEE Trans. on PAMI, vol. 25, No. 8, August 2003
/// \param lengths Lengths of line segments up to four solutions.
/// \param distances Distance between 3D points in pairs |BC|, |AC|, |AB|.
/// \param cosines Cosine of the angles /_BPC, /_APC, /_APB.
/// \returns Number of solutions.
/// WARNING: NOT ALL THE DEGENERATE CASES ARE IMPLEMENTED
int p3p::solve_for_lengths(double lengths[4][3], double distances[3], double cosines[3])
{
double p = cosines[0] * 2;
double q = cosines[1] * 2;
double r = cosines[2] * 2;
double inv_d22 = 1. / (distances[2] * distances[2]);
double a = inv_d22 * (distances[0] * distances[0]);
double b = inv_d22 * (distances[1] * distances[1]);
double a2 = a * a, b2 = b * b, p2 = p * p, q2 = q * q, r2 = r * r;
double pr = p * r, pqr = q * pr;
// Check reality condition (the four points should not be coplanar)
if (p2 + q2 + r2 - pqr - 1 == 0)
return 0;
double ab = a * b, a_2 = 2*a;
double A = -2 * b + b2 + a2 + 1 + ab*(2 - r2) - a_2;
// Check reality condition
if (A == 0) return 0;
double a_4 = 4*a;
double B = q*(-2*(ab + a2 + 1 - b) + r2*ab + a_4) + pr*(b - b2 + ab);
double C = q2 + b2*(r2 + p2 - 2) - b*(p2 + pqr) - ab*(r2 + pqr) + (a2 - a_2)*(2 + q2) + 2;
double D = pr*(ab-b2+b) + q*((p2-2)*b + 2 * (ab - a2) + a_4 - 2);
double E = 1 + 2*(b - a - ab) + b2 - b*p2 + a2;
double temp = (p2*(a-1+b) + r2*(a-1-b) + pqr - a*pqr);
double b0 = b * temp * temp;
// Check reality condition
if (b0 == 0)
return 0;
double real_roots[4];
int n = solve_deg4(A, B, C, D, E, real_roots[0], real_roots[1], real_roots[2], real_roots[3]);
if (n == 0)
return 0;
int nb_solutions = 0;
double r3 = r2*r, pr2 = p*r2, r3q = r3 * q;
double inv_b0 = 1. / b0;
// For each solution of x
for(int i = 0; i < n; i++) {
double x = real_roots[i];
// Check reality condition
if (x <= 0)
continue;
double x2 = x*x;
double b1 =
((1-a-b)*x2 + (q*a-q)*x + 1 - a + b) *
(((r3*(a2 + ab*(2 - r2) - a_2 + b2 - 2*b + 1)) * x +
(r3q*(2*(b-a2) + a_4 + ab*(r2 - 2) - 2) + pr2*(1 + a2 + 2*(ab-a-b) + r2*(b - b2) + b2))) * x2 +
(r3*(q2*(1-2*a+a2) + r2*(b2-ab) - a_4 + 2*(a2 - b2) + 2) + r*p2*(b2 + 2*(ab - b - a) + 1 + a2) + pr2*q*(a_4 + 2*(b - ab - a2) - 2 - r2*b)) * x +
2*r3q*(a_2 - b - a2 + ab - 1) + pr2*(q2 - a_4 + 2*(a2 - b2) + r2*b + q2*(a2 - a_2) + 2) +
p2*(p*(2*(ab - a - b) + a2 + b2 + 1) + 2*q*r*(b + a_2 - a2 - ab - 1)));
// Check reality condition
if (b1 <= 0)
continue;
double y = inv_b0 * b1;
double v = x2 + y*y - x*y*r;
if (v <= 0)
continue;
double Z = distances[2] / sqrt(v);
double X = x * Z;
double Y = y * Z;
lengths[nb_solutions][0] = X;
lengths[nb_solutions][1] = Y;
lengths[nb_solutions][2] = Z;
nb_solutions++;
}
return nb_solutions;
}
bool p3p::align(double M_end[3][3],
double X0, double Y0, double Z0,
double X1, double Y1, double Z1,
double X2, double Y2, double Z2,
double R[3][3], double T[3])
{
// Centroids:
double C_start[3] = {}, C_end[3] = {};
for(int i = 0; i < 3; i++) C_end[i] = (M_end[0][i] + M_end[1][i] + M_end[2][i]) / 3;
C_start[0] = (X0 + X1 + X2) / 3;
C_start[1] = (Y0 + Y1 + Y2) / 3;
C_start[2] = (Z0 + Z1 + Z2) / 3;
// Covariance matrix s:
double s[3 * 3] = {};
for(int j = 0; j < 3; j++) {
s[0 * 3 + j] = (X0 * M_end[0][j] + X1 * M_end[1][j] + X2 * M_end[2][j]) / 3 - C_end[j] * C_start[0];
s[1 * 3 + j] = (Y0 * M_end[0][j] + Y1 * M_end[1][j] + Y2 * M_end[2][j]) / 3 - C_end[j] * C_start[1];
s[2 * 3 + j] = (Z0 * M_end[0][j] + Z1 * M_end[1][j] + Z2 * M_end[2][j]) / 3 - C_end[j] * C_start[2];
}
double Qs[16] = {}, evs[4] = {}, U[16] = {};
Qs[0 * 4 + 0] = s[0 * 3 + 0] + s[1 * 3 + 1] + s[2 * 3 + 2];
Qs[1 * 4 + 1] = s[0 * 3 + 0] - s[1 * 3 + 1] - s[2 * 3 + 2];
Qs[2 * 4 + 2] = s[1 * 3 + 1] - s[2 * 3 + 2] - s[0 * 3 + 0];
Qs[3 * 4 + 3] = s[2 * 3 + 2] - s[0 * 3 + 0] - s[1 * 3 + 1];
Qs[1 * 4 + 0] = Qs[0 * 4 + 1] = s[1 * 3 + 2] - s[2 * 3 + 1];
Qs[2 * 4 + 0] = Qs[0 * 4 + 2] = s[2 * 3 + 0] - s[0 * 3 + 2];
Qs[3 * 4 + 0] = Qs[0 * 4 + 3] = s[0 * 3 + 1] - s[1 * 3 + 0];
Qs[2 * 4 + 1] = Qs[1 * 4 + 2] = s[1 * 3 + 0] + s[0 * 3 + 1];
Qs[3 * 4 + 1] = Qs[1 * 4 + 3] = s[2 * 3 + 0] + s[0 * 3 + 2];
Qs[3 * 4 + 2] = Qs[2 * 4 + 3] = s[2 * 3 + 1] + s[1 * 3 + 2];
jacobi_4x4(Qs, evs, U);
// Looking for the largest eigen value:
int i_ev = 0;
double ev_max = evs[i_ev];
for(int i = 1; i < 4; i++)
if (evs[i] > ev_max)
ev_max = evs[i_ev = i];
// Quaternion:
double q[4];
for(int i = 0; i < 4; i++)
q[i] = U[i * 4 + i_ev];
double q02 = q[0] * q[0], q12 = q[1] * q[1], q22 = q[2] * q[2], q32 = q[3] * q[3];
double q0_1 = q[0] * q[1], q0_2 = q[0] * q[2], q0_3 = q[0] * q[3];
double q1_2 = q[1] * q[2], q1_3 = q[1] * q[3];
double q2_3 = q[2] * q[3];
R[0][0] = q02 + q12 - q22 - q32;
R[0][1] = 2. * (q1_2 - q0_3);
R[0][2] = 2. * (q1_3 + q0_2);
R[1][0] = 2. * (q1_2 + q0_3);
R[1][1] = q02 + q22 - q12 - q32;
R[1][2] = 2. * (q2_3 - q0_1);
R[2][0] = 2. * (q1_3 - q0_2);
R[2][1] = 2. * (q2_3 + q0_1);
R[2][2] = q02 + q32 - q12 - q22;
for(int i = 0; i < 3; i++)
T[i] = C_end[i] - (R[i][0] * C_start[0] + R[i][1] * C_start[1] + R[i][2] * C_start[2]);
return true;
}
bool p3p::jacobi_4x4(double * A, double * D, double * U)
{
double B[4] = {}, Z[4] = {};
double Id[16] = {1., 0., 0., 0.,
0., 1., 0., 0.,
0., 0., 1., 0.,
0., 0., 0., 1.};
memcpy(U, Id, 16 * sizeof(double));
B[0] = A[0]; B[1] = A[5]; B[2] = A[10]; B[3] = A[15];
memcpy(D, B, 4 * sizeof(double));
for(int iter = 0; iter < 50; iter++) {
double sum = fabs(A[1]) + fabs(A[2]) + fabs(A[3]) + fabs(A[6]) + fabs(A[7]) + fabs(A[11]);
if (sum == 0.0)
static bool solve_cubic_single_real(double c2, double c1, double c0, double &root) {
double a = c1 - c2 * c2 / 3.0;
double b = (2.0 * c2 * c2 * c2 - 9.0 * c2 * c1) / 27.0 + c0;
double c = b * b / 4.0 + a * a * a / 27.0;
if (c != 0) {
if (c > 0) {
c = std::sqrt(c);
b *= -0.5;
root = std::cbrt(b + c) + std::cbrt(b - c) - c2 / 3.0;
return true;
double tresh = (iter < 3) ? 0.2 * sum / 16. : 0.0;
for(int i = 0; i < 3; i++) {
double * pAij = A + 5 * i + 1;
for(int j = i + 1 ; j < 4; j++) {
double Aij = *pAij;
double eps_machine = 100.0 * fabs(Aij);
if ( iter > 3 && fabs(D[i]) + eps_machine == fabs(D[i]) && fabs(D[j]) + eps_machine == fabs(D[j]) )
*pAij = 0.0;
else if (fabs(Aij) > tresh) {
double hh = D[j] - D[i], t;
if (fabs(hh) + eps_machine == fabs(hh))
t = Aij / hh;
else {
double theta = 0.5 * hh / Aij;
t = 1.0 / (fabs(theta) + sqrt(1.0 + theta * theta));
if (theta < 0.0) t = -t;
}
hh = t * Aij;
Z[i] -= hh;
Z[j] += hh;
D[i] -= hh;
D[j] += hh;
*pAij = 0.0;
double c = 1.0 / sqrt(1 + t * t);
double s = t * c;
double tau = s / (1.0 + c);
for(int k = 0; k <= i - 1; k++) {
double g = A[k * 4 + i], h = A[k * 4 + j];
A[k * 4 + i] = g - s * (h + g * tau);
A[k * 4 + j] = h + s * (g - h * tau);
}
for(int k = i + 1; k <= j - 1; k++) {
double g = A[i * 4 + k], h = A[k * 4 + j];
A[i * 4 + k] = g - s * (h + g * tau);
A[k * 4 + j] = h + s * (g - h * tau);
}
for(int k = j + 1; k < 4; k++) {
double g = A[i * 4 + k], h = A[j * 4 + k];
A[i * 4 + k] = g - s * (h + g * tau);
A[j * 4 + k] = h + s * (g - h * tau);
}
for(int k = 0; k < 4; k++) {
double g = U[k * 4 + i], h = U[k * 4 + j];
U[k * 4 + i] = g - s * (h + g * tau);
U[k * 4 + j] = h + s * (g - h * tau);
}
}
pAij++;
}
} else {
c = 3.0 * b / (2.0 * a) * std::sqrt(-3.0 / a);
root = 2.0 * std::sqrt(-a / 3.0) * std::cos(std::acos(c) / 3.0) - c2 / 3.0;
}
for(int i = 0; i < 4; i++) B[i] += Z[i];
memcpy(D, B, 4 * sizeof(double));
memset(Z, 0, 4 * sizeof(double));
} else {
root = -c2 / 3.0 + (a != 0 ? (3.0 * b / a) : 0);
}
return false;
}
static bool root2real(double b, double c, double &r1, double &r2) {
const double THRESHOLD = -1.0e-12;
double v = b * b - 4.0 * c;
if (v < THRESHOLD) {
r1 = r2 = -0.5 * b;
return v >= 0;
}
if (v > THRESHOLD && v < 0.0) {
r1 = -0.5 * b;
r2 = -2;
return true;
}
double y = std::sqrt(v);
if (b < 0) {
r1 = 0.5 * (-b + y);
r2 = 0.5 * (-b - y);
} else {
r1 = 2.0 * c / (-b + y);
r2 = 2.0 * c / (-b - y);
}
return true;
}
static std::array<Vec3d, 2> compute_pq(Matx33d C) {
std::array<Vec3d, 2> pq;
Matx33d C_adj;
C_adj(0, 0) = C(1, 2) * C(2, 1) - C(1, 1) * C(2, 2);
C_adj(1, 1) = C(0, 2) * C(2, 0) - C(0, 0) * C(2, 2);
C_adj(2, 2) = C(0, 1) * C(1, 0) - C(0, 0) * C(1, 1);
C_adj(0, 1) = C(0, 1) * C(2, 2) - C(0, 2) * C(2, 1);
C_adj(0, 2) = C(0, 2) * C(1, 1) - C(0, 1) * C(1, 2);
C_adj(1, 0) = C_adj(0, 1);
C_adj(1, 2) = C(0, 0) * C(1, 2) - C(0, 2) * C(1, 0);
C_adj(2, 0) = C_adj(0, 2);
C_adj(2, 1) = C_adj(1, 2);
Matx31d v;
if (C_adj(0, 0) > C_adj(1, 1)) {
if (C_adj(0, 0) > C_adj(2, 2)) {
v = C_adj.col(0) / std::sqrt(C_adj(0, 0));
} else {
v = C_adj.col(2) / std::sqrt(C_adj(2, 2));
}
} else if (C_adj(1, 1) > C_adj(2, 2)) {
v = C_adj.col(1) / std::sqrt(C_adj(1, 1));
} else {
v = C_adj.col(2) / std::sqrt(C_adj(2, 2));
}
C(0, 1) -= v(2);
C(0, 2) += v(1);
C(1, 2) -= v(0);
C(1, 0) += v(2);
C(2, 0) -= v(1);
C(2, 1) += v(0);
pq[0](0) = C.col(0)(0);
pq[0](1) = C.col(0)(1);
pq[0](2) = C.col(0)(2);
pq[1](0) = C.row(0)(0);
pq[1](1) = C.row(0)(1);
pq[1](2) = C.row(0)(2);
return pq;
}
// Performs a few Newton steps on the equations
static void refine_lambda(double &lambda1, double &lambda2, double &lambda3, const double a12, const double a13,
const double a23, const double b12, const double b13, const double b23) {
for (int iter = 0; iter < 5; ++iter) {
double r1 = (lambda1 * lambda1 - 2.0 * lambda1 * lambda2 * b12 + lambda2 * lambda2 - a12);
double r2 = (lambda1 * lambda1 - 2.0 * lambda1 * lambda3 * b13 + lambda3 * lambda3 - a13);
double r3 = (lambda2 * lambda2 - 2.0 * lambda2 * lambda3 * b23 + lambda3 * lambda3 - a23);
if (std::abs(r1) + std::abs(r2) + std::abs(r3) < 1e-10)
return;
double x11 = lambda1 - lambda2 * b12;
double x12 = lambda2 - lambda1 * b12;
double x21 = lambda1 - lambda3 * b13;
double x23 = lambda3 - lambda1 * b13;
double x32 = lambda2 - lambda3 * b23;
double x33 = lambda3 - lambda2 * b23;
double detJ = 0.5 / (x11 * x23 * x32 + x12 * x21 * x33); // half minus inverse determinant
// This uses the closed form of the inverse for the jacobian.
// Due to the zero elements this actually becomes quite nice.
lambda1 += (-x23 * x32 * r1 - x12 * x33 * r2 + x12 * x23 * r3) * detJ;
lambda2 += (-x21 * x33 * r1 + x11 * x33 * r2 - x11 * x23 * r3) * detJ;
lambda3 += (x21 * x32 * r1 - x11 * x32 * r2 - x12 * x21 * r3) * detJ;
}
}
};
void p3p::calibrateAndNormalizePointsPnP(const Mat &opoints_, const Mat &ipoints_) {
auto convertPoints = [] (const Mat &points_input, Mat &points, int pt_dim) {
points_input.convertTo(points, CV_64F); // convert points to double precision
if (points.channels() > 1)
points = points.reshape(1, (int)points.total()); // convert point to have 1 channel
if (points.rows < points.cols)
transpose(points, points); // transpose so points will be in rows
CV_CheckGE(points.cols, pt_dim, "Invalid dimension of point");
if (points.cols != pt_dim) // in case when image points are 3D convert them to 2D
points = points.colRange(0, pt_dim);
};
Mat ipoints;
convertPoints(ipoints_, ipoints, 2);
for (int i = 0; i < ipoints.rows; i++) {
const double k_inv_u = ipoints.at<double>(i, 0);
const double k_inv_v = ipoints.at<double>(i, 1);
double x_norm = 1.0 / sqrt(k_inv_u*k_inv_u + k_inv_v*k_inv_v + 1);
x_copy[i](0) = k_inv_u * x_norm;
x_copy[i](1) = k_inv_v * x_norm;
x_copy[i](2) = x_norm;
}
Mat opoints;
convertPoints(opoints_, opoints, 3);
X_copy[0](0) = opoints.at<double>(0, 0);
X_copy[0](1) = opoints.at<double>(0, 1);
X_copy[0](2) = opoints.at<double>(0, 2);
X_copy[1](0) = opoints.at<double>(1, 0);
X_copy[1](1) = opoints.at<double>(1, 1);
X_copy[1](2) = opoints.at<double>(1, 2);
X_copy[2](0) = opoints.at<double>(2, 0);
X_copy[2](1) = opoints.at<double>(2, 1);
X_copy[2](2) = opoints.at<double>(2, 2);
}
p3p::p3p() :
x_copy(), X_copy()
{
}
int p3p::estimate(std::vector<Mat>& Rs, std::vector<Mat>& ts, const cv::Mat& opoints, const cv::Mat& ipoints) {
CV_INSTRUMENT_REGION();
calibrateAndNormalizePointsPnP(opoints, ipoints);
Rs.reserve(4);
ts.reserve(4);
Vec3d X01 = X_copy[0] - X_copy[1];
Vec3d X02 = X_copy[0] - X_copy[2];
Vec3d X12 = X_copy[1] - X_copy[2];
double a01 = norm(X01, NORM_L2SQR);
double a02 = norm(X02, NORM_L2SQR);
double a12 = norm(X12, NORM_L2SQR);
std::array<Vec3d, 3> X = {X_copy[0], X_copy[1], X_copy[2]};
std::array<Vec3d, 3> x = {x_copy[0], x_copy[1], x_copy[2]};
// Switch X,x so that BC is the largest distance among {X01, X02, X12}
if (a01 > a02) {
if (a01 > a12) {
std::swap(x[0], x[2]);
std::swap(X[0], X[2]);
std::swap(a01, a12);
X01 = -X12;
X02 = -X02;
}
} else if (a02 > a12) {
std::swap(x[0], x[1]);
std::swap(X[0], X[1]);
std::swap(a02, a12);
X01 = -X01;
X02 = X12;
}
const double a12d = 1.0 / a12;
const double a = a01 * a12d;
const double b = a02 * a12d;
const double m01 = x[0].dot(x[1]);
const double m02 = x[0].dot(x[2]);
const double m12 = x[1].dot(x[2]);
// Ugly parameters to simplify the calculation
const double m12sq = -m12 * m12 + 1.0;
const double m02sq = -1.0 + m02 * m02;
const double m01sq = -1.0 + m01 * m01;
const double ab = a * b;
const double bsq = b * b;
const double asq = a * a;
const double m013 = -2.0 + 2.0 * m01 * m02 * m12;
const double bsqm12sq = bsq * m12sq;
const double asqm12sq = asq * m12sq;
const double abm12sq = 2.0 * ab * m12sq;
const double k3_inv = 1.0 / (bsqm12sq + b * m02sq);
const double k2 = k3_inv * ((-1.0 + a) * m02sq + abm12sq + bsqm12sq + b * m013);
const double k1 = k3_inv * (asqm12sq + abm12sq + a * m013 + (-1.0 + b) * m01sq);
const double k0 = k3_inv * (asqm12sq + a * m01sq);
double s;
bool G = yaqding::solve_cubic_single_real(k2, k1, k0, s);
Matx33d C;
C(0, 0) = -a + s * (1 - b);
C(0, 1) = -m02 * s;
C(0, 2) = a * m12 + b * m12 * s;
C(1, 0) = C(0, 1);
C(1, 1) = s + 1;
C(1, 2) = -m01;
C(2, 0) = C(0, 2);
C(2, 1) = C(1, 2);
C(2, 2) = -a - b * s + 1;
std::array<Vec3d, 2> pq = yaqding::compute_pq(C);
// XX << X01, X02, X01.cross(X02);
// XX = XX.inverse().eval();
Matx33d XX;
XX(0,0) = X01(0); XX(1,0) = X01(1); XX(2,0) = X01(2);
XX(0,1) = X02(0); XX(1,1) = X02(1); XX(2,1) = X02(2);
Vec3d X01_X02 = X01.cross(X02);
XX(0,2) = X01_X02(0); XX(1,2) = X01_X02(1); XX(2,2) = X01_X02(2);
XX = XX.inv();
int n_sols = 0;
for (int i = 0; i < 2; ++i) {
// [p0 p1 p2] * [1; x; y] = 0, or [p0 p1 p2] * [d2; d0; d1] = 0
double p0 = pq[i](0);
double p1 = pq[i](1);
double p2 = pq[i](2);
// here we run into trouble if p0 is zero,
// so depending on which is larger, we solve for either d0 or d1
// The case p0 = p1 = 0 is degenerate and can be ignored
bool switch_12 = std::abs(p0) <= std::abs(p1);
if (switch_12) {
// eliminate d0
double w0 = -p0 / p1;
double w1 = -p2 / p1;
double ca = 1.0 / (w1 * w1 - b);
double cb = 2.0 * (b * m12 - m02 * w1 + w0 * w1) * ca;
double cc = (w0 * w0 - 2 * m02 * w0 - b + 1.0) * ca;
double taus[2];
if (!yaqding::root2real(cb, cc, taus[0], taus[1]))
continue;
for (double tau : taus) {
if (tau <= 0)
continue;
// positive only
double d2 = std::sqrt(a12 / (tau * (tau - 2.0 * m12) + 1.0));
double d1 = tau * d2;
double d0 = (w0 * d2 + w1 * d1);
if (d0 < 0)
continue;
yaqding::refine_lambda(d0, d1, d2, a01, a02, a12, m01, m02, m12);
Vec3d v1 = d0 * x[0] - d1 * x[1];
Vec3d v2 = d0 * x[0] - d2 * x[2];
// YY << v1, v2, v1.cross(v2);
Matx33d YY;
YY(0,0) = v1(0); YY(1,0) = v1(1); YY(2,0) = v1(2);
YY(0,1) = v2(0); YY(1,1) = v2(1); YY(2,1) = v2(2);
Vec3d v1_v2 = v1.cross(v2);
YY(0,2) = v1_v2(0); YY(1,2) = v1_v2(1); YY(2,2) = v1_v2(2);
// output->emplace_back(R, d0 * x[0] - R * X[0]);
Matx33d R = (YY * XX);
Rs.push_back(Mat(R));
Vec3d trans = (d0 * x[0] - R * X[0]);
ts.push_back(Mat(trans));
++n_sols;
}
} else {
double w0 = -p1 / p0;
double w1 = -p2 / p0;
double ca = 1.0 / (-a * w1 * w1 + 2 * a * m12 * w1 - a + 1);
double cb = 2 * (a * m12 * w0 - m01 - a * w0 * w1) * ca;
double cc = (1 - a * w0 * w0) * ca;
double taus[2];
if (!yaqding::root2real(cb, cc, taus[0], taus[1]))
continue;
for (double tau : taus) {
if (tau <= 0)
continue;
double d0 = std::sqrt(a01 / (tau * (tau - 2.0 * m01) + 1.0));
double d1 = tau * d0;
double d2 = w0 * d0 + w1 * d1;
if (d2 < 0)
continue;
yaqding::refine_lambda(d0, d1, d2, a01, a02, a12, m01, m02, m12);
Vec3d v1 = d0 * x[0] - d1 * x[1];
Vec3d v2 = d0 * x[0] - d2 * x[2];
// YY << v1, v2, v1.cross(v2);
Matx33d YY;
YY(0,0) = v1(0); YY(1,0) = v1(1); YY(2,0) = v1(2);
YY(0,1) = v2(0); YY(1,1) = v2(1); YY(2,1) = v2(2);
Vec3d v1_v2 = v1.cross(v2);
YY(0,2) = v1_v2(0); YY(1,2) = v1_v2(1); YY(2,2) = v1_v2(2);
// output->emplace_back(R, d0 * x[0] - R * X[0]);
Matx33d R = (YY * XX);
Rs.push_back(Mat(R));
Vec3d trans = (d0 * x[0] - R * X[0]);
ts.push_back(Mat(trans));
++n_sols;
}
}
if (n_sols > 0 && G)
break;
}
return n_sols;
}

View File

@ -1,71 +1,18 @@
#ifndef P3P_H
#define P3P_H
#include "precomp.hpp"
class p3p
{
public:
p3p(double fx, double fy, double cx, double cy);
p3p(cv::Mat cameraMatrix);
class p3p {
public:
p3p();
int estimate(std::vector<cv::Mat>& Rs, std::vector<cv::Mat>& ts, const cv::Mat& opoints, const cv::Mat& ipoints);
bool solve(cv::Mat& R, cv::Mat& tvec, const cv::Mat& opoints, const cv::Mat& ipoints);
int solve(std::vector<cv::Mat>& Rs, std::vector<cv::Mat>& tvecs, const cv::Mat& opoints, const cv::Mat& ipoints);
int solve(double R[4][3][3], double t[4][3],
double mu0, double mv0, double X0, double Y0, double Z0,
double mu1, double mv1, double X1, double Y1, double Z1,
double mu2, double mv2, double X2, double Y2, double Z2,
double mu3, double mv3, double X3, double Y3, double Z3,
bool p4p);
bool solve(double R[3][3], double t[3],
double mu0, double mv0, double X0, double Y0, double Z0,
double mu1, double mv1, double X1, double Y1, double Z1,
double mu2, double mv2, double X2, double Y2, double Z2,
double mu3, double mv3, double X3, double Y3, double Z3);
private:
void calibrateAndNormalizePointsPnP(const cv::Mat& opoints, const cv::Mat& ipoints);
private:
template <typename T>
void init_camera_parameters(const cv::Mat& cameraMatrix)
{
cx = cameraMatrix.at<T> (0, 2);
cy = cameraMatrix.at<T> (1, 2);
fx = cameraMatrix.at<T> (0, 0);
fy = cameraMatrix.at<T> (1, 1);
}
template <typename OpointType, typename IpointType>
void extract_points(const cv::Mat& opoints, const cv::Mat& ipoints, std::vector<double>& points)
{
points.clear();
int npoints = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F));
points.resize(5*4); //resize vector to fit for p4p case
for(int i = 0; i < npoints; i++)
{
points[i*5] = ipoints.at<IpointType>(i).x*fx + cx;
points[i*5+1] = ipoints.at<IpointType>(i).y*fy + cy;
points[i*5+2] = opoints.at<OpointType>(i).x;
points[i*5+3] = opoints.at<OpointType>(i).y;
points[i*5+4] = opoints.at<OpointType>(i).z;
}
//Fill vectors with unused values for p3p case
for (int i = npoints; i < 4; i++) {
for (int j = 0; j < 5; j++) {
points[i * 5 + j] = 0;
}
}
}
void init_inverse_parameters();
int solve_for_lengths(double lengths[4][3], double distances[3], double cosines[3]);
bool align(double M_start[3][3],
double X0, double Y0, double Z0,
double X1, double Y1, double Z1,
double X2, double Y2, double Z2,
double R[3][3], double T[3]);
bool jacobi_4x4(double * A, double * D, double * U);
double fx, fy, cx, cy;
double inv_fx, inv_fy, cx_fx, cy_fy;
std::array<cv::Vec3d, 3> x_copy;
std::array<cv::Vec3d, 3> X_copy;
};
#endif // P3P_H

View File

@ -452,8 +452,8 @@ int solveP3P( InputArray _opoints, InputArray _ipoints,
int solutions = 0;
if (flags == SOLVEPNP_P3P)
{
p3p P3Psolver(cameraMatrix);
solutions = P3Psolver.solve(Rs, ts, opoints, undistortedPoints);
p3p P3Psolver;
solutions = P3Psolver.estimate(Rs, ts, opoints, undistortedPoints);
}
else if (flags == SOLVEPNP_AP3P)
{

View File

@ -425,19 +425,27 @@ static void cvUndistortPointsInternal( const CvMat* _src, CvMat* _dst, const CvM
x = srcd[i*sstep].x;
y = srcd[i*sstep].y;
}
// [u, v]^T = [fx * x''' + cx, fy * y''' + cy]^T =>
// [x''', y''']^T = [(u - cx) / fx, (v - cy) / fy]^T
u = x; v = y;
x = (x - cx)*ifx;
y = (y - cy)*ify;
if( _distCoeffs ) {
// compensate tilt distortion
// s * [x''', y''', 1]^T = matTilt * [x'', y'', 1]^T =>
// s * matTilt^{-1} * [x''', y''', 1]^T = [x'', y'', 1]^T =>
// (invMatTilt := matTilt^{-1}, vecUntilt := invMatTilt * [x''', y''', 1]^T)
// s * vecUntilt = [x'', y'', 1]^T =>
// s * vecUntilt_1 = x'', s * vecUntilt_2 = y'', s * vecUntilt_3 = 1 =>
// invProj := s = 1 / vecUntilt_3, x'' = invProj * vecUntilt_1, y'' = invProj * vecUntilt_2
cv::Vec3d vecUntilt = invMatTilt * cv::Vec3d(x, y, 1);
double invProj = vecUntilt(2) ? 1./vecUntilt(2) : 1;
x0 = x = invProj * vecUntilt(0);
y0 = y = invProj * vecUntilt(1);
double error = std::numeric_limits<double>::max();
// compensate distortion iteratively
// compensate distortion iteratively using fixed-point iteration
for( int j = 0; ; j++ )
{
@ -445,7 +453,9 @@ static void cvUndistortPointsInternal( const CvMat* _src, CvMat* _dst, const CvM
break;
if ((criteria.type & cv::TermCriteria::EPS) && error < criteria.epsilon)
break;
// r^2 = x'^2 + y'^2
double r2 = x*x + y*y;
// icdist := (1 + k4 * r^2 + k5 * r^4 + k6 * r^6) / (1 + k1 * r^2 + k2 * r^4 + k3 * r^6)
double icdist = (1 + ((k[7]*r2 + k[6])*r2 + k[5])*r2)/(1 + ((k[4]*r2 + k[1])*r2 + k[0])*r2);
if (icdist < 0) // test: undistortPoints.regression_14583
{
@ -453,8 +463,15 @@ static void cvUndistortPointsInternal( const CvMat* _src, CvMat* _dst, const CvM
y = (v - cy)*ify;
break;
}
// deltaX := 2 * p1 * x' * y' + p2 * (r^2 + 2 * x'^2) + s1 * r^2 + s2 * r^4
// deltaY := p1 * (r^2 + 2 * y'^2) + 2 * p2 * x' * y' + s3 * r^2 + s4 * r^4
double deltaX = 2*k[2]*x*y + k[3]*(r2 + 2*x*x)+ k[8]*r2+k[9]*r2*r2;
double deltaY = k[2]*(r2 + 2*y*y) + 2*k[3]*x*y+ k[10]*r2+k[11]*r2*r2;
// [x'', y'']^T = [x' / icdist + deltaX, y' / icdist + deltaY]^T =>
// [x', y']^T = [(x'' - deltaX) * icdist, (y'' - deltaY) * icdist]^T =>
// x' = f1(x') := (x'' - deltaX) * icdist, y' = f2(y') := (y'' - deltaY) * icdist
// Fixed-point iteration:
// new_x' = f1(x') = (x'' - deltaX) * icdist, new_y' = f2(y') = (y'' - deltaY) * icdist
x = (x0 - deltaX)*icdist;
y = (y0 - deltaY)*icdist;
@ -464,22 +481,33 @@ static void cvUndistortPointsInternal( const CvMat* _src, CvMat* _dst, const CvM
double xd, yd, xd0, yd0;
cv::Vec3d vecTilt;
// r^2 = x'^2 + y'^2
r2 = x*x + y*y;
r4 = r2*r2;
r6 = r4*r2;
a1 = 2*x*y;
a2 = r2 + 2*x*x;
a3 = r2 + 2*y*y;
// cdist := 1 + k1 * r^2 + k2 * r^4 + k3 * r^6
cdist = 1 + k[0]*r2 + k[1]*r4 + k[4]*r6;
// icdist2 := 1 / (1 + k4 * r^2 + k5 * r^4 + k6 * r^6)
icdist2 = 1./(1 + k[5]*r2 + k[6]*r4 + k[7]*r6);
// x'' = x' * cdist * icdist2 + 2 * p1 * x' * y' + p2 * (r^2 + 2 * x'^2) + s1 * r^2 + s2 * r^4
// y'' = y' * cdist * icdist2 + p1 * (r^2 + 2 * y'^2) + 2 * p2 * x' * y' + s3 * r^2 + s4 * r^4
xd0 = x*cdist*icdist2 + k[2]*a1 + k[3]*a2 + k[8]*r2+k[9]*r4;
yd0 = y*cdist*icdist2 + k[2]*a3 + k[3]*a1 + k[10]*r2+k[11]*r4;
// s * [x''', y''', 1]^T = matTilt * [x'', y'', 1]^T =>
// (vecTilt := matTilt * [x'', y'', 1]^T)
// s * [x''', y''', 1]^T = vecTilt =>
// s * x''' = vecTilt_1, s * y''' = vecTilt_2, s = vecTilt_3 =>
// invProj := 1 / s = 1 / vecTilt_3, x''' = invProj * vecTilt_1, y''' = invProj * vecTilt_2
vecTilt = matTilt*cv::Vec3d(xd0, yd0, 1);
invProj = vecTilt(2) ? 1./vecTilt(2) : 1;
xd = invProj * vecTilt(0);
yd = invProj * vecTilt(1);
// [u, v]^T = [fx * x''' + cx, fy * y''' + cy]^T
double x_proj = xd*fx + cx;
double y_proj = yd*fy + cy;
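A minimal sketch of the fixed-point scheme described above, reduced to the radial terms k1..k3 so that the update new_x' = x'' * icdist, with icdist = 1/cdist here, is easy to follow (the helper name, iteration budget, and tolerance are illustrative):
#include <cmath>

static void undistortNormalized(double xd, double yd,                // distorted normalized coords
                                double k1, double k2, double k3,
                                double& x, double& y,                // undistorted result
                                int maxIter = 20, double eps = 1e-12)
{
    x = xd; y = yd; // initial guess: the distorted point itself
    for (int i = 0; i < maxIter; i++)
    {
        double r2 = x*x + y*y;
        double cdist = 1 + r2*(k1 + r2*(k2 + r2*k3)); // 1 + k1 r^2 + k2 r^4 + k3 r^6
        double xn = xd / cdist, yn = yd / cdist;      // new_x' = x'' * icdist
        bool done = std::fabs(xn - x) < eps && std::fabs(yn - y) < eps;
        x = xn; y = yn;
        if (done)
            break;
    }
}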

View File

@ -658,8 +658,6 @@ namespace Math {
Matx33d getSkewSymmetric(const Vec3d &v_);
// eliminate matrix with m rows and n columns to be upper triangular.
bool eliminateUpperTriangular (std::vector<double> &a, int m, int n);
Matx33d rotVec2RotMat (const Vec3d &v);
Vec3d rotMat2RotVec (const Matx33d &R);
}
class SolverPoly: public Algorithm {

View File

@ -199,7 +199,7 @@ public:
// and translation.
const double qi = s1, qi2 = qi*qi, qj = s2, qj2 = qj*qj, qk = s3, qk2 = qk*qk;
const double s = 1 / (1 + qi2 + qj2 + qk2);
const Matx33d rot_mat (1-2*s*(qj2+qk2), 2*s*(qi*qj+qk), 2*s*(qi*qk-qj),
Matx33d rot_mat (1-2*s*(qj2+qk2), 2*s*(qi*qj+qk), 2*s*(qi*qk-qj),
2*s*(qi*qj-qk), 1-2*s*(qi2+qk2), 2*s*(qj*qk+qi),
2*s*(qi*qk+qj), 2*s*(qj*qk-qi), 1-2*s*(qi2+qj2));
const Matx31d soln_translation = translation_factor * rot_mat.reshape<9,1>();
@ -217,8 +217,14 @@ public:
}
if (all_points_in_front_of_camera) {
// https://github.com/opencv/opencv/blob/2ba688f23c4e20754f32179d9396ba9b54b3b064/modules/calib3d/src/usac/pnp_solver.cpp#L395
// Use cv::Rodrigues directly
Matx31d rvec;
Rodrigues(rot_mat, rvec);
Rodrigues(rvec, rot_mat);
Mat model;
hconcat(Math::rotVec2RotMat(Math::rotMat2RotVec(rot_mat)), soln_translation, model);
hconcat(rot_mat, soln_translation, model);
models_.emplace_back(K * model);
}
}
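The Rodrigues round trip used above projects an approximate rotation matrix back onto a proper rotation; as a standalone helper (the function name is illustrative):
#include <opencv2/calib3d.hpp>

static cv::Matx33d orthonormalize(const cv::Matx33d& R_approx)
{
    cv::Matx31d rvec;
    cv::Rodrigues(R_approx, rvec); // rotation matrix -> axis-angle vector
    cv::Matx33d R;
    cv::Rodrigues(rvec, R);        // axis-angle -> proper rotation matrix
    return R;
}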

View File

@ -392,7 +392,12 @@ public:
zw[1] = Zw2(0); zw[4] = Zw2(1); zw[7] = Zw2(2);
zw[2] = Z3crZ1w(0); zw[5] = Z3crZ1w(1); zw[8] = Z3crZ1w(2);
const Matx33d R = Math::rotVec2RotMat(Math::rotMat2RotVec(Z * Zw.inv()));
Matx33d R = Z * Zw.inv();
// https://github.com/opencv/opencv/blob/2ba688f23c4e20754f32179d9396ba9b54b3b064/modules/calib3d/src/usac/pnp_solver.cpp#L395
// Use cv::Rodrigues directly
Matx31d rvec;
Rodrigues(R, rvec);
Rodrigues(rvec, R);
Matx33d KR = K * R;
Matx34d P;
hconcat(KR, -KR * (X1 - R.t() * nX1), P);

View File

@ -422,46 +422,6 @@ Matx33d Math::getSkewSymmetric(const Vec3d &v) {
-v[1], v[0], 0};
}
Matx33d Math::rotVec2RotMat (const Vec3d &v) {
const double phi = sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2]);
const double x = v[0] / phi, y = v[1] / phi, z = v[2] / phi;
const double a = sin(phi), b = cos(phi);
// R = I + sin(phi) * skew(v) + (1 - cos(phi)) * skew(v)^2
return {(b - 1)*y*y + (b - 1)*z*z + 1, -a*z - x*y*(b - 1), a*y - x*z*(b - 1),
a*z - x*y*(b - 1), (b - 1)*x*x + (b - 1)*z*z + 1, -a*x - y*z*(b - 1),
-a*y - x*z*(b - 1), a*x - y*z*(b - 1), (b - 1)*x*x + (b - 1)*y*y + 1};
}
Vec3d Math::rotMat2RotVec (const Matx33d &R) {
// https://math.stackexchange.com/questions/83874/efficient-and-accurate-numerical-implementation-of-the-inverse-rodrigues-rotatio?rq=1
Vec3d rot_vec;
const double trace = R(0,0)+R(1,1)+R(2,2);
if (trace >= 3 - FLT_EPSILON) {
rot_vec = (0.5 * (trace-3)/12)*Vec3d(R(2,1)-R(1,2),
R(0,2)-R(2,0),
R(1,0)-R(0,1));
} else if (3 - FLT_EPSILON > trace && trace > -1 + FLT_EPSILON) {
double theta = acos((trace - 1) / 2);
rot_vec = (theta / (2 * sin(theta))) * Vec3d(R(2,1)-R(1,2),
R(0,2)-R(2,0),
R(1,0)-R(0,1));
} else {
int a;
if (R(0,0) > R(1,1))
a = R(0,0) > R(2,2) ? 0 : 2;
else
a = R(1,1) > R(2,2) ? 1 : 2;
Vec3d v;
int b = (a + 1) % 3, c = (a + 2) % 3;
double s = sqrt(R(a,a) - R(b,b) - R(c,c) + 1);
v[a] = s / 2;
v[b] = (R(b,a) + R(a,b)) / (2 * s);
v[c] = (R(c,a) + R(a,c)) / (2 * s);
rot_vec = M_PI * v / norm(v);
}
return rot_vec;
}
/*
* Eliminate matrix of m rows and n columns to be upper triangular.
*/

View File

@ -470,7 +470,7 @@ public:
{
eps[SOLVEPNP_ITERATIVE] = 1.0e-6;
eps[SOLVEPNP_EPNP] = 1.0e-6;
eps[SOLVEPNP_P3P] = 2.0e-4;
eps[SOLVEPNP_P3P] = 1.0e-4;
eps[SOLVEPNP_AP3P] = 1.0e-4;
eps[SOLVEPNP_DLS] = 1.0e-6; // DLS is remapped to EPnP, so we use the same threshold
eps[SOLVEPNP_UPNP] = 1.0e-6; // UPnP is remapped to EPnP, so we use the same threshold
@ -613,7 +613,7 @@ class CV_solveP3P_Test : public CV_solvePnPRansac_Test
public:
CV_solveP3P_Test()
{
eps[SOLVEPNP_P3P] = 2.0e-4;
eps[SOLVEPNP_P3P] = 1.0e-4;
eps[SOLVEPNP_AP3P] = 1.0e-4;
totalTestsCount = 1000;
}
@ -1513,7 +1513,7 @@ TEST(Calib3d_SolvePnP, generic)
for (size_t i = 0; i < rvecs_est.size() && !isTestSuccess; i++) {
double rvecDiff = cvtest::norm(rvecs_est[i], rvec_ground_truth, NORM_L2);
double tvecDiff = cvtest::norm(tvecs_est[i], tvec_ground_truth, NORM_L2);
const double threshold = method == SOLVEPNP_P3P ? 1e-2 : 1e-4;
const double threshold = 1e-4;
isTestSuccess = rvecDiff < threshold && tvecDiff < threshold;
}
@ -1581,7 +1581,7 @@ TEST(Calib3d_SolvePnP, generic)
for (size_t i = 0; i < rvecs_est.size() && !isTestSuccess; i++) {
double rvecDiff = cvtest::norm(rvecs_est[i], rvec_ground_truth, NORM_L2);
double tvecDiff = cvtest::norm(tvecs_est[i], tvec_ground_truth, NORM_L2);
const double threshold = method == SOLVEPNP_P3P ? 1e-2 : 1e-4;
const double threshold = 1e-4;
isTestSuccess = rvecDiff < threshold && tvecDiff < threshold;
}

View File

@ -165,7 +165,7 @@ if(HAVE_CUDA)
message(FATAL_ERROR "CUDA: OpenCV requires enabled 'cudev' module from 'opencv_contrib' repository: https://github.com/opencv/opencv_contrib")
endif()
if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS})
list(APPEND extra_libs CUDA::cudart${CUDA_LIB_EXT})
endif()
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function -Wshadow)
endif()
@ -209,7 +209,11 @@ if(HAVE_OPENMP AND DEFINED OpenMP_CXX_LIBRARIES AND OpenMP_CXX_LIBRARIES)
ocv_target_link_libraries(${the_module} LINK_PRIVATE "${OpenMP_CXX_LIBRARIES}")
endif()
ocv_add_accuracy_tests()
set(test_libs "")
if(HAVE_CUDA AND ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
list(APPEND test_libs CUDA::cudart${CUDA_LIB_EXT})
endif()
ocv_add_accuracy_tests(${test_libs})
ocv_add_perf_tests()
ocv_install_3rdparty_licenses(SoftFloat "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/SoftFloat/COPYING.txt")

View File

@ -540,9 +540,9 @@ The function LUT fills the output array with values from the look-up table. Indi
are taken from the input array. That is, the function processes each element of src as follows:
\f[\texttt{dst} (I) \leftarrow \texttt{lut(src(I) + d)}\f]
where
\f[d = \fork{0}{if \(\texttt{src}\) has depth \(\texttt{CV_8U}\)}{128}{if \(\texttt{src}\) has depth \(\texttt{CV_8S}\)}\f]
@param src input array of 8-bit elements.
@param lut look-up table of 256 elements; in case of multi-channel input array, the table should
\f[d = \forkthree{0}{if \(\texttt{src}\) has depth \(\texttt{CV_8U}\) or \(\texttt{CV_16U}\)}{128}{if \(\texttt{src}\) has depth \(\texttt{CV_8S}\)}{32768}{if \(\texttt{src}\) has depth \(\texttt{CV_16S}\)}\f]
@param src input array of 8-bit or 16-bit integer elements.
@param lut look-up table of 256 elements (if src has depth CV_8U or CV_8S) or 65536 elements (if src has depth CV_16U or CV_16S); in case of multi-channel input array, the table should
either have a single channel (in this case the same table is used for all channels) or the same
number of channels as in the input array.
@param dst output array of the same size and number of channels as src, and the same depth as lut.
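A hedged usage sketch of the extended 16-bit path documented above (it assumes the 65536-entry table support added in this changeset):
#include <opencv2/core.hpp>

int main()
{
    cv::Mat src(480, 640, CV_16UC1, cv::Scalar(1000));
    cv::Mat lut(1, 65536, CV_8UC1);            // one entry per 16-bit index
    for (int i = 0; i < 65536; i++)
        lut.at<uchar>(i) = (uchar)(i >> 8);    // simple 16-bit -> 8-bit tone map
    cv::Mat dst;
    cv::LUT(src, lut, dst);                    // dst becomes CV_8UC1
    return 0;
}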

View File

@ -54,6 +54,7 @@
#include "opencv2/core/base.hpp"
#include "opencv2/core/cuda.hpp"
#include "opencv2/core/private/cuda_stubs.hpp"
#ifdef HAVE_CUDA
# include <cuda.h>
@ -101,16 +102,10 @@ namespace cv { namespace cuda {
CV_EXPORTS void syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream);
}}
#ifndef HAVE_CUDA
static inline CV_NORETURN void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without CUDA support"); }
#else // HAVE_CUDA
#ifdef HAVE_CUDA
#define nppSafeSetStream(oldStream, newStream) { if(oldStream != newStream) { cudaStreamSynchronize(oldStream); nppSetStream(newStream); } }
static inline CV_NORETURN void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform"); }
namespace cv { namespace cuda
{
static inline void checkNppError(int code, const char* file, const int line, const char* func)

View File

@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_PRIVATE_CUDA_STUBS_HPP
#define OPENCV_CORE_PRIVATE_CUDA_STUBS_HPP
#ifndef __OPENCV_BUILD
# error this is a private header which should not be used from outside of the OpenCV library
#endif
#include "opencv2/core/cvdef.h"
#include "opencv2/core/base.hpp"
#ifndef HAVE_CUDA
static inline CV_NORETURN void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without CUDA support"); }
#else
static inline CV_NORETURN void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform"); }
#endif
#endif // OPENCV_CORE_PRIVATE_CUDA_STUBS_HPP

View File

@ -1931,7 +1931,7 @@ template<typename _Tp> static inline
Rect_<_Tp>& operator &= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
{
if (a.empty() || b.empty()) {
a = Rect();
a = Rect_<_Tp>();
return a;
}
const Rect_<_Tp>& Rx_min = (a.x < b.x) ? a : b;
@ -1945,7 +1945,7 @@ Rect_<_Tp>& operator &= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
// Let us first deal with the following case.
if ((Rx_min.x < 0 && Rx_min.x + Rx_min.width < Rx_max.x) ||
(Ry_min.y < 0 && Ry_min.y + Ry_min.height < Ry_max.y)) {
a = Rect();
a = Rect_<_Tp>();
return a;
}
// We now know that either Rx_min.x >= 0, or
@ -1957,7 +1957,7 @@ Rect_<_Tp>& operator &= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
a.x = Rx_max.x;
a.y = Ry_max.y;
if (a.empty())
a = Rect();
a = Rect_<_Tp>();
return a;
}
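A quick sanity sketch of the type fix above: the empty intersection result now stays in the rectangle's own value type instead of passing through an int Rect:
#include <opencv2/core.hpp>

int main()
{
    cv::Rect2d a(0.5, 0.5, 1.0, 1.0), b(5.0, 5.0, 1.0, 1.0);
    a &= b; // disjoint: 'a' is reset to an empty Rect2d, not an int Rect
    CV_Assert(a.empty());
    return 0;
}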

View File

@ -282,6 +282,32 @@ inline int hal_ni_lut(const uchar *src_data, size_t src_step, size_t src_type, c
#define cv_hal_lut hal_ni_lut
//! @endcond
/**
Lookup table replacement
The table consists of 65536 elements, each 1 to 8 bytes in size, with either 1 channel or src_channels channels.
For 16S input type, 32768 is added to the LUT index.
The destination has the same element type and number of channels as the lookup table elements.
@param src_data Source image data
@param src_step Source image step
@param src_type Source image type
@param lut_data Pointer to lookup table
@param lut_channel_size Size of each channel in bytes
@param lut_channels Number of channels in lookup table
@param dst_data Destination data
@param dst_step Destination step
@param width Width of images
@param height Height of images
@sa LUT
*/
//! @addtogroup core_hal_interface_lut16 Lookup table for 16 bit index
//! @{
inline int hal_ni_lut16(const ushort *src_data, size_t src_step, size_t src_type, const uchar* lut_data, size_t lut_channel_size, size_t lut_channels, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
//! @cond IGNORED
#define cv_hal_lut16 hal_ni_lut16
//! @endcond
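A custom HAL could take over this hook; a hedged sketch of one possible replacement for the single-channel CV_16U case, following the contract above (the function name is hypothetical):
#include "opencv2/core/hal/interface.h"

static inline int my_hal_lut16(const ushort* src_data, size_t src_step, size_t src_type,
                               const uchar* lut_data, size_t lut_channel_size, size_t lut_channels,
                               uchar* dst_data, size_t dst_step, int width, int height)
{
    // handle only the simplest case here; defer everything else to the default path
    if ((int)src_type != CV_16UC1 || lut_channels != 1 || lut_channel_size != 1)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    for (int y = 0; y < height; y++)
    {
        const ushort* s = (const ushort*)((const uchar*)src_data + y * src_step);
        uchar* d = dst_data + y * dst_step;
        for (int x = 0; x < width; x++)
            d[x] = lut_data[s[x]];
    }
    return CV_HAL_ERROR_OK;
}
// #undef cv_hal_lut16
// #define cv_hal_lut16 my_hal_lut16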
/**
Hamming norm of a vector
@param a pointer to vector data

View File

@ -6,6 +6,7 @@
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "convert.hpp"
#include <sys/types.h>
/****************************************************************************************\
* LUT Transform *
@ -14,8 +15,8 @@
namespace cv
{
template<typename T> static void
LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
template<typename Ti, typename T> static void
LUT_( const Ti* src, const T* lut, T* dst, const int len, const int cn, const int lutcn )
{
if( lutcn == 1 )
{
@ -30,53 +31,45 @@ LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
}
}
static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_16f( const uchar* src, const hfloat* lut, hfloat* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
{
LUT8u_( src, lut, dst, len, cn, lutcn );
}
typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
static LUTFunc lutTab[CV_DEPTH_MAX] =
static LUTFunc getLUTFunc(const int srcDepth, const int dstDepth)
{
(LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
(LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, (LUTFunc)LUT8u_16f
};
LUTFunc ret = nullptr;
if((srcDepth == CV_8U) || (srcDepth == CV_8S))
{
switch(dstDepth)
{
case CV_8U: ret = (LUTFunc)LUT_<uint8_t, uint8_t>; break;
case CV_8S: ret = (LUTFunc)LUT_<uint8_t, int8_t>; break;
case CV_16U: ret = (LUTFunc)LUT_<uint8_t, uint16_t>; break;
case CV_16S: ret = (LUTFunc)LUT_<uint8_t, int16_t>; break;
case CV_32S: ret = (LUTFunc)LUT_<uint8_t, int32_t>; break;
case CV_32F: ret = (LUTFunc)LUT_<uint8_t, int32_t>; break; // float
case CV_64F: ret = (LUTFunc)LUT_<uint8_t, int64_t>; break; // double
case CV_16F: ret = (LUTFunc)LUT_<uint8_t, int16_t>; break; // hfloat
default: ret = nullptr; break;
}
}
else if((srcDepth == CV_16U) || (srcDepth == CV_16S))
{
switch(dstDepth)
{
case CV_8U: ret = (LUTFunc)LUT_<uint16_t, uint8_t>; break;
case CV_8S: ret = (LUTFunc)LUT_<uint16_t, int8_t>; break;
case CV_16U: ret = (LUTFunc)LUT_<uint16_t, uint16_t>; break;
case CV_16S: ret = (LUTFunc)LUT_<uint16_t, int16_t>; break;
case CV_32S: ret = (LUTFunc)LUT_<uint16_t, int32_t>; break;
case CV_32F: ret = (LUTFunc)LUT_<uint16_t, int32_t>; break; // float
case CV_64F: ret = (LUTFunc)LUT_<uint16_t, int64_t>; break; // double
case CV_16F: ret = (LUTFunc)LUT_<uint16_t, int16_t>; break; // hfloat
default: ret = nullptr; break;
}
}
CV_CheckTrue(ret != nullptr, "An unexpected type combination was specified.");
return ret;
}
#ifdef HAVE_OPENCL
@ -107,24 +100,19 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
class LUTParallelBody : public ParallelLoopBody
{
public:
bool* ok;
const Mat& src_;
const Mat& lut_;
Mat& dst_;
LUTFunc func;
LUTFunc func_;
LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
: ok(_ok), src_(src), lut_(lut), dst_(dst)
LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, LUTFunc func)
: src_(src), lut_(lut), dst_(dst), func_(func)
{
func = lutTab[lut.depth()];
*ok = (func != NULL);
}
void operator()( const cv::Range& range ) const CV_OVERRIDE
{
CV_Assert(*ok);
const int row0 = range.start;
const int row1 = range.end;
@ -140,7 +128,7 @@ public:
int len = (int)it.size;
for( size_t i = 0; i < it.nplanes; i++, ++it )
func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
func_(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
}
private:
LUTParallelBody(const LUTParallelBody&);
@ -155,39 +143,47 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
int cn = _src.channels(), depth = _src.depth();
int lutcn = _lut.channels();
const size_t lut_size = _lut.total();
CV_Assert( (lutcn == cn || lutcn == 1) &&
_lut.total() == 256 && _lut.isContinuous() &&
(depth == CV_8U || depth == CV_8S) );
CV_Assert( (lutcn == cn || lutcn == 1) && _lut.isContinuous() &&
(
((lut_size == 256) && ((depth == CV_8U)||(depth == CV_8S))) ||
((lut_size == 65536) && ((depth == CV_16U)||(depth == CV_16S)))
)
);
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (lut_size == 256),
ocl_LUT(_src, _lut, _dst))
Mat src = _src.getMat(), lut = _lut.getMat();
_dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
Mat dst = _dst.getMat();
CALL_HAL(LUT, cv_hal_lut, src.data, src.step, src.type(), lut.data,
lut.elemSize1(), lutcn, dst.data, dst.step, src.cols, src.rows);
if(lut_size == 256)
{
CALL_HAL(LUT, cv_hal_lut, src.data, src.step, src.type(), lut.data,
lut.elemSize1(), lutcn, dst.data, dst.step, src.cols, src.rows);
}
else
{
CALL_HAL(LUT16, cv_hal_lut16, src.ptr<ushort>(), src.step, src.type(), lut.data,
lut.elemSize1(), lutcn, dst.data, dst.step, src.cols, src.rows);
}
const LUTFunc func = getLUTFunc(src.depth(), dst.depth());
CV_Assert( func != nullptr );
if (_src.dims() <= 2)
{
bool ok = false;
LUTParallelBody body(src, lut, dst, &ok);
if (ok)
{
Range all(0, dst.rows);
if (dst.total() >= (size_t)(1<<18))
parallel_for_(all, body, (double)std::max((size_t)1, dst.total()>>16));
else
body(all);
if (ok)
return;
}
}
LUTParallelBody body(src, lut, dst, func);
Range all(0, dst.rows);
if (dst.total() >= (size_t)(1<<18))
parallel_for_(all, body, (double)std::max((size_t)1, dst.total()>>16));
else
body(all);
LUTFunc func = lutTab[lut.depth()];
CV_Assert( func != 0 );
return;
}
const Mat* arrays[] = {&src, &dst, 0};
uchar* ptrs[2] = {};

View File

@ -3221,11 +3221,12 @@ INSTANTIATE_TEST_CASE_P(Core_CartPolar, Core_PolarToCart_inplace,
)
);
CV_ENUM(LutMatType, CV_8U, CV_16U, CV_16F, CV_32S, CV_32F, CV_64F)
CV_ENUM(LutIdxType, CV_8U, CV_8S, CV_16U, CV_16S)
CV_ENUM(LutMatType, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
struct Core_LUT: public testing::TestWithParam<LutMatType>
struct Core_LUT: public testing::TestWithParam< std::tuple<LutIdxType, LutMatType> >
{
template<typename T, int ch, bool same_cn>
template<typename Ti, typename T, int ch, bool same_cn>
cv::Mat referenceWithType(cv::Mat input, cv::Mat table)
{
cv::Mat ref(input.size(), CV_MAKE_TYPE(table.depth(), ch));
@ -3235,7 +3236,7 @@ struct Core_LUT: public testing::TestWithParam<LutMatType>
{
if(ch == 1)
{
ref.at<T>(i, j) = table.at<T>(input.at<uchar>(i, j));
ref.at<T>(i, j) = table.at<T>(input.at<Ti>(i, j));
}
else
{
@ -3244,11 +3245,11 @@ struct Core_LUT: public testing::TestWithParam<LutMatType>
{
if (same_cn)
{
val[k] = table.at<Vec<T, ch>>(input.at<Vec<uchar, ch>>(i, j)[k])[k];
val[k] = table.at<Vec<T, ch>>(input.at<Vec<Ti, ch>>(i, j)[k])[k];
}
else
{
val[k] = table.at<T>(input.at<Vec<uchar, ch>>(i, j)[k]);
val[k] = table.at<T>(input.at<Vec<Ti, ch>>(i, j)[k]);
}
}
ref.at<Vec<T, ch>>(i, j) = val;
@ -3261,86 +3262,114 @@ struct Core_LUT: public testing::TestWithParam<LutMatType>
template<int ch = 1, bool same_cn = false>
cv::Mat reference(cv::Mat input, cv::Mat table)
{
if (table.depth() == CV_8U)
cv::Mat ret = cv::Mat();
if ((input.depth() == CV_8U) || (input.depth() == CV_8S)) // Index type for LUT operation
{
return referenceWithType<uchar, ch, same_cn>(input, table);
switch(table.depth()) // Value type for LUT operation
{
case CV_8U: ret = referenceWithType<uint8_t, uint8_t, ch, same_cn>(input, table); break;
case CV_8S: ret = referenceWithType<uint8_t, int8_t, ch, same_cn>(input, table); break;
case CV_16U: ret = referenceWithType<uint8_t, uint16_t, ch, same_cn>(input, table); break;
case CV_16S: ret = referenceWithType<uint8_t, int16_t, ch, same_cn>(input, table); break;
case CV_32S: ret = referenceWithType<uint8_t, int32_t, ch, same_cn>(input, table); break;
case CV_32F: ret = referenceWithType<uint8_t, float, ch, same_cn>(input, table); break;
case CV_64F: ret = referenceWithType<uint8_t, double, ch, same_cn>(input, table); break;
case CV_16F: ret = referenceWithType<uint8_t, uint16_t, ch, same_cn>(input, table); break;
default: ret = cv::Mat(); break;
}
}
else if (table.depth() == CV_16U)
else if ((input.depth() == CV_16U) || (input.depth() == CV_16S))
{
return referenceWithType<ushort, ch, same_cn>(input, table);
}
else if (table.depth() == CV_16F)
{
return referenceWithType<ushort, ch, same_cn>(input, table);
}
else if (table.depth() == CV_32S)
{
return referenceWithType<int, ch, same_cn>(input, table);
}
else if (table.depth() == CV_32F)
{
return referenceWithType<float, ch, same_cn>(input, table);
}
else if (table.depth() == CV_64F)
{
return referenceWithType<double, ch, same_cn>(input, table);
switch(table.depth()) // Value type for LUT operation
{
case CV_8U: ret = referenceWithType<uint16_t, uint8_t, ch, same_cn>(input, table); break;
case CV_8S: ret = referenceWithType<uint16_t, int8_t, ch, same_cn>(input, table); break;
case CV_16U: ret = referenceWithType<uint16_t, uint16_t, ch, same_cn>(input, table); break;
case CV_16S: ret = referenceWithType<uint16_t, int16_t, ch, same_cn>(input, table); break;
case CV_32S: ret = referenceWithType<uint16_t, int32_t, ch, same_cn>(input, table); break;
case CV_32F: ret = referenceWithType<uint16_t, float, ch, same_cn>(input, table); break;
case CV_64F: ret = referenceWithType<uint16_t, double, ch, same_cn>(input, table); break;
case CV_16F: ret = referenceWithType<uint16_t, uint16_t, ch, same_cn>(input, table); break;
default: ret = cv::Mat(); break;
}
}
return cv::Mat();
return ret;
}
};
TEST_P(Core_LUT, accuracy)
{
int type = GetParam();
cv::Mat input(117, 113, CV_8UC1);
randu(input, 0, 256);
int idx_type = get<0>(GetParam());
int value_type = get<1>(GetParam());
cv::Mat table(1, 256, CV_MAKE_TYPE(type, 1));
randu(table, 0, getMaxVal(type));
ASSERT_TRUE((idx_type == CV_8U) || (idx_type == CV_8S) || (idx_type == CV_16U ) || (idx_type == CV_16S));
const int tableSize = ((idx_type == CV_8U) || (idx_type == CV_8S)) ? 256: 65536;
cv::Mat input(117, 113, CV_MAKE_TYPE(idx_type, 1));
randu(input, getMinVal(idx_type), getMaxVal(idx_type));
cv::Mat table(1, tableSize, CV_MAKE_TYPE(value_type, 1));
randu(table, getMinVal(value_type), getMaxVal(value_type));
cv::Mat output;
cv::LUT(input, table, output);
ASSERT_NO_THROW(cv::LUT(input, table, output));
ASSERT_FALSE(output.empty());
cv::Mat gt = reference(input, table);
ASSERT_FALSE(gt.empty());
ASSERT_EQ(0, cv::norm(output, gt, cv::NORM_INF));
}
TEST_P(Core_LUT, accuracy_multi)
{
int type = (int)GetParam();
cv::Mat input(117, 113, CV_8UC3);
randu(input, 0, 256);
int idx_type = get<0>(GetParam());
int value_type = get<1>(GetParam());
cv::Mat table(1, 256, CV_MAKE_TYPE(type, 1));
randu(table, 0, getMaxVal(type));
ASSERT_TRUE((idx_type == CV_8U) || (idx_type == CV_8S) || (idx_type == CV_16U) || (idx_type == CV_16S));
const int tableSize = ((idx_type == CV_8U) || (idx_type == CV_8S) ) ? 256: 65536;
cv::Mat input(117, 113, CV_MAKE_TYPE(idx_type, 3));
randu(input, getMinVal(idx_type), getMaxVal(idx_type));
cv::Mat table(1, tableSize, CV_MAKE_TYPE(value_type, 1));
randu(table, getMinVal(value_type), getMaxVal(value_type));
cv::Mat output;
cv::LUT(input, table, output);
ASSERT_NO_THROW(cv::LUT(input, table, output));
ASSERT_FALSE(output.empty());
cv::Mat gt = reference<3>(input, table);
ASSERT_FALSE(gt.empty());
ASSERT_EQ(0, cv::norm(output, gt, cv::NORM_INF));
}
TEST_P(Core_LUT, accuracy_multi2)
{
int type = (int)GetParam();
cv::Mat input(117, 113, CV_8UC3);
randu(input, 0, 256);
int idx_type = get<0>(GetParam());
int value_type = get<1>(GetParam());
cv::Mat table(1, 256, CV_MAKE_TYPE(type, 3));
randu(table, 0, getMaxVal(type));
ASSERT_TRUE((idx_type == CV_8U) || (idx_type == CV_8S) || (idx_type == CV_16U) || (idx_type == CV_16S));
const int tableSize = ((idx_type == CV_8U) || (idx_type == CV_8S)) ? 256: 65536;
cv::Mat input(117, 113, CV_MAKE_TYPE(idx_type, 3));
randu(input, getMinVal(idx_type), getMaxVal(idx_type));
cv::Mat table(1, tableSize, CV_MAKE_TYPE(value_type, 3));
randu(table, getMinVal(value_type), getMaxVal(value_type));
cv::Mat output;
cv::LUT(input, table, output);
ASSERT_NO_THROW(cv::LUT(input, table, output));
ASSERT_FALSE(output.empty());
cv::Mat gt = reference<3, true>(input, table);
ASSERT_FALSE(gt.empty());
ASSERT_EQ(0, cv::norm(output, gt, cv::NORM_INF));
}
INSTANTIATE_TEST_CASE_P(/**/, Core_LUT, LutMatType::all());
INSTANTIATE_TEST_CASE_P(/**/, Core_LUT, testing::Combine( LutIdxType::all(), LutMatType::all()));
}} // namespace
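A minimal usage sketch of the extended LUT semantics exercised above (assuming the 16-bit index support these tests target; sizes and types here are illustrative only):

#include <opencv2/core.hpp>

int main()
{
    cv::Mat idx(4, 4, CV_16UC1);            // 16-bit indices
    cv::randu(idx, 0, 65536);
    cv::Mat table(1, 65536, CV_32FC1);      // one table entry per possible index
    cv::randu(table, 0, 1);
    cv::Mat out;
    cv::LUT(idx, table, out);               // out: CV_32FC1, same size as idx
    CV_Assert(out.type() == CV_32FC1 && out.size() == idx.size());
    return 0;
}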

View File

@ -179,7 +179,7 @@ if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
endforeach()
unset(CC_LIST)
if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
list(APPEND libs ${CUDNN_LIBRARIES} CUDA::cublas${CUDA_LIB_EXT})
list(APPEND libs CUDA::cudart${CUDA_LIB_EXT} ${CUDNN_LIBRARIES} CUDA::cublas${CUDA_LIB_EXT})
if(NOT CUDA_VERSION VERSION_LESS 10.1)
list(APPEND libs CUDA::cublasLt${CUDA_LIB_EXT})
endif()

View File

@ -2048,6 +2048,85 @@ static inline void convBlock4x4(int np, const float* a, const float* b, float* c
}
#endif
#if defined(__EMSCRIPTEN__)
template<int RowBase, int NR = 24>
static inline void conv2xNR(
int np,
const float* a,
const float* b,
float* c, int ldc,
bool init_c)
{
static_assert(NR % 8 == 0, "NR must be a multiple of 8");
float* d0 = c + (RowBase + 0) * ldc;
float* d1 = c + (RowBase + 1) * ldc;
// p = 0
{
const float* bp = b;
const float a0 = a[RowBase + 0];
const float a1 = a[RowBase + 1];
if (init_c)
{
for (int j = 0; j < NR; j += 8)
{
const float b0=bp[j+0], b1=bp[j+1], b2=bp[j+2], b3=bp[j+3];
const float b4=bp[j+4], b5=bp[j+5], b6=bp[j+6], b7=bp[j+7];
d0[j+0] = b0*a0; d0[j+1] = b1*a0; d0[j+2] = b2*a0; d0[j+3] = b3*a0;
d0[j+4] = b4*a0; d0[j+5] = b5*a0; d0[j+6] = b6*a0; d0[j+7] = b7*a0;
d1[j+0] = b0*a1; d1[j+1] = b1*a1; d1[j+2] = b2*a1; d1[j+3] = b3*a1;
d1[j+4] = b4*a1; d1[j+5] = b5*a1; d1[j+6] = b6*a1; d1[j+7] = b7*a1;
}
} else
{
for (int j = 0; j < NR; j += 8)
{
const float b0=bp[j+0], b1=bp[j+1], b2=bp[j+2], b3=bp[j+3];
const float b4=bp[j+4], b5=bp[j+5], b6=bp[j+6], b7=bp[j+7];
d0[j+0] += b0*a0; d0[j+1] += b1*a0; d0[j+2] += b2*a0; d0[j+3] += b3*a0;
d0[j+4] += b4*a0; d0[j+5] += b5*a0; d0[j+6] += b6*a0; d0[j+7] += b7*a0;
d1[j+0] += b0*a1; d1[j+1] += b1*a1; d1[j+2] += b2*a1; d1[j+3] += b3*a1;
d1[j+4] += b4*a1; d1[j+5] += b5*a1; d1[j+6] += b6*a1; d1[j+7] += b7*a1;
}
}
}
// p = 1..np-1
for (int p = 1; p < np; ++p)
{
const float* bp = b + p * NR;
const int aoff = p * 4 + RowBase;
const float a0 = a[aoff + 0];
const float a1 = a[aoff + 1];
for (int j = 0; j < NR; j += 8)
{
const float b0=bp[j+0], b1=bp[j+1], b2=bp[j+2], b3=bp[j+3];
const float b4=bp[j+4], b5=bp[j+5], b6=bp[j+6], b7=bp[j+7];
d0[j+0] += b0*a0; d0[j+1] += b1*a0; d0[j+2] += b2*a0; d0[j+3] += b3*a0;
d0[j+4] += b4*a0; d0[j+5] += b5*a0; d0[j+6] += b6*a0; d0[j+7] += b7*a0;
d1[j+0] += b0*a1; d1[j+1] += b1*a1; d1[j+2] += b2*a1; d1[j+3] += b3*a1;
d1[j+4] += b4*a1; d1[j+5] += b5*a1; d1[j+6] += b6*a1; d1[j+7] += b7*a1;
}
}
}
// MR == 4, outLen == 24, scalar (no SIMD128)
static inline void convBlockNoSIMD4x24(
int np,
const float* a,
const float* b,
float* c, int ldc,
bool init_c,
int convNR)
{
CV_Assert(np > 0 && convNR == 24);
conv2xNR<0, 24>(np, a, b, c, ldc, init_c); // rows 0 & 1
conv2xNR<2, 24>(np, a, b, c, ldc, init_c); // rows 2 & 3
}
#endif
static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
const int convMR, const int convNR)
{
@ -2107,8 +2186,17 @@ void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bo
return;
}
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#elif defined(__EMSCRIPTEN__)
CV_Assert(convMR == 4);
if (outLen == 24 && convNR == 24)
{
convBlockNoSIMD4x24(np, a, b, c, ldc, init_c, convNR);
return;
}
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#else
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
return;
#endif
}
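For reference, the 4x24 micro-kernel above computes the same result as this plain scalar loop (a sketch, assuming the packed layouts implied by the code: `a` stored in groups of convMR == 4 rows, `b` in groups of convNR == 24 columns):

static void convBlock4x24_reference(int np, const float* a, const float* b,
                                    float* c, int ldc, bool init_c)
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 24; ++j)
        {
            float s = init_c ? 0.f : c[i * ldc + j];
            for (int p = 0; p < np; ++p)
                s += a[p * 4 + i] * b[p * 24 + j];  // packed A and B blocks
            c[i * ldc + j] = s;
        }
}

Splitting the four rows into two conv2xNR instantiations limits the number of simultaneously live accumulators per iteration, which can help the scalar WASM code generator recover some of the throughput a SIMD kernel would provide.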

View File

@ -66,6 +66,7 @@ file(GLOB gapi_ext_hdrs
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/streaming/onevpl/*.hpp"
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/plaidml/*.hpp"
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/util/*.hpp"
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/pysrc/*.hpp"
)
set(gapi_srcs
@ -242,6 +243,9 @@ set(gapi_srcs
src/streaming/gstreamer/gstreamer_media_adapter.cpp
src/streaming/gstreamer/gstreamerenv.cpp
# Python Custom Stream source
src/pysrc/python_stream_source.cpp
# Utils (ITT tracing)
src/utils/itt.cpp
)

View File

@ -0,0 +1,65 @@
#ifndef OPENCV_GAPI_PYSRC_PYTHONSTREAMSOURCE_HPP
#define OPENCV_GAPI_PYSRC_PYTHONSTREAMSOURCE_HPP
#include <opencv2/gapi/streaming/source.hpp>
#include <opencv2/core.hpp>
namespace cv {
namespace gapi {
namespace wip {
/**
* @brief Creates a G-API IStreamSource that delegates to a Python-defined source.
*
* This factory function wraps a Python object (for example, an instance of a class
* implementing a `pull()` and a `descr_of()` method) into a `cv::gapi::wip::IStreamSource`,
* enabling it to be used within a G-API computation graph. The OpenCV Python bindings
* automatically convert the PyObject into a `cv::Ptr<IStreamSource>`.
*
* @param src
* A `cv::Ptr<IStreamSource>` that internally holds the original Python object.
*
* @return
* A `cv::Ptr<IStreamSource>` that wraps the provided Python object. On each frame pull,
* G-API will:
* - Acquire the Python GIL
* - Call the Python object's `pull()` method
* - Convert the resulting NumPy array to a `cv::Mat`
* - Pass the `cv::Mat` into the G-API pipeline
*
* @note
* In Python, you can use the returned `make_py_src` as follows:
*
* @code{.py}
* class MyClass:
*     def __init__(self):
*         pass  # Initialize your source
*     def pull(self):
*         # Return the next frame as a numpy.ndarray, or None for end-of-stream
*         return np.ones((480, 640, 3), np.uint8)
*     def descr_of(self):
*         # Return a numpy.ndarray that describes the format of the frames
*         return np.zeros((480, 640, 3), np.uint8)
*
* # Create a G-API source from a Python class
* py_src = cv.gapi.wip.make_py_src(MyClass())
*
* # Define a simple graph: input -> copy -> output
* g_in = cv.GMat()
* g_out = cv.gapi.copy(g_in)
* graph = cv.GComputation(g_in, g_out)
*
* # Compile the pipeline for streaming and assign the source
* pipeline = graph.compileStreaming()
* pipeline.setSource([py_src])
* pipeline.start()
* @endcode
*/
CV_EXPORTS_W cv::Ptr<cv::gapi::wip::IStreamSource>
make_py_src(const cv::Ptr<cv::gapi::wip::IStreamSource>& src);
} // namespace wip
} // namespace gapi
} // namespace cv
#endif // OPENCV_GAPI_PYSRC_PYTHONSTREAMSOURCE_HPP

View File

@ -1162,6 +1162,161 @@ bool pyopencv_to(PyObject* obj, cv::GProtoOutputArgs& value, const ArgInfo& info
}
}
namespace cv {
namespace gapi {
namespace wip {
/**
* @class PythonCustomStreamSource
* @brief Wraps a Python-defined frame source as an IStreamSource for G-API.
*
* This class allows a G-API pipeline to pull frames from a Python object. The Python object
* must implement a `pull()` method that returns a `numpy.ndarray` containing
* the next frame, or `None` to signal end-of-stream. It can also implement a `descr_of()`
* method that returns a `numpy.ndarray` describing the format
* (data type, number of channels, height, width) of the frames produced.
*/
class PythonCustomStreamSource : public IStreamSource
{
public:
PythonCustomStreamSource(PyObject* _obj = nullptr) : obj(_obj)
{
if (obj)
Py_INCREF(obj);
}
~PythonCustomStreamSource()
{
if (obj)
Py_DECREF(obj);
}
bool pull(cv::gapi::wip::Data& data) CV_OVERRIDE
{
if (!obj)
return false;
PyGILState_STATE gstate = PyGILState_Ensure();
// PyObject_CallMethod avoids leaking the temporary method-name object that
// PyObject_CallMethodObjArgs(obj, PyUnicode_FromString("pull"), NULL) would create
PyObject* result = PyObject_CallMethod(obj, "pull", nullptr);
bool hasPyPullError = PyErr_Occurred() != nullptr;
if (!result)
{
PyErr_Print();
PyGILState_Release(gstate);
CV_Error(cv::Error::StsError, "PythonCustomStreamSource::pull(): call to .pull() failed");
}
if (result == Py_None)
{
Py_DECREF(result);
PyGILState_Release(gstate);
return false;
}
if (!PyArray_Check(result))
{
PyErr_Format(PyExc_TypeError, "Expected numpy.ndarray from .pull()");
PyErr_Print();
Py_DECREF(result);
PyGILState_Release(gstate);
CV_Error(cv::Error::StsError, "PythonCustomStreamSource::pull(): .pull() did not return a numpy.ndarray");
}
cv::Mat mat;
ArgInfo info("pull return", 0);
if (!pyopencv_to(result, mat, info) || PyErr_Occurred())
{
PyErr_Print();
Py_DECREF(result);
PyGILState_Release(gstate);
CV_Error(cv::Error::StsError, "PythonCustomStreamSource::pull(): failed to convert numpy to cv::Mat");
}
if (mat.empty())
{
Py_DECREF(result);
PyGILState_Release(gstate);
return false;
}
data = mat;
Py_DECREF(result);
PyGILState_Release(gstate);
if (hasPyPullError)
CV_Error(cv::Error::StsError, "Python .pull() call error");
return true;
}
GMetaArg descr_of() const CV_OVERRIDE
{
if (!obj)
return cv::GMetaArg(cv::GFrameDesc{cv::MediaFormat::BGR, cv::Size(640, 480)});
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* result = PyObject_CallMethod(obj, "descr_of", nullptr);
if (!result)
{
PyErr_Print();
PyGILState_Release(gstate);
CV_Error(cv::Error::StsError, "PythonCustomStreamSource::descr_of(): conversion error");
}
if (!PyArray_Check(result)) {
PyErr_Format(PyExc_TypeError, "Expected numpy.ndarray from .descr_of()");
PyErr_Print();
Py_DECREF(result);
PyGILState_Release(gstate);
CV_Error(cv::Error::StsError, "PythonCustomStreamSource::descr_of(): did not return a numpy.ndarray");
}
cv::Mat mat;
ArgInfo info("descr_of return", 0);
if (!pyopencv_to(result, mat, info))
{
PyErr_Print();
Py_DECREF(result);
PyGILState_Release(gstate);
CV_Error(cv::Error::StsError, "PythonCustomStreamSource::descr_of(): conversion error");
}
Py_DECREF(result);
PyGILState_Release(gstate);
cv::GMatDesc mdesc = cv::descr_of(mat);
return cv::GMetaArg(mdesc);
}
private:
PyObject* obj;
};
inline cv::Ptr<IStreamSource> make_pysrc_from_pyobject(PyObject* obj)
{
return cv::makePtr<PythonCustomStreamSource>(obj);
}
} // namespace wip
} // namespace gapi
} // namespace cv
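// For orientation only (not part of the patch): a sketch of how a source such
// as PythonCustomStreamSource is drained on the C++ side. The real streaming
// executor adds queueing and metadata checks around this loop.
static inline void drainPySource(const cv::Ptr<cv::gapi::wip::IStreamSource>& src)
{
    cv::gapi::wip::Data frame;
    while (src->pull(frame))  // false signals end-of-stream (Python returned None)
    {
        // frame now holds the cv::Mat produced by the Python pull() call
    }
}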
template<>
bool pyopencv_to(PyObject* obj, cv::Ptr<cv::gapi::wip::IStreamSource>& p, const ArgInfo&)
{
if (!obj)
return false;
p = cv::makePtr<cv::gapi::wip::PythonCustomStreamSource>(obj);
return true;
}
// extend cv.gapi methods
#define PYOPENCV_EXTRA_METHODS_GAPI \
{"kernels", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_kernels), "kernels(...) -> GKernelPackage"}, \

View File

@ -539,7 +539,38 @@ try:
self.assertEqual(0.0, cv.norm(convertNV12p2BGR(expected1), actual1, cv.NORM_INF))
self.assertEqual(0.0, cv.norm(convertNV12p2BGR(expected2), actual2, cv.NORM_INF))
def test_python_custom_stream_source(self):
class MySource:
def __init__(self):
self.count = 0
def pull(self):
if self.count >= 3:
return None
self.count += 1
return np.ones((10, 10, 3), np.uint8) * self.count
def descr_of(self):
return np.zeros((10, 10, 3), np.uint8)
g_in = cv.GMat()
g_out = cv.gapi.copy(g_in)
c = cv.GComputation(g_in, g_out)
comp = c.compileStreaming()
src = cv.gapi.wip.make_py_src(MySource())
comp.setSource([src])
comp.start()
frames = []
while True:
has_frame, frame = comp.pull()
if not has_frame:
break
frames.append(frame)
self.assertEqual(len(frames), 3)
except unittest.SkipTest as e:

View File

@ -141,5 +141,4 @@ cv::GRunArg cv::gapi::bind(cv::GRunArgP &out)
GAPI_Error("This value type is UNKNOWN!");
break;
}
return cv::GRunArg();
}

View File

@ -206,22 +206,20 @@ IOStream& operator<< (IOStream& os, const cv::RMat& mat) {
mat.serialize(os);
return os;
}
IIStream& operator>> (IIStream& is, cv::RMat&) {
IIStream& operator>> (IIStream&, cv::RMat&) {
util::throw_error(std::logic_error("operator>> for RMat should never be called. "
"Instead, cv::gapi::deserialize<cv::GRunArgs, AdapterTypes...>() "
"should be used"));
return is;
}
IOStream& operator<< (IOStream& os, const cv::MediaFrame &frame) {
frame.serialize(os);
return os;
}
IIStream& operator>> (IIStream& is, cv::MediaFrame &) {
IIStream& operator>> (IIStream&, cv::MediaFrame &) {
util::throw_error(std::logic_error("operator>> for MediaFrame should never be called. "
"Instead, cv::gapi::deserialize<cv::GRunArgs, AdapterTypes...>() "
"should be used"));
return is;
}
namespace
@ -395,27 +393,23 @@ IOStream& operator<< (IOStream& os, const cv::GArrayDesc &) {return os;}
IIStream& operator>> (IIStream& is, cv::GArrayDesc &) {return is;}
#if !defined(GAPI_STANDALONE)
IOStream& operator<< (IOStream& os, const cv::UMat &)
IOStream& operator<< (IOStream&, const cv::UMat &)
{
GAPI_Error("Serialization: Unsupported << for UMat");
return os;
}
IIStream& operator >> (IIStream& is, cv::UMat &)
IIStream& operator >> (IIStream&, cv::UMat &)
{
GAPI_Error("Serialization: Unsupported >> for UMat");
return is;
}
#endif // !defined(GAPI_STANDALONE)
IOStream& operator<< (IOStream& os, const cv::gapi::wip::IStreamSource::Ptr &)
IOStream& operator<< (IOStream&, const cv::gapi::wip::IStreamSource::Ptr &)
{
GAPI_Error("Serialization: Unsupported << for IStreamSource::Ptr");
return os;
}
IIStream& operator >> (IIStream& is, cv::gapi::wip::IStreamSource::Ptr &)
IIStream& operator >> (IIStream&, cv::gapi::wip::IStreamSource::Ptr &)
{
GAPI_Assert("Serialization: Unsupported >> for IStreamSource::Ptr");
return is;
GAPI_Error("Serialization: Unsupported >> for IStreamSource::Ptr");
}
namespace

View File

@ -313,7 +313,7 @@ static int maxLineConsumption(const cv::GFluidKernel::Kind kind, int window, int
}
} break;
case cv::GFluidKernel::Kind::YUV420toRGB: return inPort == 0 ? 2 : 1; break;
default: GAPI_Error("InternalError"); return 0;
default: GAPI_Error("InternalError");
}
}
@ -325,7 +325,7 @@ static int borderSize(const cv::GFluidKernel::Kind kind, int window)
// Resize never reads from border pixels
case cv::GFluidKernel::Kind::Resize: return 0; break;
case cv::GFluidKernel::Kind::YUV420toRGB: return 0; break;
default: GAPI_Error("InternalError"); return 0;
default: GAPI_Error("InternalError");
}
}

View File

@ -90,7 +90,7 @@ void fillBorderConstant(int borderSize, cv::Scalar borderValue, cv::Mat& mat)
case CV_16S: return &fillConstBorderRow< int16_t>; break;
case CV_16U: return &fillConstBorderRow<uint16_t>; break;
case CV_32F: return &fillConstBorderRow< float >; break;
default: GAPI_Error("InternalError"); return &fillConstBorderRow<uint8_t>;
default: GAPI_Error("InternalError");
}
};

View File

@ -341,7 +341,6 @@ void cv::gimpl::GCompiler::validateInputMeta()
default:
GAPI_Error("InternalError");
}
return false; // should never happen
};
GAPI_LOG_DEBUG(nullptr, "Total count: " << m_metas.size());

View File

@ -1830,7 +1830,6 @@ bool cv::gimpl::GStreamingExecutor::pull(cv::GRunArgsP &&outs)
default:
GAPI_Error("Unsupported cmd type in pull");
}
GAPI_Error("Unreachable code");
}
bool cv::gimpl::GStreamingExecutor::pull(cv::GOptRunArgsP &&outs)

View File

@ -0,0 +1,17 @@
#include <opencv2/gapi/pysrc/python_stream_source.hpp>
#include <opencv2/gapi/streaming/source.hpp>
#include <opencv2/core/utils/logger.hpp>
#include <opencv2/core.hpp>
namespace cv {
namespace gapi {
namespace wip {
cv::Ptr<cv::gapi::wip::IStreamSource> make_py_src(const cv::Ptr<cv::gapi::wip::IStreamSource>& src)
{
return src;
}
} // namespace wip
} // namespace gapi
} // namespace cv

View File

@ -327,7 +327,6 @@ public:
if (m_curr_frame_id % m_throw_every_nth_frame == 0) {
throw std::logic_error(InvalidSource::exception_msg());
return true;
} else {
d = cv::Mat(m_mat);
}

View File

@ -125,7 +125,7 @@ elseif(HAVE_QT)
endif()
foreach(dt_dep ${qt_deps})
add_definitions(${Qt${QT_VERSION_MAJOR}${dt_dep}_DEFINITIONS})
link_libraries(${Qt${QT_VERSION_MAJOR}${dt_dep}})
include_directories(${Qt${QT_VERSION_MAJOR}${dt_dep}_INCLUDE_DIRS})
list(APPEND HIGHGUI_LIBRARIES ${Qt${QT_VERSION_MAJOR}${dt_dep}_LIBRARIES})
endforeach()

View File

@ -1115,9 +1115,9 @@ const std::string cv::currentUIFramework()
return std::string("COCOA");
#elif defined (HAVE_WAYLAND)
return std::string("WAYLAND");
#else
return std::string();
#endif
return std::string();
}
// Without OpenGL

View File

@ -425,8 +425,18 @@ bool GdalDecoder::readData( Mat& img ){
// create a temporary scanline pointer to store data
double* scanline = new double[nCols];
#if GDAL_VERSION_NUM < GDAL_COMPUTE_VERSION(3,3,0)
// FITS drivers on version GDAL prior to v3.3.0 return vertically mirrored results.
// See https://github.com/OSGeo/gdal/pull/3520
// See https://github.com/OSGeo/gdal/commit/ef0f86696d163e065943b27f50dcff77790a1311
const bool isNeedVerticallyFlip = strncmp(m_dataset->GetDriverName(), "FITS", 4) == 0;
#else
const bool isNeedVerticallyFlip = false;
#endif
// iterate over each row and column
for( int y=0; y<nRows; y++ ){
for( int y=0; y<nRows; y++ ){ // for GDAL
const int yCv = isNeedVerticallyFlip ? (nRows - 1) - y : y ; // for OpenCV
// get the entire row
CPLErr err = band->RasterIO( GF_Read, 0, y, nCols, 1, scanline, nCols, 1, GDT_Float64, 0, 0);
@ -438,10 +448,10 @@ bool GdalDecoder::readData( Mat& img ){
// set depending on image types
// given boost, I would use enable_if to speed up. Avoid for now.
if( hasColorTable == false ){
write_pixel( scanline[x], gdalType, nChannels, img, y, x, color );
write_pixel( scanline[x], gdalType, nChannels, img, yCv, x, color );
}
else{
write_ctable_pixel( scanline[x], gdalType, gdalColorTable, img, y, x, color );
write_ctable_pixel( scanline[x], gdalType, gdalColorTable, img, yCv, x, color );
}
}
}

View File

@ -52,10 +52,17 @@
#include "grfmt_tiff.hpp"
#include <limits>
#include <string>
#include <cstdarg>
#include "tiff.h"
#include "tiffio.h"
#ifdef TIFFLIB_AT_LEAST
#if TIFFLIB_AT_LEAST(4, 5, 0)
#define OCV_HAVE_TIFF_OPEN_OPTIONS
#endif
#endif
namespace cv
{
@ -78,30 +85,116 @@ static void cv_tiffCloseHandle(void* handle)
TIFFClose((TIFF*)handle);
}
static std::string vformat(const char* fmt, va_list ap)
{
if (!fmt)
return {};
va_list ap_copy;
va_copy(ap_copy, ap);
const int len = std::vsnprintf(nullptr, 0, fmt, ap_copy);
va_end(ap_copy);
if (len < 0)
return fmt;
std::string buf(static_cast<size_t>(len) + 1, '\0');
std::vsnprintf(&buf[0], buf.size(), fmt, ap);
buf.pop_back();
return buf;
}
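// The helper above uses the standard two-pass vsnprintf idiom: the first call,
// given a null buffer, only measures the required length; the second formats
// into a correctly sized buffer. A hypothetical variadic wrapper (illustrative,
// not part of this patch) would look like:
static std::string formatMessage(const char* fmt, ...)
{
    va_list ap;
    va_start(ap, fmt);
    std::string s = vformat(fmt, ap);
    va_end(ap);
    return s;
}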
static std::string formatTiffMessage(const char* module, const char* fmt, va_list ap)
{
std::stringstream ss;
if (module && module[0] != '\0')
ss << module << ": ";
ss << cv::vformat(fmt, ap);
return ss.str();
}
static int TIFF_Error(TIFF *, void *, const char* module, const char* fmt, va_list ap)
{
CV_LOG_ERROR(NULL, formatTiffMessage(module, fmt, ap));
return 1;
}
static int TIFF_Warning(TIFF *, void *, const char* module, const char* fmt, va_list ap)
{
CV_LOG_WARNING(NULL, formatTiffMessage(module, fmt, ap));
return 1;
}
#ifdef OCV_HAVE_TIFF_OPEN_OPTIONS
static TIFFOpenOptions* cv_tiffCreateOptions()
{
auto opts = TIFFOpenOptionsAlloc();
TIFFOpenOptionsSetErrorHandlerExtR(opts, &TIFF_Error, nullptr);
TIFFOpenOptionsSetWarningHandlerExtR(opts, &TIFF_Warning, nullptr);
#if TIFFLIB_AT_LEAST(4, 7, 1)
TIFFOpenOptionsSetWarnAboutUnknownTags(opts, 1);
#endif
return opts;
}
#endif
static TIFF* cv_tiffOpen(const char* filename, const char* mode)
{
#ifdef OCV_HAVE_TIFF_OPEN_OPTIONS
auto opts = cv_tiffCreateOptions();
auto tiff = TIFFOpenExt(filename, mode, opts);
TIFFOpenOptionsFree(opts);
return tiff;
#else
return TIFFOpen(filename, mode);
#endif
}
static TIFF* cv_tiffClientOpen(const char* name, const char* mode, thandle_t clientdata,
TIFFReadWriteProc readproc, TIFFReadWriteProc writeproc,
TIFFSeekProc seekproc, TIFFCloseProc closeproc,
TIFFSizeProc sizeproc, TIFFMapFileProc mapproc,
TIFFUnmapFileProc unmapproc)
{
#ifdef OCV_HAVE_TIFF_OPEN_OPTIONS
auto opts = cv_tiffCreateOptions();
auto tiff = TIFFClientOpenExt(name, mode, clientdata, readproc, writeproc,
seekproc, closeproc, sizeproc, mapproc, unmapproc, opts);
TIFFOpenOptionsFree(opts);
return tiff;
#else
return TIFFClientOpen(name, mode, clientdata, readproc, writeproc,
seekproc, closeproc, sizeproc, mapproc, unmapproc);
#endif
}
#ifndef OCV_HAVE_TIFF_OPEN_OPTIONS
static void cv_tiffErrorHandler(const char* module, const char* fmt, va_list ap)
{
if (cv::utils::logging::getLogLevel() < cv::utils::logging::LOG_LEVEL_DEBUG)
return;
// TODO cv::vformat() with va_list parameter
fprintf(stderr, "OpenCV TIFF: ");
if (module != NULL)
fprintf(stderr, "%s: ", module);
fprintf(stderr, "Warning, ");
vfprintf(stderr, fmt, ap);
fprintf(stderr, ".\n");
(void) TIFF_Error(nullptr, nullptr, module, fmt, ap);
}
static void cv_tiffWarningHandler(const char* module, const char* fmt, va_list ap)
{
(void) TIFF_Warning(nullptr, nullptr, module, fmt, ap);
}
static bool cv_tiffSetErrorHandler_()
{
TIFFSetErrorHandler(cv_tiffErrorHandler);
TIFFSetWarningHandler(cv_tiffErrorHandler);
TIFFSetWarningHandler(cv_tiffWarningHandler);
return true;
}
#endif
static bool cv_tiffSetErrorHandler()
{
#ifndef OCV_HAVE_TIFF_OPEN_OPTIONS
static bool v = cv_tiffSetErrorHandler_();
return v;
#else
return true;
#endif
}
static const char fmtSignTiffII[] = "II\x2a\x00";
@ -241,7 +334,7 @@ bool TiffDecoder::readHeader()
{
m_buf_pos = 0;
TiffDecoderBufHelper* buf_helper = new TiffDecoderBufHelper(this->m_buf, this->m_buf_pos);
tif = TIFFClientOpen( "", "r", reinterpret_cast<thandle_t>(buf_helper), &TiffDecoderBufHelper::read,
tif = cv_tiffClientOpen( "", "r", reinterpret_cast<thandle_t>(buf_helper), &TiffDecoderBufHelper::read,
&TiffDecoderBufHelper::write, &TiffDecoderBufHelper::seek,
&TiffDecoderBufHelper::close, &TiffDecoderBufHelper::size,
&TiffDecoderBufHelper::map, /*unmap=*/0 );
@ -250,7 +343,7 @@ bool TiffDecoder::readHeader()
}
else
{
tif = TIFFOpen(m_filename.c_str(), "r");
tif = cv_tiffOpen(m_filename.c_str(), "r");
}
if (tif)
m_tif.reset(tif, cv_tiffCloseHandle);
@ -1113,7 +1206,7 @@ public:
{
// do NOT put "wb" as the mode, because the b means "big endian" mode, not "binary" mode.
// http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
return TIFFClientOpen( "", "w", reinterpret_cast<thandle_t>(this), &TiffEncoderBufHelper::read,
return cv_tiffClientOpen( "", "w", reinterpret_cast<thandle_t>(this), &TiffEncoderBufHelper::read,
&TiffEncoderBufHelper::write, &TiffEncoderBufHelper::seek,
&TiffEncoderBufHelper::close, &TiffEncoderBufHelper::size,
/*map=*/0, /*unmap=*/0 );
@ -1210,7 +1303,7 @@ bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vect
}
else
{
tif = TIFFOpen(m_filename.c_str(), "w");
tif = cv_tiffOpen(m_filename.c_str(), "w");
}
if (!tif)
{

View File

@ -1282,6 +1282,17 @@ TEST(Imgcodecs_Tiff, read_bigtiff_images)
}
}
TEST(Imgcodecs_Tiff, read_junk) {
// This test exercises the TIFF error-handler integration.
// Error messages can be seen with OPENCV_LOG_LEVEL=DEBUG.
const char junk[] = "II\x2a\x00\x08\x00\x00\x00\x00\x00\x00\x00";
cv::Mat junkInputArray(1, sizeof(junk) - 1, CV_8UC1, (void*)junk);
cv::Mat img;
ASSERT_NO_THROW(img = cv::imdecode(junkInputArray, IMREAD_UNCHANGED));
ASSERT_TRUE(img.empty());
}
#endif
}} // namespace

View File

@ -62,25 +62,8 @@ static void findCircle3pts(Point2f *pts, Point2f &center, float &radius)
float det = v1.x * v2.y - v1.y * v2.x;
if (fabs(det) <= EPS)
{
// v1 and v2 are colinear, so the longest distance between any 2 points
// is the diameter of the minimum enclosing circle.
float d1 = normL2Sqr<float>(pts[0] - pts[1]);
float d2 = normL2Sqr<float>(pts[0] - pts[2]);
float d3 = normL2Sqr<float>(pts[1] - pts[2]);
radius = sqrt(std::max(d1, std::max(d2, d3))) * 0.5f + EPS;
if (d1 >= d2 && d1 >= d3)
{
center = (pts[0] + pts[1]) * 0.5f;
}
else if (d2 >= d1 && d2 >= d3)
{
center = (pts[0] + pts[2]) * 0.5f;
}
else
{
CV_DbgAssert(d3 >= d1 && d3 >= d2);
center = (pts[1] + pts[2]) * 0.5f;
}
// The triangle is degenerate (the points are collinear), so this reduces to the
// 2-point case, which was already handled in the previous step.
return;
}
float cx = (c1 * v2.y - c2 * v1.y) / det;
@ -115,13 +98,7 @@ static void findThirdPoint(const PT *pts, int i, int j, Point2f &center, float &
ptsf[0] = (Point2f)pts[i];
ptsf[1] = (Point2f)pts[j];
ptsf[2] = (Point2f)pts[k];
Point2f new_center; float new_radius = 0;
findCircle3pts(ptsf, new_center, new_radius);
if (new_radius > 0)
{
radius = new_radius;
center = new_center;
}
findCircle3pts(ptsf, center, radius);
}
}
}
@ -146,13 +123,7 @@ void findSecondPoint(const PT *pts, int i, Point2f &center, float &radius)
}
else
{
Point2f new_center; float new_radius = 0;
findThirdPoint(pts, i, j, new_center, new_radius);
if (new_radius > 0)
{
radius = new_radius;
center = new_center;
}
findThirdPoint(pts, i, j, center, radius);
}
}
}
@ -178,13 +149,7 @@ static void findMinEnclosingCircle(const PT *pts, int count, Point2f &center, fl
}
else
{
Point2f new_center; float new_radius = 0;
findSecondPoint(pts, i, new_center, new_radius);
if (new_radius > 0)
{
radius = new_radius;
center = new_center;
}
findSecondPoint(pts, i, center, radius);
}
}
}
@ -210,61 +175,42 @@ void cv::minEnclosingCircle( InputArray _points, Point2f& _center, float& _radiu
const Point* ptsi = points.ptr<Point>();
const Point2f* ptsf = points.ptr<Point2f>();
switch (count)
if( count == 1 )
{
case 1:
{
_center = (is_float) ? ptsf[0] : Point2f((float)ptsi[0].x, (float)ptsi[0].y);
_radius = EPS;
break;
}
case 2:
{
Point2f p1 = (is_float) ? ptsf[0] : Point2f((float)ptsi[0].x, (float)ptsi[0].y);
Point2f p2 = (is_float) ? ptsf[1] : Point2f((float)ptsi[1].x, (float)ptsi[1].y);
_center.x = (p1.x + p2.x) / 2.0f;
_center.y = (p1.y + p2.y) / 2.0f;
_radius = (float)(norm(p1 - p2) / 2.0) + EPS;
break;
}
default:
{
Point2f center;
float radius = 0.f;
if (is_float)
_center = (is_float) ? ptsf[0] : Point2f((float)ptsi[0].x, (float)ptsi[0].y);
_radius = EPS;
return;
}
if (is_float)
{
findMinEnclosingCircle<Point2f>(ptsf, count, _center, _radius);
#if 0
for (int m = 0; m < count; ++m)
{
findMinEnclosingCircle<Point2f>(ptsf, count, center, radius);
#if 0
for (size_t m = 0; m < count; ++m)
{
float d = (float)norm(ptsf[m] - center);
if (d > radius)
{
printf("error!\n");
}
}
#endif
float d = (float)norm(ptsf[m] - _center);
if (d > _radius)
{
printf("error!\n");
}
}
else
#endif
}
else
{
findMinEnclosingCircle<Point>(ptsi, count, _center, _radius);
#if 0
for (int m = 0; m < count; ++m)
{
findMinEnclosingCircle<Point>(ptsi, count, center, radius);
#if 0
for (size_t m = 0; m < count; ++m)
{
double dx = ptsi[m].x - center.x;
double dy = ptsi[m].y - center.y;
double d = std::sqrt(dx * dx + dy * dy);
if (d > radius)
{
printf("error!\n");
}
}
#endif
double dx = ptsi[m].x - _center.x;
double dy = ptsi[m].y - _center.y;
double d = std::sqrt(dx * dx + dy * dy);
if (d > _radius)
{
printf("error!\n");
}
}
_center = center;
_radius = radius;
break;
}
#endif
}
}

View File

@ -52,31 +52,48 @@ namespace opencv_test { namespace {
TEST(minEnclosingCircle, basic_test)
{
vector<Point2f> pts;
pts.push_back(Point2f(0, 0));
pts.push_back(Point2f(10, 0));
pts.push_back(Point2f(5, 1));
const float EPS = 1.0e-3f;
Point2f center;
float radius;
{
const vector<Point2f> pts = { {5, 10} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 10, EPS);
EXPECT_NEAR(radius, 0, EPS);
}
{
const vector<Point2f> pts = { {5, 10}, {11, 18} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 8, EPS);
EXPECT_NEAR(center.y, 14, EPS);
EXPECT_NEAR(radius, 5, EPS);
}
// pts[2] is within the circle with diameter pts[0] - pts[1].
// 2
// 0 1
// NB: The triangle is obtuse, so only pts[0] and pts[1] lie on the circle.
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
{
const vector<Point2f> pts = { {0, 0}, {10, 0}, {5, 1} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
}
// pts[2] is on the circle with diameter pts[0] - pts[1].
// 2
// 0 1
pts[2] = Point2f(5, 5);
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
{
const vector<Point2f> pts = { {0, 0}, {10, 0}, {5, 5} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
}
// pts[2] is outside the circle with diameter pts[0] - pts[1].
// 2
@ -84,32 +101,40 @@ TEST(minEnclosingCircle, basic_test)
//
// 0 1
// NB: The triangle is acute, so all 3 points are on the circle.
pts[2] = Point2f(5, 10);
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 3.75, EPS);
EXPECT_NEAR(6.25f, radius, EPS);
{
const vector<Point2f> pts = { {0, 0}, {10, 0}, {5, 10} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 3.75, EPS);
EXPECT_NEAR(6.25f, radius, EPS);
}
// The 3 points are collinear.
pts[2] = Point2f(3, 0);
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
{
const vector<Point2f> pts = { {0, 0}, {10, 0}, {3, 0} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
}
// 2 points are the same.
pts[2] = pts[1];
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
{
const vector<Point2f> pts = { {0, 0}, {10, 0}, {10, 0} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 5, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(5, radius, EPS);
}
// 3 points are the same.
pts[0] = pts[1];
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 10, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(0, radius, EPS);
{
const vector<Point2f> pts = { {10, 0}, {10, 0}, {10, 0} };
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 10, EPS);
EXPECT_NEAR(center.y, 0, EPS);
EXPECT_NEAR(0, radius, EPS);
}
}
TEST(Imgproc_minEnclosingCircle, regression_16051) {
@ -127,6 +152,44 @@ TEST(Imgproc_minEnclosingCircle, regression_16051) {
EXPECT_NEAR(2.1024551f, radius, 1e-3);
}
TEST(Imgproc_minEnclosingCircle, regression_27891) {
{
const vector<Point2f> pts = { {219, 301}, {639, 635}, {740, 569}, {740, 569}, {309, 123}, {349, 88} };
Point2f center;
float radius;
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 522.476f, 1e-3f);
EXPECT_NEAR(center.y, 346.4029f, 1e-3f);
EXPECT_NEAR(radius, 311.2331f, 1e-3f);
}
{
const vector<Point2f> pts = { {219, 301}, {639, 635}, {740, 569}, {740, 569}, {349, 88} };
Point2f center;
float radius;
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 522.476f, 1e-3f);
EXPECT_NEAR(center.y, 346.4029f, 1e-3f);
EXPECT_NEAR(radius, 311.2331f, 1e-3f);
}
{
const vector<Point2f> pts = { {639, 635}, {740, 569}, {740, 569}, {349, 88} };
Point2f center;
float radius;
minEnclosingCircle(pts, center, radius);
EXPECT_NEAR(center.x, 522.476f, 1e-3f);
EXPECT_NEAR(center.y, 346.4029f, 1e-3f);
EXPECT_NEAR(radius, 311.2331f, 1e-3f);
}
}
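// A generic property check (a sketch, mirroring the #if 0 verification loops in
// the implementation diff above): every input point must lie inside the
// returned circle, up to a small tolerance.
static void checkCircleEnclosesPoints(const std::vector<Point2f>& pts,
                                      const Point2f& center, float radius)
{
    for (const Point2f& p : pts)
        EXPECT_LE(norm(p - center), (double)radius + 1e-3);
}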
PARAM_TEST_CASE(ConvexityDefects_regression_5908, bool, int)
{
public:

View File

@ -396,8 +396,21 @@ public:
{
Mat _res, _nr, _d;
tr.findNearest(test_samples.row(i), k, Emax, _res, _nr, _d, noArray());
res.push_back(_res.t());
_results.assign(res);
if( _results.needed() )
{
res.push_back(_res.t());
_results.assign(res);
}
if( _neighborResponses.needed() )
{
nr.push_back(_nr.t());
_neighborResponses.assign(nr);
}
if( _dists.needed() )
{
d.push_back(_d.t());
_dists.assign(d);
}
}
return result; // currently always 0

View File

@ -39,11 +39,16 @@ TEST(ML_KNearest, accuracy)
{
SCOPED_TRACE("KDTree");
Mat neighborIndexes;
Mat neighborResponses;
Mat dists;
float err = 1000;
Ptr<KNearest> knn = KNearest::create();
knn->setAlgorithmType(KNearest::KDTREE);
knn->train(trainData, ml::ROW_SAMPLE, trainLabels);
knn->findNearest(testData, 4, neighborIndexes);
knn->findNearest(testData, 4, neighborIndexes, neighborResponses, dists);
EXPECT_EQ(neighborIndexes.size(), Size(4, pointsCount));
EXPECT_EQ(neighborResponses.size(), Size(4, pointsCount * 2));
EXPECT_EQ(dists.size(), Size(4, pointsCount));
Mat bestLabels;
// The output of the KDTree are the neighbor indexes, not actual class labels
// so we need to do some extra work to get actual predictions

View File

@ -5,3 +5,7 @@ if(HAVE_CUDA)
endif()
ocv_define_module(photo opencv_imgproc OPTIONAL opencv_cudaarithm opencv_cudaimgproc WRAP java objc python js)
if(HAVE_CUDA AND ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_opencv_cudaarithm AND HAVE_opencv_cudaimgproc)
ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
endif()

View File

@ -199,7 +199,7 @@ namespace cv { namespace cuda { namespace device
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
return { op, op };
}
};
template <> struct Unroll<2>
@ -218,7 +218,7 @@ namespace cv { namespace cuda { namespace device
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
return { op, op, op };
}
};
template <> struct Unroll<3>
@ -237,7 +237,7 @@ namespace cv { namespace cuda { namespace device
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
return { op, op, op, op };
}
};
template <> struct Unroll<4>

View File

@ -43,10 +43,15 @@
#include "precomp.hpp"
#include "opencv2/photo/cuda.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#if !defined (HAVE_CUDA) || !defined(HAVE_OPENCV_CUDAARITHM) || !defined(HAVE_OPENCV_CUDAIMGPROC)
#include "opencv2/core/private/cuda_stubs.hpp"
#else
#include "opencv2/core/private.cuda.hpp"
#endif
#ifdef HAVE_OPENCV_CUDAARITHM
# include "opencv2/cudaarithm.hpp"
#endif

View File

@ -11,3 +11,7 @@ endif()
ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_flann
OPTIONAL opencv_cudaarithm opencv_cudawarping opencv_cudafeatures2d opencv_cudalegacy opencv_cudaimgproc ${STITCHING_CONTRIB_DEPS}
WRAP python)
if(HAVE_CUDA AND ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
ocv_target_link_libraries(${the_module} PUBLIC "CUDA::cudart${CUDA_LIB_EXT}")
endif()

View File

@ -71,6 +71,21 @@ public:
*/
CV_WRAP virtual void apply(InputArray image, OutputArray fgmask, double learningRate=-1) = 0;
/** @brief Computes a foreground mask, taking an already-known foreground mask as an additional input.
@param image Next video frame. Floating point frame will be used without scaling and should be in range \f$[0,255]\f$.
@param fgmask The output foreground mask as an 8-bit binary image.
@param knownForegroundMask Optional mask marking pixels already known to be foreground; the model skips these pixels when updating and copies them into the output mask.
@param learningRate The value between 0 and 1 that indicates how fast the background model is
learnt. Negative parameter value makes the algorithm use some automatically chosen learning
rate. 0 means that the background model is not updated at all, 1 means that the background model
is completely reinitialized from the last frame.
@note This method has a default virtual implementation that throws a "not implemented" error.
Foreground masking may not be supported by all background subtractors.
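A short usage sketch (illustrative only; the frame here is a placeholder):
@code{.cpp}
Ptr<BackgroundSubtractorMOG2> bs = createBackgroundSubtractorMOG2();
Mat frame(480, 640, CV_8UC3, Scalar::all(127));   // placeholder video frame
Mat knownFG = Mat::zeros(frame.size(), CV_8UC1);
knownFG(Rect(0, 0, 64, 64)) = 255;                // pixels already known to be foreground
Mat fgmask;
bs->apply(frame, knownFG, fgmask, -1);            // masked pixels come back as 255
@endcode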
*/
CV_WRAP virtual void apply(InputArray image, InputArray knownForegroundMask, OutputArray fgmask, double learningRate=-1) = 0;
/** @brief Computes a background image.
@param backgroundImage The output background image.
@ -206,6 +221,18 @@ public:
is completely reinitialized from the last frame.
*/
CV_WRAP virtual void apply(InputArray image, OutputArray fgmask, double learningRate=-1) CV_OVERRIDE = 0;
/** @brief Computes a foreground mask and skips known foreground in evaluation.
@param image Next video frame. Floating point frame will be used without scaling and should be in range \f$[0,255]\f$.
@param fgmask The output foreground mask as an 8-bit binary image.
@param knownForegroundMask Optional mask marking pixels already known to be foreground; the model skips these pixels when updating and copies them into the output mask.
@param learningRate The value between 0 and 1 that indicates how fast the background model is
learnt. Negative parameter value makes the algorithm use some automatically chosen learning
rate. 0 means that the background model is not updated at all, 1 means that the background model
is completely reinitialized from the last frame.
*/
CV_WRAP virtual void apply(InputArray image, InputArray knownForegroundMask, OutputArray fgmask, double learningRate=-1) CV_OVERRIDE = 0;
};
/** @brief Creates MOG2 Background Subtractor

View File

@ -268,17 +268,43 @@ enum
MOTION_HOMOGRAPHY = 3
};
/** @brief Computes the Enhanced Correlation Coefficient value between two images @cite EP08 .
/**
@brief Computes the Enhanced Correlation Coefficient (ECC) value between two images
@param templateImage single-channel template image; CV_8U or CV_32F array.
@param inputImage single-channel input image to be warped to provide an image similar to
templateImage, same type as templateImage.
@param inputMask An optional mask to indicate valid values of inputImage.
The Enhanced Correlation Coefficient (ECC) is a normalized measure of similarity between two images @cite EP08.
The result lies in the range [-1, 1], where 1 corresponds to perfect similarity (up to an affine intensity change, i.e., gain and bias),
0 indicates no correlation, and -1 indicates perfect negative correlation.
@sa
findTransformECC
*/
For single-channel images, the ECC is defined as:
\f[
\mathrm{ECC}(I, T) = \frac{\sum_{x} (I(x) - \mu_I)(T(x) - \mu_T)}
{\sqrt{\sum_{x} (I(x) - \mu_I)^2} \cdot \sqrt{\sum_{x} (T(x) - \mu_T)^2}}
\f]
For multi-channel images (e.g., 3-channel RGB), the formula generalizes to:
\f[
\mathrm{ECC}(I, T) =
\frac{\sum_{x} \sum_{c=1}^{C} (I_c(x) - \mu_{I_c})(T_c(x) - \mu_{T_c})}
{\sqrt{\sum_{x} \sum_{c=1}^{C} (I_c(x) - \mu_{I_c})^2} \cdot
\sqrt{\sum_{x} \sum_{c=1}^{C} (T_c(x) - \mu_{T_c})^2}}
\f]
Where:
- \f$I_c(x), T_c(x)\f$ are the values of channel \f$c\f$ at spatial location \f$x\f$,
- \f$\mu_{I_c}, \mu_{T_c}\f$ are the mean values of channel \f$c\f$ over the masked region (if provided),
- \f$C\f$ is the number of channels (only 1 and 3 are currently supported),
- The sums run over all pixels \f$x\f$ in the image domain (optionally restricted by mask).
@param templateImage Input template image; must have either 1 or 3 channels and be of type CV_8U, CV_16U, CV_32F, or CV_64F.
@param inputImage Input image to be compared with the template; must have the same type and number of channels as templateImage.
@param inputMask Optional single-channel mask to specify the valid region of interest in inputImage and templateImage.
@return The ECC similarity coefficient in the range [-1, 1].
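A short usage sketch (file names are placeholders):
@code{.cpp}
Mat tmpl = imread("template.png", IMREAD_COLOR);   // 3-channel template
Mat img  = imread("input.png", IMREAD_COLOR);      // 3-channel input
double ecc = computeECC(tmpl, img);                // result in [-1, 1]
@endcode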
@sa findTransformECC
*/
CV_EXPORTS_W double computeECC(InputArray templateImage, InputArray inputImage, InputArray inputMask = noArray());
/** @example samples/cpp/image_alignment.cpp
@ -287,8 +313,8 @@ An example using the image alignment ECC algorithm
/** @brief Finds the geometric transform (warp) between two images in terms of the ECC criterion @cite EP08 .
@param templateImage single-channel template image; CV_8U or CV_32F array.
@param inputImage single-channel input image which should be warped with the final warpMatrix in
@param templateImage 1 or 3 channel template image; CV_8U, CV_16U, CV_32F, CV_64F type.
@param inputImage input image which should be warped with the final warpMatrix in
order to provide an image similar to templateImage, same type as templateImage.
@param warpMatrix floating-point \f$2\times 3\f$ or \f$3\times 3\f$ mapping matrix (warp).
@param motionType parameter, specifying the type of motion:
@ -305,7 +331,7 @@ order to provide an image similar to templateImage, same type as templateImage.
criteria.epsilon defines the threshold of the increment in the correlation coefficient between two
iterations (a negative criteria.epsilon makes criteria.maxcount the only termination criterion).
Default values are shown in the declaration above.
@param inputMask An optional mask to indicate valid values of inputImage.
@param inputMask An optional single-channel mask to indicate valid values of inputImage.
@param gaussFiltSize An optional value indicating size of gaussian blur filter; (DEFAULT: 5)
The function estimates the optimum transformation (warpMatrix) with respect to ECC criterion
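A short usage sketch of the multichannel path (illustrative values; tmpl and img as in
the computeECC example above):
@code{.cpp}
Mat warp = Mat::eye(2, 3, CV_32F);
findTransformECC(tmpl, img, warp, MOTION_AFFINE,
                 TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 50, 1e-4),
                 noArray(), 5);
Mat aligned;
warpAffine(img, aligned, warp, tmpl.size(), INTER_LINEAR + WARP_INVERSE_MAP);
@endcode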

View File

@ -1,24 +1,22 @@
#include "perf_precomp.hpp"
namespace opencv_test
{
namespace opencv_test {
using namespace perf;
CV_ENUM(MotionType, MOTION_TRANSLATION, MOTION_EUCLIDEAN, MOTION_AFFINE, MOTION_HOMOGRAPHY)
CV_ENUM(ReadFlag, IMREAD_GRAYSCALE, IMREAD_COLOR)
typedef tuple<MotionType> MotionType_t;
typedef perf::TestBaseWithParam<MotionType_t> TransformationType;
PERF_TEST_P(TransformationType, findTransformECC, /*testing::ValuesIn(MotionType::all())*/
testing::Values((int) MOTION_TRANSLATION, (int) MOTION_EUCLIDEAN,
(int) MOTION_AFFINE, (int) MOTION_HOMOGRAPHY)
)
{
Mat img = imread(getDataPath("cv/shared/fruits_ecc.png"),0);
Mat templateImage;
typedef std::tuple<MotionType, ReadFlag> TestParams;
typedef perf::TestBaseWithParam<TestParams> ECCPerfTest;
PERF_TEST_P(ECCPerfTest, findTransformECC,
testing::Combine(testing::Values(MOTION_TRANSLATION, MOTION_EUCLIDEAN, MOTION_AFFINE, MOTION_HOMOGRAPHY),
testing::Values(IMREAD_GRAYSCALE, IMREAD_COLOR))) {
int transform_type = get<0>(GetParam());
int readFlag = get<1>(GetParam());
Mat img = imread(getDataPath("cv/shared/fruits_ecc.png"), readFlag);
Mat templateImage;
Mat warpMat;
Mat warpGround;
@ -26,46 +24,38 @@ PERF_TEST_P(TransformationType, findTransformECC, /*testing::ValuesIn(MotionType
double angle;
switch (transform_type) {
case MOTION_TRANSLATION:
warpGround = (Mat_<float>(2,3) << 1.f, 0.f, 7.234f,
0.f, 1.f, 11.839f);
warpGround = (Mat_<float>(2, 3) << 1.f, 0.f, 7.234f, 0.f, 1.f, 11.839f);
warpAffine(img, templateImage, warpGround,
Size(200,200), INTER_LINEAR + WARP_INVERSE_MAP);
warpAffine(img, templateImage, warpGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
break;
case MOTION_EUCLIDEAN:
angle = CV_PI/30;
angle = CV_PI / 30;
warpGround = (Mat_<float>(2,3) << (float)cos(angle), (float)-sin(angle), 12.123f,
(float)sin(angle), (float)cos(angle), 14.789f);
warpAffine(img, templateImage, warpGround,
Size(200,200), INTER_LINEAR + WARP_INVERSE_MAP);
warpGround = (Mat_<float>(2, 3) << (float)cos(angle), (float)-sin(angle), 12.123f, (float)sin(angle),
(float)cos(angle), 14.789f);
warpAffine(img, templateImage, warpGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
break;
case MOTION_AFFINE:
warpGround = (Mat_<float>(2,3) << 0.98f, 0.03f, 15.523f,
-0.02f, 0.95f, 10.456f);
warpAffine(img, templateImage, warpGround,
Size(200,200), INTER_LINEAR + WARP_INVERSE_MAP);
warpGround = (Mat_<float>(2, 3) << 0.98f, 0.03f, 15.523f, -0.02f, 0.95f, 10.456f);
warpAffine(img, templateImage, warpGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
break;
case MOTION_HOMOGRAPHY:
warpGround = (Mat_<float>(3,3) << 0.98f, 0.03f, 15.523f,
-0.02f, 0.95f, 10.456f,
0.0002f, 0.0003f, 1.f);
warpPerspective(img, templateImage, warpGround,
Size(200,200), INTER_LINEAR + WARP_INVERSE_MAP);
warpGround = (Mat_<float>(3, 3) << 0.98f, 0.03f, 15.523f, -0.02f, 0.95f, 10.456f, 0.0002f, 0.0003f, 1.f);
warpPerspective(img, templateImage, warpGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
break;
}
TEST_CYCLE()
{
if (transform_type<3)
warpMat = Mat::eye(2,3, CV_32F);
TEST_CYCLE() {
if (transform_type < 3)
warpMat = Mat::eye(2, 3, CV_32F);
else
warpMat = Mat::eye(3,3, CV_32F);
warpMat = Mat::eye(3, 3, CV_32F);
findTransformECC(templateImage, img, warpMat, transform_type,
TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 5, -1));
TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 5, -1));
}
SANITY_CHECK(warpMat, 3e-3);
}
} // namespace
} // namespace opencv_test

View File

@ -132,6 +132,8 @@ public:
//! the update operator
void apply(InputArray image, OutputArray fgmask, double learningRate) CV_OVERRIDE;
void apply(InputArray image, InputArray knownForegroundMask, OutputArray fgmask, double learningRate) CV_OVERRIDE;
//! computes a background image which is the mean of all background Gaussians
virtual void getBackgroundImage(OutputArray backgroundImage) const CV_OVERRIDE;
@ -526,7 +528,9 @@ public:
int _nkNN,
float _fTau,
bool _bShadowDetection,
uchar _nShadowDetection)
uchar _nShadowDetection,
const Mat& _knownForegroundMask)
: knownForegroundMask(_knownForegroundMask)
{
src = &_src;
dst = &_dst;
@ -587,6 +591,17 @@ public:
m_nShortCounter,
include
);
// Check that foreground mask exists
if (!knownForegroundMask.empty()) {
// If input mask states pixel is foreground
if (knownForegroundMask.at<uchar>(y, x) > 0)
{
mask[x] = 255; // ensure output mask marks this pixel as FG
data += nchannels;
m_aModel += m_nN*3*ndata;
continue;
}
}
switch (result)
{
case 0:
@ -626,6 +641,7 @@ public:
int m_nkNN;
bool m_bShadowDetection;
uchar m_nShadowDetection;
const Mat& knownForegroundMask;
};
#ifdef HAVE_OPENCL
@ -728,7 +744,12 @@ void BackgroundSubtractorKNNImpl::create_ocl_apply_kernel()
#endif
void BackgroundSubtractorKNNImpl::apply(InputArray _image, OutputArray _fgmask, double learningRate)
// The 3-argument apply() forwards to the mask-aware 4-argument overload
void BackgroundSubtractorKNNImpl::apply(InputArray _image, OutputArray _fgmask, double learningRate) {
apply(_image, noArray(), _fgmask, learningRate);
}
void BackgroundSubtractorKNNImpl::apply(InputArray _image, InputArray _knownForegroundMask, OutputArray _fgmask, double learningRate)
{
CV_INSTRUMENT_REGION();
@ -757,6 +778,14 @@ void BackgroundSubtractorKNNImpl::apply(InputArray _image, OutputArray _fgmask,
_fgmask.create( image.size(), CV_8U );
Mat fgmask = _fgmask.getMat();
Mat knownForegroundMask = _knownForegroundMask.getMat();
if(!knownForegroundMask.empty())
{
CV_Assert(knownForegroundMask.type() == CV_8UC1);
CV_Assert(knownForegroundMask.size() == image.size());
}
++nframes;
learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./std::min( 2*nframes, history );
CV_Assert(learningRate >= 0);
@ -791,7 +820,8 @@ void BackgroundSubtractorKNNImpl::apply(InputArray _image, OutputArray _fgmask,
nkNN,
fTau,
bShadowDetection,
nShadowDetection),
nShadowDetection,
knownForegroundMask),
image.total()/(double)(1 << 16));
nShortCounter++;//0,1,...,nShortUpdate-1

View File

@ -178,6 +178,8 @@ public:
//! the update operator
void apply(InputArray image, OutputArray fgmask, double learningRate) CV_OVERRIDE;
void apply(InputArray image, InputArray knownForegroundMask, OutputArray fgmask, double learningRate) CV_OVERRIDE;
//! computes a background image which is the mean of all background Gaussians
virtual void getBackgroundImage(OutputArray backgroundImage) const CV_OVERRIDE;
@ -546,7 +548,8 @@ public:
float _Tb, float _TB, float _Tg,
float _varInit, float _varMin, float _varMax,
float _prune, float _tau, bool _detectShadows,
uchar _shadowVal)
uchar _shadowVal, const Mat& _knownForegroundMask)
: knownForegroundMask(_knownForegroundMask)
{
src = &_src;
dst = &_dst;
@ -590,6 +593,18 @@ public:
for( int x = 0; x < ncols; x++, data += nchannels, gmm += nmixtures, mean += nmixtures*nchannels )
{
// Check that foreground mask exists
if (!knownForegroundMask.empty())
{
// If input mask states pixel is foreground
if (knownForegroundMask.at<uchar>(y, x) > 0)
{
mask[x] = 255; // ensure output mask marks this pixel as FG
continue;
}
}
//calculate distances to the modes (+ sort)
//here we need to go in descending order!!!
bool background = false;//return value -> true - the pixel classified as background
@ -766,6 +781,7 @@ public:
bool detectShadows;
uchar shadowVal;
const Mat& knownForegroundMask;
};
#ifdef HAVE_OPENCL
@ -844,7 +860,12 @@ void BackgroundSubtractorMOG2Impl::create_ocl_apply_kernel()
#endif
void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask, double learningRate)
// The 3-argument apply() forwards to the mask-aware 4-argument overload
void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask, double learningRate) {
apply(_image, noArray(), _fgmask, learningRate);
}
void BackgroundSubtractorMOG2Impl::apply(InputArray _image, InputArray _knownForegroundMask, OutputArray _fgmask, double learningRate)
{
CV_INSTRUMENT_REGION();
@ -867,6 +888,14 @@ void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask,
_fgmask.create( image.size(), CV_8U );
Mat fgmask = _fgmask.getMat();
Mat knownForegroundMask = _knownForegroundMask.getMat();
if(!knownForegroundMask.empty())
{
CV_Assert(knownForegroundMask.type() == CV_8UC1);
CV_Assert(knownForegroundMask.size() == image.size());
}
++nframes;
learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./std::min( 2*nframes, history );
CV_Assert(learningRate >= 0);
@ -879,7 +908,7 @@ void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask,
(float)varThreshold,
backgroundRatio, varThresholdGen,
fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
bShadowDetection, nShadowDetection),
bShadowDetection, nShadowDetection, knownForegroundMask),
image.total()/(double)(1 << 16));
}

View File

@ -41,30 +41,24 @@
#include "precomp.hpp"
/****************************************************************************************\
* Image Alignment (ECC algorithm) *
\****************************************************************************************/
using namespace cv;
static void image_jacobian_homo_ECC(const Mat& src1, const Mat& src2,
const Mat& src3, const Mat& src4,
const Mat& src5, Mat& dst)
{
static void image_jacobian_homo_ECC(const Mat& src1, const Mat& src2, const Mat& src3, const Mat& src4, const Mat& src5,
Mat& dst) {
CV_Assert(src1.size() == src2.size());
CV_Assert(src1.size() == src3.size());
CV_Assert(src1.size() == src4.size());
CV_Assert( src1.rows == dst.rows);
CV_Assert(dst.cols == (src1.cols*8));
CV_Assert(dst.type() == CV_32FC1);
CV_Assert(src1.rows == dst.rows);
CV_Assert(dst.cols == (src1.cols * 8));
CV_Assert(dst.type() == CV_MAKETYPE(CV_32F, src1.channels()));
CV_Assert(src5.isContinuous());
const float* hptr = src5.ptr<float>(0);
const float h0_ = hptr[0];
@ -78,19 +72,23 @@ static void image_jacobian_homo_ECC(const Mat& src1, const Mat& src2,
const int w = src1.cols;
// create denominator for all points as a block
Mat den_;
addWeighted(src3, h2_, src4, h5_, 1.0, den_);
//create denominator for all points as a block
Mat den_ = src3*h2_ + src4*h5_ + 1.0;//check the time of this! otherwise use addWeighted
// create projected points
Mat hatX_, hatY_;
addWeighted(src3, h0_, src4, h3_, 0.0, hatX_);
hatX_ += h6_;
//create projected points
Mat hatX_ = -src3*h0_ - src4*h3_ - h6_;
divide(hatX_, den_, hatX_);
Mat hatY_ = -src3*h1_ - src4*h4_ - h7_;
divide(hatY_, den_, hatY_);
addWeighted(src3, h1_, src4, h4_, 0.0, hatY_);
hatY_ += h7_;
divide(-hatY_, den_, hatY_);
divide(-hatX_, den_, hatX_);
//instead of dividing each block with den,
//just pre-divide the block of gradients (it's more efficient)
// instead of dividing each block with den,
// just pre-divide the block of gradients (it's more efficient)
Mat src1Divided_;
Mat src2Divided_;
@ -98,114 +96,98 @@ static void image_jacobian_homo_ECC(const Mat& src1, const Mat& src2,
divide(src1, den_, src1Divided_);
divide(src2, den_, src2Divided_);
// compute Jacobian blocks (8 blocks)
//compute Jacobian blocks (8 blocks)
dst.colRange(0, w) = src1Divided_.mul(src3); // 1
dst.colRange(0, w) = src1Divided_.mul(src3);//1
dst.colRange(w, 2 * w) = src2Divided_.mul(src3); // 2
dst.colRange(w,2*w) = src2Divided_.mul(src3);//2
Mat temp_ = (hatX_.mul(src1Divided_)+hatY_.mul(src2Divided_));
dst.colRange(2*w,3*w) = temp_.mul(src3);//3
Mat temp_ = (hatX_.mul(src1Divided_) + hatY_.mul(src2Divided_));
dst.colRange(2 * w, 3 * w) = temp_.mul(src3); // 3
hatX_.release();
hatY_.release();
dst.colRange(3*w, 4*w) = src1Divided_.mul(src4);//4
dst.colRange(3 * w, 4 * w) = src1Divided_.mul(src4); // 4
dst.colRange(4*w, 5*w) = src2Divided_.mul(src4);//5
dst.colRange(4 * w, 5 * w) = src2Divided_.mul(src4); // 5
dst.colRange(5*w, 6*w) = temp_.mul(src4);//6
dst.colRange(5 * w, 6 * w) = temp_.mul(src4); // 6
src1Divided_.copyTo(dst.colRange(6*w, 7*w));//7
src1Divided_.copyTo(dst.colRange(6 * w, 7 * w)); // 7
src2Divided_.copyTo(dst.colRange(7*w, 8*w));//8
src2Divided_.copyTo(dst.colRange(7 * w, 8 * w)); // 8
}
static void image_jacobian_euclidean_ECC(const Mat& src1, const Mat& src2,
const Mat& src3, const Mat& src4,
const Mat& src5, Mat& dst)
{
CV_Assert( src1.size()==src2.size());
CV_Assert( src1.size()==src3.size());
CV_Assert( src1.size()==src4.size());
CV_Assert( src1.rows == dst.rows);
CV_Assert(dst.cols == (src1.cols*3));
CV_Assert(dst.type() == CV_32FC1);
CV_Assert(src5.isContinuous());
const float* hptr = src5.ptr<float>(0);
const float h0 = hptr[0];//cos(theta)
const float h1 = hptr[3];//sin(theta)
const int w = src1.cols;
//create -sin(theta)*X -cos(theta)*Y for all points as a block -> hatX
Mat hatX = -(src3*h1) - (src4*h0);
//create cos(theta)*X -sin(theta)*Y for all points as a block -> hatY
Mat hatY = (src3*h0) - (src4*h1);
//compute Jacobian blocks (3 blocks)
dst.colRange(0, w) = (src1.mul(hatX))+(src2.mul(hatY));//1
src1.copyTo(dst.colRange(w, 2*w));//2
src2.copyTo(dst.colRange(2*w, 3*w));//3
}
static void image_jacobian_affine_ECC(const Mat& src1, const Mat& src2,
const Mat& src3, const Mat& src4,
Mat& dst)
{
static void image_jacobian_euclidean_ECC(const Mat& src1, const Mat& src2, const Mat& src3, const Mat& src4,
const Mat& src5, Mat& dst) {
CV_Assert(src1.size() == src2.size());
CV_Assert(src1.size() == src3.size());
CV_Assert(src1.size() == src4.size());
CV_Assert(src1.rows == dst.rows);
CV_Assert(dst.cols == (6*src1.cols));
CV_Assert(dst.cols == (src1.cols * 3));
CV_Assert(dst.type() == CV_MAKETYPE(CV_32F, src1.channels()));
CV_Assert(dst.type() == CV_32FC1);
CV_Assert(src5.isContinuous());
const float* hptr = src5.ptr<float>(0);
const float h0 = hptr[0]; // cos(theta)
const float h1 = hptr[3]; // sin(theta)
const int w = src1.cols;
//compute Jacobian blocks (6 blocks)
// create -sin(theta)*X -cos(theta)*Y for all points as a block -> hatX
Mat hatX = -(src3 * h1) - (src4 * h0);
dst.colRange(0,w) = src1.mul(src3);//1
dst.colRange(w,2*w) = src2.mul(src3);//2
dst.colRange(2*w,3*w) = src1.mul(src4);//3
dst.colRange(3*w,4*w) = src2.mul(src4);//4
src1.copyTo(dst.colRange(4*w,5*w));//5
src2.copyTo(dst.colRange(5*w,6*w));//6
// create cos(theta)*X -sin(theta)*Y for all points as a block -> hatY
Mat hatY = (src3 * h0) - (src4 * h1);
// compute Jacobian blocks (3 blocks)
dst.colRange(0, w) = (src1.mul(hatX)) + (src2.mul(hatY)); // 1
src1.copyTo(dst.colRange(w, 2 * w)); // 2
src2.copyTo(dst.colRange(2 * w, 3 * w)); // 3
}
static void image_jacobian_affine_ECC(const Mat& src1, const Mat& src2, const Mat& src3, const Mat& src4, Mat& dst) {
CV_Assert(src1.size() == src2.size());
CV_Assert(src1.size() == src3.size());
CV_Assert(src1.size() == src4.size());
CV_Assert(src1.rows == dst.rows);
CV_Assert(dst.cols == (6 * src1.cols));
CV_Assert(dst.type() == CV_MAKETYPE(CV_32F, src1.channels()));
const int w = src1.cols;
// compute Jacobian blocks (6 blocks)
dst.colRange(0, w) = src1.mul(src3); // 1
dst.colRange(w, 2 * w) = src2.mul(src3); // 2
dst.colRange(2 * w, 3 * w) = src1.mul(src4); // 3
dst.colRange(3 * w, 4 * w) = src2.mul(src4); // 4
src1.copyTo(dst.colRange(4 * w, 5 * w)); // 5
src2.copyTo(dst.colRange(5 * w, 6 * w)); // 6
}
static void image_jacobian_translation_ECC(const Mat& src1, const Mat& src2, Mat& dst) {
CV_Assert(src1.size() == src2.size());
CV_Assert(src1.rows == dst.rows);
CV_Assert(dst.cols == (src1.cols * 2));
CV_Assert(dst.type() == CV_MAKETYPE(CV_32F, src1.channels()));
const int w = src1.cols;
// compute Jacobian blocks (2 blocks)
src1.copyTo(dst.colRange(0, w));
src2.copyTo(dst.colRange(w, 2 * w));
}
static void project_onto_jacobian_ECC(const Mat& src1, const Mat& src2, Mat& dst) {
/* this function is used for two types of projections. If src1.cols == src2.cols
it does a blockwise multiplication (like in the outer product of vectors)
of the blocks in matrices src1 and src2 and dst
@@ -222,59 +204,54 @@ static void project_onto_jacobian_ECC(const Mat& src1, const Mat& src2, Mat& dst
float* dstPtr = dst.ptr<float>(0);
if (src1.cols != src2.cols) { // dst.cols==1
w = src2.cols;
for (int i = 0; i < dst.rows; i++) {
dstPtr[i] = (float)src2.dot(src1.colRange(i * w, (i + 1) * w));
}
}
else {
CV_Assert(dst.cols == dst.rows); // dst is square (and symmetric)
w = src2.cols / dst.cols;
Mat mat;
for (int i = 0; i < dst.rows; i++) {
mat = Mat(src1.colRange(i * w, (i + 1) * w));
dstPtr[i * (dst.rows + 1)] = (float)pow(norm(mat), 2); // diagonal elements
for (int j = i + 1; j < dst.cols; j++) { // j starts from i+1
dstPtr[i * dst.cols + j] = (float)mat.dot(src2.colRange(j * w, (j + 1) * w));
dstPtr[j * dst.cols + i] = dstPtr[i * dst.cols + j]; // due to symmetry
}
}
}
}
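// Illustrative sketch (added commentary, assuming single-channel CV_32F inputs): for the
// vector case above the function computes dst[i] = <src2, J_i>, the projection of an image
// onto the i-th w-column block J_i of the stacked Jacobian. A standalone equivalent:
//
//   for (int i = 0; i < numberOfParameters; i++)
//       proj.at<float>(i) = (float)img.dot(jac.colRange(i * w, (i + 1) * w));
//
// The square case fills the Gram matrix J^T J, exploiting its symmetry.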
static void update_warping_matrix_ECC(Mat& map_matrix, const Mat& update, const int motionType) {
CV_Assert(map_matrix.type() == CV_32FC1);
CV_Assert(update.type() == CV_32FC1);
CV_Assert(motionType == MOTION_TRANSLATION || motionType == MOTION_EUCLIDEAN || motionType == MOTION_AFFINE ||
motionType == MOTION_HOMOGRAPHY);
if (motionType == MOTION_HOMOGRAPHY)
CV_Assert(map_matrix.rows == 3 && update.rows == 8);
else if (motionType == MOTION_AFFINE)
CV_Assert(map_matrix.rows == 2 && update.rows == 6);
else if (motionType == MOTION_EUCLIDEAN)
CV_Assert(map_matrix.rows == 2 && update.rows == 3);
else
CV_Assert(map_matrix.rows == 2 && update.rows == 2);
CV_Assert(update.cols == 1);
CV_Assert(map_matrix.isContinuous());
CV_Assert(update.isContinuous());
float* mapPtr = map_matrix.ptr<float>(0);
const float* updatePtr = update.ptr<float>(0);
if (motionType == MOTION_TRANSLATION) {
mapPtr[2] += updatePtr[0];
mapPtr[5] += updatePtr[1];
}
@@ -302,23 +279,23 @@ static void update_warping_matrix_ECC (Mat& map_matrix, const Mat& update, const
mapPtr[2] += updatePtr[1];
mapPtr[5] += updatePtr[2];
mapPtr[0] = mapPtr[4] = (float)cos(new_theta);
mapPtr[3] = (float)sin(new_theta);
mapPtr[1] = -mapPtr[3];
}
}
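// Note (illustrative): the hunk above elides the MOTION_EUCLIDEAN branch that forms the
// new rotation angle (essentially new_theta = asin(mapPtr[3]) + updatePtr[0], the angle
// recovered from the current map plus the estimated angular correction) before the
// rotation block of the 2x3 map is rebuilt from its cosine and sine here.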
/** Function that computes enhanced correlation coefficient from Georgios et al. 2008
* See https://github.com/opencv/opencv/issues/12432
*/
double cv::computeECC(InputArray templateImage, InputArray inputImage, InputArray inputMask) {
CV_Assert(!templateImage.empty());
CV_Assert(!inputImage.empty());
CV_Assert(templateImage.channels() == 1 || templateImage.channels() == 3);
if (!(templateImage.type() == inputImage.type()))
CV_Error(Error::StsUnmatchedFormats, "Both input images must have the same data type");
Scalar meanTemplate, sdTemplate;
@@ -335,7 +312,7 @@ double cv::computeECC(InputArray templateImage, InputArray inputImage, InputArra
* ultimately results in an incorrect ECC. To circumvent this problem, if unsigned ints are provided,
* we convert them to a signed ints with larger resolution for the subtraction step.
*/
if (type == CV_8U || type == CV_16U) {
int newType = type == CV_8U ? CV_16S : CV_32S;
Mat templateMatConverted, inputMatConverted;
templateMat.convertTo(templateMatConverted, newType);
@@ -344,40 +321,36 @@ double cv::computeECC(InputArray templateImage, InputArray inputImage, InputArra
cv::swap(inputMat, inputMatConverted);
}
subtract(templateMat, meanTemplate, templateImage_zeromean, inputMask);
double templateImagenorm = std::sqrt(active_pixels * cv::norm(sdTemplate, NORM_L2SQR));
Scalar meanInput, sdInput;
Mat inputImage_zeromean = Mat::zeros(inputImage.size(), inputImage.type());
meanStdDev(inputImage, meanInput, sdInput, inputMask);
subtract(inputMat, meanInput, inputImage_zeromean, inputMask);
double inputImagenorm = std::sqrt(active_pixels * norm(sdInput, NORM_L2SQR));
return templateImage_zeromean.dot(inputImage_zeromean) / (templateImagenorm * inputImagenorm);
}
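// Usage sketch (illustrative, added commentary): ECC is computed on zero-mean,
// norm-scaled images, so it is invariant to a global gain and offset, e.g.
//
//   cv::Mat a(64, 64, CV_32F), b;
//   cv::randu(a, 0, 1);
//   a.convertTo(b, CV_32F, 2.0, 0.5);  // gain 2, offset 0.5
//   double ecc = cv::computeECC(a, b); // ~1.0; identical inputs also give 1.0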
double cv::findTransformECC(InputArray templateImage, InputArray inputImage, InputOutputArray warpMatrix,
int motionType, TermCriteria criteria, InputArray inputMask, int gaussFiltSize) {
Mat src = templateImage.getMat(); // template image
Mat dst = inputImage.getMat(); // input image (to be warped)
Mat map = warpMatrix.getMat(); // warp (transformation)
CV_Assert(!src.empty());
CV_Assert(!dst.empty());
CV_Assert(src.channels() == 1 || src.channels() == 3);
CV_Assert(src.channels() == dst.channels());
CV_Assert(src.depth() == dst.depth());
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F || src.depth() == CV_64F);
// If the user passed an un-initialized warpMatrix, initialize to identity
if (map.empty()) {
int rowCount = 2;
if (motionType == MOTION_HOMOGRAPHY)
rowCount = 3;
warpMatrix.create(rowCount, 3, CV_32FC1);
@@ -385,44 +358,39 @@ double cv::findTransformECC(InputArray templateImage,
map = Mat::eye(rowCount, 3, CV_32F);
}
if (!(src.type() == dst.type()))
CV_Error(Error::StsUnmatchedFormats, "Both input images must have the same data type");
if (map.type() != CV_32FC1)
CV_Error(Error::StsUnsupportedFormat, "warpMatrix must be single-channel floating-point matrix");
CV_Assert(map.cols == 3);
CV_Assert(map.rows == 2 || map.rows == 3);
CV_Assert(motionType == MOTION_AFFINE || motionType == MOTION_HOMOGRAPHY || motionType == MOTION_EUCLIDEAN ||
motionType == MOTION_TRANSLATION);
if (motionType == MOTION_HOMOGRAPHY) {
CV_Assert(map.rows == 3);
}
CV_Assert(criteria.type & TermCriteria::COUNT || criteria.type & TermCriteria::EPS);
const int numberOfIterations = (criteria.type & TermCriteria::COUNT) ? criteria.maxCount : 200;
const double termination_eps = (criteria.type & TermCriteria::EPS) ? criteria.epsilon : -1;
int paramTemp = 6; // default: affine
switch (motionType) {
case MOTION_TRANSLATION:
paramTemp = 2;
break;
case MOTION_EUCLIDEAN:
paramTemp = 3;
break;
case MOTION_HOMOGRAPHY:
paramTemp = 8;
break;
}
const int numberOfParameters = paramTemp;
const int ws = src.cols;
@@ -438,10 +406,8 @@ double cv::findTransformECC(InputArray templateImage,
float* XcoPtr = Xcoord.ptr<float>(0);
float* YcoPtr = Ycoord.ptr<float>(0);
int j;
for (j = 0; j < ws; j++) XcoPtr[j] = (float)j;
for (j = 0; j < hs; j++) YcoPtr[j] = (float)j;
repeat(Xcoord, hs, 1, Xgrid);
repeat(Ycoord, 1, ws, Ygrid);
@@ -449,29 +415,38 @@ double cv::findTransformECC(InputArray templateImage,
Xcoord.release();
Ycoord.release();
const int channels = src.channels();
int type = CV_MAKETYPE(CV_32F, channels); // used on its own where the element type is needed explicitly
std::vector<cv::Mat> XgridCh(channels, Xgrid);
cv::merge(XgridCh, Xgrid);
std::vector<cv::Mat> YgridCh(channels, Ygrid);
cv::merge(YgridCh, Ygrid);
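// (added commentary) the coordinate grids are replicated across all channels so that the
// per-channel Jacobian blocks line up with the N-channel warped gradients computed below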
Mat templateZM = Mat(hs, ws, type); // to store the (smoothed)zero-mean version of template
Mat templateFloat = Mat(hs, ws, type); // to store the (smoothed) template
Mat imageFloat = Mat(hd, wd, type); // to store the (smoothed) input image
Mat imageWarped = Mat(hs, ws, type); // to store the warped zero-mean input image
Mat imageMask = Mat(hs, ws, CV_8U); // to store the final mask
Mat inputMaskMat = inputMask.getMat();
// to use it for mask warping
Mat preMask;
if (inputMask.empty())
preMask = Mat::ones(hd, wd, CV_8U);
else
threshold(inputMask, preMask, 0, 1, THRESH_BINARY);
// Gaussian filtering is optional
src.convertTo(templateFloat, templateFloat.type());
GaussianBlur(templateFloat, templateFloat, Size(gaussFiltSize, gaussFiltSize), 0, 0);
Mat preMaskFloat;
preMask.convertTo(preMaskFloat, type);
GaussianBlur(preMaskFloat, preMaskFloat, Size(gaussFiltSize, gaussFiltSize), 0, 0);
// Change threshold.
preMaskFloat *= (0.5 / 0.95);
// Rounding conversion.
preMaskFloat.convertTo(preMask, preMask.type());
preMask.convertTo(preMaskFloat, preMaskFloat.type());
@@ -480,11 +455,10 @@ double cv::findTransformECC(InputArray templateImage,
GaussianBlur(imageFloat, imageFloat, Size(gaussFiltSize, gaussFiltSize), 0, 0);
// needed matrices for gradients and warped gradients
Mat gradientX = Mat::zeros(hd, wd, type);
Mat gradientY = Mat::zeros(hd, wd, type);
Mat gradientXWarped = Mat(hs, ws, type);
Mat gradientYWarped = Mat(hs, ws, type);
// calculate first order image derivatives
Matx13f dx(-0.5f, 0.0f, 0.5f);
@@ -492,60 +466,59 @@ double cv::findTransformECC(InputArray templateImage,
filter2D(imageFloat, gradientX, -1, dx);
filter2D(imageFloat, gradientY, -1, dx.t());
cv::Mat preMaskFloatNCh;
std::vector<cv::Mat> maskChannels(gradientX.channels(), preMaskFloat);
cv::merge(maskChannels, preMaskFloatNCh);
gradientX = gradientX.mul(preMaskFloatNCh);
gradientY = gradientY.mul(preMaskFloatNCh);
// matrices needed for solving linear equation system for maximizing ECC
Mat jacobian = Mat(hs, ws * numberOfParameters, type);
Mat hessian = Mat(numberOfParameters, numberOfParameters, CV_32F);
Mat hessianInv = Mat(numberOfParameters, numberOfParameters, CV_32F);
Mat imageProjection = Mat(numberOfParameters, 1, CV_32F);
Mat templateProjection = Mat(numberOfParameters, 1, CV_32F);
Mat imageProjectionHessian = Mat(numberOfParameters, 1, CV_32F);
Mat errorProjection = Mat(numberOfParameters, 1, CV_32F);
Mat deltaP = Mat(numberOfParameters, 1, CV_32F); // transformation parameter correction
Mat error = Mat(hs, ws, CV_32F); // error as 2D matrix
const int imageFlags = INTER_LINEAR + WARP_INVERSE_MAP;
const int maskFlags = INTER_NEAREST + WARP_INVERSE_MAP;
// iteratively update map_matrix
double rho = -1;
double last_rho = -termination_eps;
for (int i = 1; (i <= numberOfIterations) && (fabs(rho - last_rho) >= termination_eps); i++) {
// warp-back portion of the inputImage and gradients to the coordinate space of the templateImage
if (motionType != MOTION_HOMOGRAPHY) {
warpAffine(imageFloat, imageWarped, map, imageWarped.size(), imageFlags);
warpAffine(gradientX, gradientXWarped, map, gradientXWarped.size(), imageFlags);
warpAffine(gradientY, gradientYWarped, map, gradientYWarped.size(), imageFlags);
warpAffine(preMask, imageMask, map, imageMask.size(), maskFlags);
} else {
warpPerspective(imageFloat, imageWarped, map, imageWarped.size(), imageFlags);
warpPerspective(gradientX, gradientXWarped, map, gradientXWarped.size(), imageFlags);
warpPerspective(gradientY, gradientYWarped, map, gradientYWarped.size(), imageFlags);
warpPerspective(preMask, imageMask, map, imageMask.size(), maskFlags);
}
Scalar imgMean, imgStd, tmpMean, tmpStd;
meanStdDev(imageWarped, imgMean, imgStd, imageMask);
meanStdDev(templateFloat, tmpMean, tmpStd, imageMask);
subtract(imageWarped, imgMean, imageWarped, imageMask); // zero-mean input
templateZM = Mat::zeros(templateZM.rows, templateZM.cols, templateZM.type());
subtract(templateFloat, tmpMean, templateZM, imageMask); // zero-mean template
int validPixels = countNonZero(imageMask);
double tmpNorm = std::sqrt(validPixels * cv::norm(tmpStd, cv::NORM_L2SQR));
double imgNorm = std::sqrt(validPixels * cv::norm(imgStd, cv::NORM_L2SQR));
// calculate jacobian of image wrt parameters
switch (motionType) {
case MOTION_AFFINE:
image_jacobian_affine_ECC(gradientXWarped, gradientYWarped, Xgrid, Ygrid, jacobian);
break;
@@ -569,48 +542,42 @@ double cv::findTransformECC(InputArray templateImage,
// calculate enhanced correlation coefficient (ECC)->rho
last_rho = rho;
rho = correlation / (imgNorm * tmpNorm);
if (cvIsNaN(rho)) {
CV_Error(Error::StsNoConv, "NaN encountered.");
CV_Error(Error::StsNoConv, "NaN encountered.");
}
// project images into jacobian
project_onto_jacobian_ECC(jacobian, imageWarped, imageProjection);
project_onto_jacobian_ECC(jacobian, templateZM, templateProjection);
// calculate the parameter lambda to account for illumination variation
imageProjectionHessian = hessianInv * imageProjection;
const double lambda_n = (imgNorm * imgNorm) - imageProjection.dot(imageProjectionHessian);
const double lambda_d = correlation - templateProjection.dot(imageProjectionHessian);
if (lambda_d <= 0.0) {
rho = -1;
CV_Error(Error::StsNoConv, "The algorithm stopped before its convergence. The correlation is going to be minimized. Images may be uncorrelated or non-overlapped");
CV_Error(Error::StsNoConv,
"The algorithm stopped before its convergence. The correlation is going to be minimized. Images "
"may be uncorrelated or non-overlapped");
}
const double lambda = (lambda_n / lambda_d);
// estimate the update step delta_p
error = lambda * templateZM - imageWarped;
project_onto_jacobian_ECC(jacobian, error, errorProjection);
deltaP = hessianInv * errorProjection;
// update warping matrix
update_warping_matrix_ECC(map, deltaP, motionType);
}
// return final correlation coefficient
return rho;
}
double cv::findTransformECC(InputArray templateImage, InputArray inputImage, InputOutputArray warpMatrix,
int motionType, TermCriteria criteria, InputArray inputMask) {
// Use default value of 5 for gaussFiltSize to maintain backward compatibility.
return findTransformECC(templateImage, inputImage, warpMatrix, motionType, criteria, inputMask, 5);
}
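A minimal usage sketch for the API above (illustrative only, mirroring the accuracy tests further down: recover a known translation between an image and its shifted copy; all variable names are made up for the example):

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/video/tracking.hpp>
using namespace cv;

int main() {
    Mat img(128, 128, CV_32F);
    randu(img, 0, 1);
    GaussianBlur(img, img, Size(5, 5), 0); // give the random field some structure
    Mat ground = (Mat_<float>(2, 3) << 1, 0, 3.f, 0, 1, 2.f); // known shift
    Mat warped;
    warpAffine(img, warped, ground, img.size(), INTER_LINEAR + WARP_INVERSE_MAP);
    Mat map = Mat::eye(2, 3, CV_32F);
    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 50, 1e-6);
    double rho = findTransformECC(warped, img, map, MOTION_TRANSLATION, crit);
    // map should now be close to `ground`, and rho close to 1
    return (rho > 0.9 && norm(map, ground) < 0.1) ? 0 : 1;
}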

View File

@@ -0,0 +1,108 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "opencv2/video/background_segm.hpp"
namespace opencv_test { namespace {
using namespace cv;
///////////////////////// MOG2 //////////////////////////////
TEST(BackgroundSubtractorMOG2, KnownForegroundMaskShadowsTrue)
{
Ptr<BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2(500, 16, true);
// Black frame
Mat input = Mat::zeros(480, 640, CV_8UC3);
// White rectangle
Mat knownFG = Mat::zeros(input.size(), CV_8U);
rectangle(knownFG, Rect(3, 3, 5, 5), Scalar(255, 255, 255), FILLED);
Mat output;
mog2->apply(input, knownFG, output);
for(int y = 3; y < 8; y++)
{
for (int x = 3; x < 8; x++){
EXPECT_EQ(255,output.at<uchar>(y,x)) << "Expected foreground at (" << x << "," << y << ")";
}
}
}
TEST(BackgroundSubtractorMOG2, KnownForegroundMaskShadowsFalse)
{
Ptr<BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2(500, 16, false);
// Black frame
Mat input = Mat::zeros(480, 640, CV_8UC3);
// White rectangle
Mat knownFG = Mat::zeros(input.size(), CV_8U);
rectangle(knownFG, Rect(3, 3, 5, 5), Scalar(255, 255, 255), FILLED);
Mat output;
mog2->apply(input, knownFG, output);
for(int y = 3; y < 8; y++)
{
for (int x = 3; x < 8; x++){
EXPECT_EQ(255,output.at<uchar>(y,x)) << "Expected foreground at (" << x << "," << y << ")";
}
}
}
///////////////////////// KNN //////////////////////////////
TEST(BackgroundSubtractorKNN, KnownForegroundMaskShadowsTrue)
{
Ptr<BackgroundSubtractorKNN> knn = createBackgroundSubtractorKNN(500, 400.0, true);
// Black frame
Mat input = Mat::zeros(480, 640, CV_8UC3);
// White rectangle
Mat knownFG = Mat::zeros(input.size(), CV_8U);
rectangle(knownFG, Rect(3, 3, 5, 5), Scalar(255, 255, 255), FILLED);
Mat output;
knn->apply(input, knownFG, output);
for(int y = 3; y < 8; y++)
{
for (int x = 3; x < 8; x++){
EXPECT_EQ(255,output.at<uchar>(y,x)) << "Expected foreground at (" << x << "," << y << ")";
}
}
}
TEST(BackgroundSubtractorKNN, KnownForegroundMaskShadowsFalse)
{
Ptr<BackgroundSubtractorKNN> knn = createBackgroundSubtractorKNN(500, 400.0, false);
// Black frame
Mat input = Mat::zeros(480, 640, CV_8UC3);
// White rectangle
Mat knownFG = Mat::zeros(input.size(), CV_8U);
rectangle(knownFG, Rect(3, 3, 5, 5), Scalar(255, 255, 255), FILLED);
Mat output;
knn->apply(input, knownFG, output);
for(int y = 3; y < 8; y++)
{
for (int x = 3; x < 8; x++){
EXPECT_EQ(255,output.at<uchar>(y,x)) << "Expected foreground at (" << x << "," << y << ")";
}
}
}
}} // namespace
/* End of file. */
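A minimal C++ usage sketch of the known-foreground path exercised above (illustrative; the three-argument apply(image, knownForegroundMask, fgmask) overload is assumed from the tests in this file):

#include <opencv2/video/background_segm.hpp>
#include <opencv2/imgproc.hpp>
using namespace cv;

int main() {
    Ptr<BackgroundSubtractorMOG2> bg = createBackgroundSubtractorMOG2(500, 16, false);
    Mat frame = Mat::zeros(480, 640, CV_8UC3);                 // all-background frame
    Mat knownFG = Mat::zeros(frame.size(), CV_8U);
    rectangle(knownFG, Rect(3, 3, 5, 5), Scalar(255), FILLED); // force these pixels to foreground
    Mat fgmask;
    bg->apply(frame, knownFG, fgmask); // overload taken from the tests above
    return fgmask.at<uchar>(5, 5) == 255 ? 0 : 1;
}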

View File

@@ -42,41 +42,49 @@
#include "test_precomp.hpp"
namespace opencv_test {
namespace {
class CV_ECC_BaseTest : public cvtest::BaseTest {
public:
CV_ECC_BaseTest();
virtual ~CV_ECC_BaseTest();
protected:
double computeRMS(const Mat& mat1, const Mat& mat2);
bool isMapCorrect(const Mat& mat);
virtual bool test(const Mat) { return true; } // single test
bool testAllTypes(const Mat img); // run test for all supported data types (U8, U16, F32, F64)
bool testAllChNum(const Mat img); // run test for all supported channels count (gray, RGB)
void run(int);
bool checkMap(const Mat& map, const Mat& ground);
double MAX_RMS_ECC; // upper bound for RMS error
int ntests; // number of tests per motion type
int ECC_iterations; // number of iterations for ECC
double ECC_epsilon; // we choose a negative value, so that
// ECC_iterations are always executed
TermCriteria criteria;
};
CV_ECC_BaseTest::CV_ECC_BaseTest() {
MAX_RMS_ECC = 0.1;
ntests = 3;
ECC_iterations = 50;
ECC_epsilon = -1; //-> negative value means that ECC_Iterations will be executed
criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, ECC_iterations, ECC_epsilon);
}
CV_ECC_BaseTest::~CV_ECC_BaseTest() {}
bool CV_ECC_BaseTest::isMapCorrect(const Mat& map) {
bool tr = true;
float mapVal;
for (int i = 0; i < map.rows; i++)
for (int j = 0; j < map.cols; j++) {
mapVal = map.at<float>(i, j);
tr = tr & (!cvIsNaN(mapVal) && (fabs(mapVal) < 1e9));
}
@@ -84,415 +92,326 @@ bool CV_ECC_BaseTest::isMapCorrect(const Mat& map)
return tr;
}
double CV_ECC_BaseTest::computeRMS(const Mat& mat1, const Mat& mat2) {
CV_Assert(mat1.rows == mat2.rows);
CV_Assert(mat1.cols == mat2.cols);
Mat errorMat;
subtract(mat1, mat2, errorMat);
return sqrt(errorMat.dot(errorMat) / (mat1.rows * mat1.cols * mat1.channels()));
}
bool CV_ECC_BaseTest::checkMap(const Mat& map, const Mat& ground) {
if (!isMapCorrect(map)) {
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
return false;
}
if (computeRMS(map, ground) > MAX_RMS_ECC) {
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
ts->printf(ts->LOG, "RMS = %f", computeRMS(map, ground));
return false;
}
return true;
}
bool CV_ECC_BaseTest::testAllTypes(const Mat img) {
auto types = {CV_8U, CV_16U, CV_32F, CV_64F};
for (auto type : types) {
Mat timg;
img.convertTo(timg, type);
if (!test(timg))
return false;
}
return true;
}
bool CV_ECC_BaseTest::testAllChNum(const Mat img) {
if (!testAllTypes(img))
return false;
Mat gray;
cvtColor(img, gray, COLOR_RGB2GRAY);
if (!testAllTypes(gray))
return false;
return true;
}
void CV_ECC_BaseTest::run(int) {
Mat img = imread(string(ts->get_data_path()) + "shared/fruits.png");
if (img.empty()) {
ts->printf(ts->LOG, "test image can not be read");
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
return;
}
Mat testImg;
resize(img, testImg, Size(216, 216), 0, 0, INTER_LINEAR_EXACT);
testAllChNum(testImg);
ts->set_failed_test_info(cvtest::TS::OK);
}
class CV_ECC_Test_Translation : public CV_ECC_BaseTest {
public:
CV_ECC_Test_Translation();
protected:
bool test(const Mat);
};
CV_ECC_Test_Translation::CV_ECC_Test_Translation() {}
bool CV_ECC_Test_Translation::test(const Mat testImg) {
cv::RNG rng = ts->get_rng();
int progress = 0;
for (int k = 0; k < ntests; k++) {
ts->update_context(this, k, true);
progress = update_progress(progress, k, ntests, 0);
Mat translationGround = (Mat_<float>(2, 3) << 1, 0, (rng.uniform(10.f, 20.f)), 0, 1, (rng.uniform(10.f, 20.f)));
Mat warpedImage;
warpAffine(testImg, warpedImage, translationGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
Mat mapTranslation = (Mat_<float>(2, 3) << 1, 0, 0, 0, 1, 0);
findTransformECC(warpedImage, testImg, mapTranslation, 0, criteria);
if (!checkMap(mapTranslation, translationGround))
return false;
}
return true;
}
class CV_ECC_Test_Euclidean : public CV_ECC_BaseTest {
public:
CV_ECC_Test_Euclidean();
protected:
bool test(const Mat);
};
CV_ECC_Test_Euclidean::CV_ECC_Test_Euclidean() {}
bool CV_ECC_Test_Euclidean::test(const Mat testImg) {
cv::RNG rng = ts->get_rng();
int progress = 0;
for (int k = 0; k < ntests; k++) {
ts->update_context(this, k, true);
progress = update_progress(progress, k, ntests, 0);
double angle = CV_PI / 30 + CV_PI * rng.uniform((double)-2.f, (double)2.f) / 180;
Mat euclideanGround = (Mat_<float>(2, 3) << cos(angle), -sin(angle), (rng.uniform(10.f, 20.f)), sin(angle),
cos(angle), (rng.uniform(10.f, 20.f)));
Mat warpedImage;
warpAffine(testImg, warpedImage, euclideanGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
Mat mapEuclidean = (Mat_<float>(2, 3) << 1, 0, 0, 0, 1, 0);
findTransformECC(warpedImage, testImg, mapEuclidean, 1, criteria);
if (!checkMap(mapEuclidean, euclideanGround))
return false;
}
return true;
}
class CV_ECC_Test_Affine : public CV_ECC_BaseTest {
public:
CV_ECC_Test_Affine();
protected:
bool test(const Mat img);
};
CV_ECC_Test_Affine::CV_ECC_Test_Affine() {}
bool CV_ECC_Test_Affine::test(const Mat testImg) {
cv::RNG rng = ts->get_rng();
int progress = 0;
for (int k = 0; k < ntests; k++) {
ts->update_context(this, k, true);
progress = update_progress(progress, k, ntests, 0);
Mat affineGround = (Mat_<float>(2, 3) << (1 - rng.uniform(-0.05f, 0.05f)), (rng.uniform(-0.03f, 0.03f)),
(rng.uniform(10.f, 20.f)), (rng.uniform(-0.03f, 0.03f)), (1 - rng.uniform(-0.05f, 0.05f)),
(rng.uniform(10.f, 20.f)));
Mat warpedImage;
warpAffine(testImg, warpedImage, affineGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
Mat mapAffine = (Mat_<float>(2, 3) << 1, 0, 0, 0, 1, 0);
findTransformECC(warpedImage, testImg, mapAffine, 2, criteria);
if (!checkMap(mapAffine, affineGround))
return false;
}
return true;
}
class CV_ECC_Test_Homography : public CV_ECC_BaseTest {
public:
CV_ECC_Test_Homography();
protected:
bool test(const Mat testImg);
};
CV_ECC_Test_Homography::CV_ECC_Test_Homography() {}
bool CV_ECC_Test_Homography::test(const Mat testImg) {
cv::RNG rng = ts->get_rng();
int progress = 0;
for (int k = 0; k < ntests; k++) {
ts->update_context(this, k, true);
progress = update_progress(progress, k, ntests, 0);
Mat homoGround =
(Mat_<float>(3, 3) << (1 - rng.uniform(-0.05f, 0.05f)), (rng.uniform(-0.03f, 0.03f)),
(rng.uniform(10.f, 20.f)), (rng.uniform(-0.03f, 0.03f)), (1 - rng.uniform(-0.05f, 0.05f)),
(rng.uniform(10.f, 20.f)), (rng.uniform(0.0001f, 0.0003f)), (rng.uniform(0.0001f, 0.0003f)), 1.f);
Mat warpedImage;
warpPerspective(testImg, warpedImage, homoGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
Mat mapHomography = Mat::eye(3, 3, CV_32F);
findTransformECC(warpedImage, testImg, mapHomography, 3, criteria);
if (!checkMap(mapHomography, homoGround))
return false;
}
return true;
}
class CV_ECC_Test_Mask : public CV_ECC_BaseTest {
public:
CV_ECC_Test_Mask();
protected:
bool test(const Mat);
};
CV_ECC_Test_Mask::CV_ECC_Test_Mask() {}
bool CV_ECC_Test_Mask::test(const Mat testImg) {
cv::RNG rng = ts->get_rng();
int progress = 0;
for (int k = 0; k < ntests; k++) {
ts->update_context(this, k, true);
progress = update_progress(progress, k, ntests, 0);
Mat translationGround = (Mat_<float>(2, 3) << 1, 0, (rng.uniform(10.f, 20.f)), 0, 1, (rng.uniform(10.f, 20.f)));
Mat warpedImage;
warpAffine(testImg, warpedImage, translationGround, Size(200, 200), INTER_LINEAR + WARP_INVERSE_MAP);
Mat mapTranslation = (Mat_<float>(2, 3) << 1, 0, 0, 0, 1, 0);
Mat_<unsigned char> mask = Mat_<unsigned char>::ones(testImg.rows, testImg.cols);
Rect region(testImg.cols * 2 / 3, testImg.rows * 2 / 3, testImg.cols / 3, testImg.rows / 3);
rectangle(testImg, region, Scalar::all(0), FILLED);
rectangle(mask, region, Scalar(0), FILLED);
findTransformECC(warpedImage, testImg, mapTranslation, 0, criteria, mask);
if (!checkMap(mapTranslation, translationGround))
return false;
// Test with non-default gaussian blur.
findTransformECC(warpedImage, testImg, mapTranslation, 0, criteria, mask, 1);
if (!checkMap(mapTranslation, translationGround))
return false;
}
return true;
}
void testECCProperties(Mat x, float eps) {
// The channels are independent
Mat y = x.t();
Mat Z = Mat::zeros(x.size(), y.type());
Mat O = Mat::ones(x.size(), y.type());
EXPECT_NEAR(computeECC(x, y), 0.0, eps);
if (x.type() != CV_8U && x.type() != CV_16U) {
EXPECT_NEAR(computeECC(x + y, x - y), 0.0, eps);
}
EXPECT_NEAR(computeECC(x, x), 1.0, eps);
Mat R, G, B, X, Y;
cv::merge(std::vector<cv::Mat>({O, Z, Z}), R);
cv::merge(std::vector<cv::Mat>({Z, O, Z}), G);
cv::merge(std::vector<cv::Mat>({Z, Z, O}), B);
cv::merge(std::vector<cv::Mat>({x, x, x}), X);
cv::merge(std::vector<cv::Mat>({y, y, y}), Y);
// 1. The channels are orthogonal and independent
EXPECT_NEAR(computeECC(X.mul(R), X.mul(G)), 0, eps);
EXPECT_NEAR(computeECC(X.mul(R), X.mul(B)), 0, eps);
EXPECT_NEAR(computeECC(X.mul(B), X.mul(G)), 0, eps);
EXPECT_NEAR(computeECC(X.mul(R) + Y.mul(B), X.mul(B) + Y.mul(R)), 0, eps);
EXPECT_NEAR(computeECC(X.mul(R) + Y.mul(G) + (X + Y).mul(B), Y.mul(R) + X.mul(G) + (X - Y).mul(B)), 0, eps);
// 2. Each channel contribute equally
EXPECT_NEAR(computeECC(X.mul(R) + Y.mul(G + B), X), 1.0 / 3, eps);
EXPECT_NEAR(computeECC(X.mul(G) + Y.mul(R + B), X), 1.0 / 3, eps);
EXPECT_NEAR(computeECC(X.mul(B) + Y.mul(G + R), X), 1.0 / 3, eps);
// 3. The coefficient is invariant with respect to the offset of channels
EXPECT_NEAR(computeECC(X - R + 2 * G + B, X), 1.0, eps);
if (x.type() != CV_8U && x.type() != CV_16U) {
EXPECT_NEAR(computeECC(X + R - 2 * G + B, Y), 0.0, eps);
}
// The channels are independent. Check orthogonal combinations
// full squares norm = sum of squared norms
EXPECT_NEAR(computeECC(X, Y + X), 1.0 / sqrt(2.0), eps);
EXPECT_NEAR(computeECC(X, 2 * Y + X), 1.0 / sqrt(5.0), eps);
}
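// (illustrative derivation, added commentary) The last two checks follow because the
// zero-mean x and its transpose y are orthogonal with equal norms:
// ECC(X, Y + X) = |X0|^2 / (|X0| * sqrt(|X0|^2 + |Y0|^2)) = 1/sqrt(2), and similarly
// ECC(X, 2Y + X) = |X0|^2 / (|X0| * sqrt(4|Y0|^2 + |X0|^2)) = 1/sqrt(5).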
TEST(Video_ECC_Test_Compute, properties) {
Mat xline(1, 100, CV_32F), x;
for (int i = 0; i < xline.cols; ++i) xline.at<float>(0, i) = (float)i;
repeat(xline, xline.cols, 1, x);
Mat x_f64, x_u8, x_u16;
x.convertTo(x_f64, CV_64F);
x.convertTo(x_u8, CV_8U);
x.convertTo(x_u16, CV_16U);
testECCProperties(x, 1e-5f);
testECCProperties(x_f64, 1e-5f);
testECCProperties(x_u8, 1);
testECCProperties(x_u16, 1);
}
TEST(Video_ECC_Test_Compute, accuracy) {
Mat testImg = (Mat_<float>(3, 3) << 1, 0, 0, 1, 0, 0, 1, 0, 0);
Mat warpedImage = (Mat_<float>(3, 3) << 0, 1, 0, 0, 1, 0, 0, 1, 0);
Mat_<unsigned char> mask = Mat_<unsigned char>::ones(testImg.rows, testImg.cols);
@@ -501,8 +420,7 @@ TEST(Video_ECC_Test_Compute, accuracy)
EXPECT_NEAR(ecc, -0.5f, 1e-5f);
}
TEST(Video_ECC_Test_Compute, bug_14657) {
/*
* Simple test case - a 2 x 2 matrix with 10, 10, 10, 6. When the mean (36 / 4 = 9) is subtracted,
* it results in 1, 1, 1, 0 for the unsigned int case - compare to 1, 1, 1, -3 in the signed case.
@@ -512,11 +430,26 @@ TEST(Video_ECC_Test_Compute, bug_14657)
EXPECT_NEAR(computeECC(img, img), 1.0f, 1e-5f);
}
TEST(Video_ECC_Translation, accuracy) {
CV_ECC_Test_Translation test;
test.safe_run();
}
TEST(Video_ECC_Euclidean, accuracy) {
CV_ECC_Test_Euclidean test;
test.safe_run();
}
TEST(Video_ECC_Affine, accuracy) {
CV_ECC_Test_Affine test;
test.safe_run();
}
TEST(Video_ECC_Homography, accuracy) {
CV_ECC_Test_Homography test;
test.safe_run();
}
TEST(Video_ECC_Mask, accuracy) {
CV_ECC_Test_Mask test;
test.safe_run();
}
} // namespace
} // namespace opencv_test

View File

@@ -1,5 +1,5 @@
# --- FFMPEG ---
OCV_OPTION(OPENCV_FFMPEG_ENABLE_LIBAVDEVICE "Include FFMPEG/libavdevice library support." OFF
OCV_OPTION(OPENCV_FFMPEG_ENABLE_LIBAVDEVICE "Include FFMPEG/libavdevice library support." ON
VISIBLE_IF WITH_FFMPEG)
if(NOT HAVE_FFMPEG AND OPENCV_FFMPEG_USE_FIND_PACKAGE)

View File

@@ -74,6 +74,11 @@ public:
{
open(filename, params);
}
CvCapture_FFMPEG_proxy(int index, const cv::VideoCaptureParameters& params)
: ffmpegCapture(NULL)
{
open(index, params);
}
CvCapture_FFMPEG_proxy(const Ptr<IStreamReader>& stream, const cv::VideoCaptureParameters& params)
: ffmpegCapture(NULL)
{
@@ -127,6 +132,13 @@ public:
ffmpegCapture = cvCreateFileCaptureWithParams_FFMPEG(filename.c_str(), params);
return ffmpegCapture != 0;
}
bool open(int index, const cv::VideoCaptureParameters& params)
{
close();
ffmpegCapture = cvCreateFileCaptureWithParams_FFMPEG(index, params);
return ffmpegCapture != 0;
}
bool open(const Ptr<IStreamReader>& stream, const cv::VideoCaptureParameters& params)
{
close();
@@ -161,6 +173,14 @@ cv::Ptr<cv::IVideoCapture> cvCreateFileCapture_FFMPEG_proxy(const std::string &f
return cv::Ptr<cv::IVideoCapture>();
}
cv::Ptr<cv::IVideoCapture> cvCreateCameraCapture_FFMPEG_proxy(int index, const cv::VideoCaptureParameters& params)
{
cv::Ptr<CvCapture_FFMPEG_proxy> capture = cv::makePtr<CvCapture_FFMPEG_proxy>(index, params);
if (capture && capture->isOpened())
return capture;
return cv::Ptr<cv::IVideoCapture>();
}
cv::Ptr<cv::IVideoCapture> cvCreateStreamCapture_FFMPEG_proxy(const Ptr<IStreamReader>& stream, const cv::VideoCaptureParameters& params)
{
cv::Ptr<CvCapture_FFMPEG_proxy> capture = std::make_shared<CvCapture_FFMPEG_proxy>(stream, params);
@@ -272,13 +292,15 @@ CvResult CV_API_CALL cv_capture_open(const char* filename, int camera_index, CV_
if (!handle)
return CV_ERROR_FAIL;
*handle = NULL;
if (!filename && camera_index < 0)
return CV_ERROR_FAIL;
CvCapture_FFMPEG_proxy *cap = 0;
try
{
if (filename)
cap = new CvCapture_FFMPEG_proxy(String(filename), cv::VideoCaptureParameters());
else
cap = new CvCapture_FFMPEG_proxy(camera_index, cv::VideoCaptureParameters());
if (cap->isOpened())
{
*handle = (CvPluginCapture)cap;
@@ -308,14 +330,16 @@ CvResult CV_API_CALL cv_capture_open_with_params(
if (!handle)
return CV_ERROR_FAIL;
*handle = NULL;
if (!filename && camera_index < 0)
return CV_ERROR_FAIL;
CvCapture_FFMPEG_proxy *cap = 0;
try
{
cv::VideoCaptureParameters parameters(params, n_params);
if (filename)
cap = new CvCapture_FFMPEG_proxy(String(filename), parameters);
else
cap = new CvCapture_FFMPEG_proxy(camera_index, parameters);
if (cap->isOpened())
{
*handle = (CvPluginCapture)cap;

View File

@@ -526,7 +526,7 @@ inline static std::string _opencv_ffmpeg_get_error_string(int error_code)
struct CvCapture_FFMPEG
{
bool open(const char* filename, int index, const Ptr<IStreamReader>& stream, const VideoCaptureParameters& params);
void close();
double getProperty(int) const;
@@ -1043,7 +1043,7 @@ static bool isThreadSafe() {
return threadSafe;
}
bool CvCapture_FFMPEG::open(const char* _filename, int index, const Ptr<IStreamReader>& stream, const VideoCaptureParameters& params)
{
const bool threadSafe = isThreadSafe();
InternalFFMpegRegister::init(threadSafe);
@@ -1145,16 +1145,6 @@ bool CvCapture_FFMPEG::open(const char* _filename, const Ptr<IStreamReader>& str
}
}
std::string options = utils::getConfigurationParameterString("OPENCV_FFMPEG_CAPTURE_OPTIONS");
if (options.empty())
{
@@ -1180,7 +1170,55 @@ bool CvCapture_FFMPEG::open(const char* _filename, const Ptr<IStreamReader>& str
input_format = av_find_input_format(entry->value);
}
#ifdef HAVE_FFMPEG_LIBAVDEVICE
AVDeviceInfoList* device_list = nullptr;
#endif
if (index >= 0)
{
#ifdef HAVE_FFMPEG_LIBAVDEVICE
entry = av_dict_get(dict, "f", NULL, 0);
const char* backend = nullptr;
if (entry)
{
backend = entry->value;
}
else
{
#ifdef __linux__
backend = "v4l2";
#endif
#ifdef _WIN32
backend = "dshow";
#endif
#ifdef __APPLE__
backend = "avfoundation";
#endif
}
avdevice_list_input_sources(nullptr, backend, nullptr, &device_list);
if (!device_list)
{
CV_LOG_ONCE_WARNING(NULL, "VIDEOIO/FFMPEG: Failed list devices for backend " << backend);
return false;
}
CV_CheckLT(index, device_list->nb_devices, "VIDEOIO/FFMPEG: Camera index out of range");
_filename = device_list->devices[index]->device_name;
#else
CV_LOG_ONCE_WARNING(NULL, "VIDEOIO/FFMPEG: OpenCV should be configured with libavdevice to open a camera device");
return false;
#endif
}
#if USE_AV_INTERRUPT_CALLBACK
/* interrupt callback */
interrupt_metadata.timeout_after_ms = open_timeout;
get_monotonic_time(&interrupt_metadata.value);
ic = avformat_alloc_context();
ic->interrupt_callback.callback = _opencv_ffmpeg_interrupt_callback;
ic->interrupt_callback.opaque = &interrupt_metadata;
#endif
if (stream)
{
size_t avio_ctx_buffer_size = 4096;
uint8_t* avio_ctx_buffer = (uint8_t*)av_malloc(avio_ctx_buffer_size);
@@ -1231,6 +1269,13 @@ bool CvCapture_FFMPEG::open(const char* _filename, const Ptr<IStreamReader>& str
ic->pb = avio_context;
}
int err = avformat_open_input(&ic, _filename, input_format, &dict);
#ifdef HAVE_FFMPEG_LIBAVDEVICE
if (device_list)
{
avdevice_free_list_devices(&device_list);
device_list = nullptr;
}
#endif
if (err < 0)
{
@@ -3447,7 +3492,22 @@ CvCapture_FFMPEG* cvCreateFileCaptureWithParams_FFMPEG(const char* filename, con
if (!capture)
return 0;
capture->init();
if (capture->open(filename, -1, nullptr, params))
return capture;
capture->close();
delete capture;
return 0;
}
static
CvCapture_FFMPEG* cvCreateFileCaptureWithParams_FFMPEG(int index, const VideoCaptureParameters& params)
{
CvCapture_FFMPEG* capture = new CvCapture_FFMPEG();
if (!capture)
return 0;
capture->init();
if (capture->open(nullptr, index, nullptr, params))
return capture;
capture->close();
@@ -3462,7 +3522,7 @@ CvCapture_FFMPEG* cvCreateStreamCaptureWithParams_FFMPEG(const Ptr<IStreamReader
if (!capture)
return 0;
capture->init();
if (capture->open(nullptr, -1, stream, params))
return capture;
capture->close();

View File

@@ -327,6 +327,7 @@ protected:
//==================================================================================================
Ptr<IVideoCapture> cvCreateFileCapture_FFMPEG_proxy(const std::string &filename, const VideoCaptureParameters& params);
Ptr<IVideoCapture> cvCreateCameraCapture_FFMPEG_proxy(int index, const VideoCaptureParameters& params);
Ptr<IVideoCapture> cvCreateStreamCapture_FFMPEG_proxy(const Ptr<IStreamReader>& stream, const VideoCaptureParameters& params);
Ptr<IVideoWriter> cvCreateVideoWriter_FFMPEG_proxy(const std::string& filename, int fourcc,
double fps, const Size& frameSize,

View File

@@ -112,6 +112,12 @@ static const struct VideoBackendInfo builtin_backends[] =
DECLARE_STATIC_BACKEND(CAP_V4L, "V4L_BSD", MODE_CAPTURE_ALL, create_V4L_capture_file, create_V4L_capture_cam, 0)
#endif
// FFmpeg webcamera by underlying backend (DShow, V4L2, AVFoundation)
#ifdef HAVE_FFMPEG
DECLARE_STATIC_BACKEND(CAP_FFMPEG, "FFMPEG", MODE_CAPTURE_BY_INDEX, 0, cvCreateCameraCapture_FFMPEG_proxy, 0)
#elif defined(ENABLE_PLUGINS) || defined(HAVE_FFMPEG_WRAPPER)
DECLARE_DYNAMIC_BACKEND(CAP_FFMPEG, "FFMPEG", MODE_CAPTURE_BY_INDEX)
#endif
// RGB-D universal
#ifdef HAVE_OPENNI2

View File

@@ -327,4 +327,18 @@ TEST(DISABLED_videoio_camera, waitAny_V4L)
}
}
TEST(DISABLED_videoio_camera, ffmpeg_index)
{
int idx = (int)utils::getConfigurationParameterSizeT("OPENCV_TEST_FFMPEG_DEVICE_IDX", (size_t)-1);
if (idx == -1)
{
throw SkipTestException("OPENCV_TEST_FFMPEG_DEVICE_IDX is not set");
}
VideoCapture cap;
ASSERT_TRUE(cap.open(idx, CAP_FFMPEG));
Mat frame;
ASSERT_TRUE(cap.read(frame));
ASSERT_FALSE(frame.empty());
}
}} // namespace

View File

@@ -158,6 +158,20 @@ inline static std::string param_printer(const testing::TestParamInfo<videoio_v4l
INSTANTIATE_TEST_CASE_P(/*videoio_v4l2*/, videoio_v4l2, ValuesIn(all_params), param_printer);
TEST(videoio_ffmpeg, camera_index)
{
utils::Paths devs = utils::getConfigurationParameterPaths("OPENCV_TEST_V4L2_VIVID_DEVICE");
if (devs.size() != 1)
{
throw SkipTestException("OPENCV_TEST_V4L2_VIVID_DEVICE is not set");
}
VideoCapture cap;
ASSERT_TRUE(cap.open(0, CAP_FFMPEG));
Mat frame;
ASSERT_TRUE(cap.read(frame));
ASSERT_FALSE(frame.empty());
}
}} // opencv_test::<anonymous>::
#endif // HAVE_CAMV4L2

View File

@@ -0,0 +1,68 @@
'''
Showcases the use of background subtraction from a live video feed,
as well as pass-through of a known-foreground parameter
'''
# Python 2/3 compatibility
from __future__ import print_function
import numpy as np
import cv2 as cv
def main():
    cap = cv.VideoCapture(0)
    if not cap.isOpened():
        print("Capture source not available.")
        exit()

    # Create background subtractors
    mog2_bg_subtractor = cv.createBackgroundSubtractorMOG2(history=300, varThreshold=50, detectShadows=False)
    knn_bg_subtractor = cv.createBackgroundSubtractorKNN(history=300, detectShadows=False)

    frame_count = 0
    # Number of warm-up frames before the known-foreground mask is applied,
    # so the models can settle on the background first
    show_count = 10

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        x = 100 + (frame_count % 10) * 3
        frame = cv.resize(frame, (640, 480))
        aKnownForegroundMask = np.zeros(frame.shape[:2], dtype=np.uint8)

        # Allow for models to "settle"/learn
        if frame_count > show_count:
            cv.rectangle(aKnownForegroundMask, (x, 200), (x + 50, 300), 255, -1)
            cv.rectangle(aKnownForegroundMask, (540, 180), (640, 480), 255, -1)

        # MOG2 subtraction
        mog2_with_mask = mog2_bg_subtractor.apply(frame, knownForegroundMask=aKnownForegroundMask)
        mog2_without_mask = mog2_bg_subtractor.apply(frame)

        # KNN subtraction
        knn_with_mask = knn_bg_subtractor.apply(frame, knownForegroundMask=aKnownForegroundMask)
        knn_without_mask = knn_bg_subtractor.apply(frame)

        # Display the 3-parameter apply and the 4-parameter apply for both subtractors
        cv.imshow("MOG2 With a Foreground Mask", mog2_with_mask)
        cv.imshow("MOG2 Without a Foreground Mask", mog2_without_mask)
        cv.imshow("KNN With a Foreground Mask", knn_with_mask)
        cv.imshow("KNN Without a Foreground Mask", knn_without_mask)

        key = cv.waitKey(30)
        if key == 27:  # ESC
            break
        frame_count += 1

    cap.release()
    cv.destroyAllWindows()

if __name__ == '__main__':
    print(__doc__)
    main()
    cv.destroyAllWindows()