diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt index 528fcf62e1..4319815708 100644 --- a/3rdparty/carotene/CMakeLists.txt +++ b/3rdparty/carotene/CMakeLists.txt @@ -40,4 +40,5 @@ if(WITH_NEON) target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON") endif() -add_library(carotene STATIC EXCLUDE_FROM_ALL "$") +# we add dummy file to fix XCode build +add_library(carotene STATIC EXCLUDE_FROM_ALL "$" "${CAROTENE_SOURCE_DIR}/dummy.cpp") diff --git a/3rdparty/carotene/hal/CMakeLists.txt b/3rdparty/carotene/hal/CMakeLists.txt index 8ca7a7de32..592771c676 100644 --- a/3rdparty/carotene/hal/CMakeLists.txt +++ b/3rdparty/carotene/hal/CMakeLists.txt @@ -82,7 +82,8 @@ set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs}) # set_source_files_properties(impl.cpp $ COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000") endif() -add_library(tegra_hal STATIC $) +# we add dummy file to fix XCode build +add_library(tegra_hal STATIC $ "dummy.cpp") set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}) set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}") if(NOT BUILD_SHARED_LIBS) diff --git a/3rdparty/carotene/hal/dummy.cpp b/3rdparty/carotene/hal/dummy.cpp new file mode 100644 index 0000000000..7f10ff3e8c --- /dev/null +++ b/3rdparty/carotene/hal/dummy.cpp @@ -0,0 +1,2 @@ +// This file is needed for compilation on some platforms e.g. with XCode generator +// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457 diff --git a/3rdparty/carotene/src/dummy.cpp b/3rdparty/carotene/src/dummy.cpp new file mode 100644 index 0000000000..7f10ff3e8c --- /dev/null +++ b/3rdparty/carotene/src/dummy.cpp @@ -0,0 +1,2 @@ +// This file is needed for compilation on some platforms e.g. with XCode generator +// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457 diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index 4d4fa24ccd..9b74c0e36a 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -88,7 +88,12 @@ if(CUDA_FOUND) message(STATUS "CUDA detected: " ${CUDA_VERSION}) - set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere") + OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF) + set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere") + if(CUDA_ENABLE_DEPRECATED_GENERATION) + set(_generations "Fermi" "${_generations}") + set(_generations "Kepler" "${_generations}") + endif() set(_arch_fermi "2.0") set(_arch_kepler "3.0;3.5;3.7") set(_arch_maxwell "5.0;5.2") @@ -209,10 +214,6 @@ if(CUDA_FOUND) endif() endmacro() - macro(ocv_wipeout_deprecated _arch_bin_list) - string(REPLACE "2.1" "2.1(2.0)" ${_arch_bin_list} "${${_arch_bin_list}}") - endmacro() - set(__cuda_arch_ptx "") if(CUDA_GENERATION STREQUAL "Fermi") set(__cuda_arch_bin ${_arch_fermi}) @@ -275,7 +276,6 @@ if(CUDA_FOUND) ) endif() endif() - ocv_wipeout_deprecated(__cuda_arch_bin) set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") set(CUDA_ARCH_PTX ${__cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") @@ -283,10 +283,14 @@ if(CUDA_FOUND) string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}") string(REGEX REPLACE "\\." 
"" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}") - # Check if user specified 1.0 compute capability: we don't support it - if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " 1.0") - message(SEND_ERROR "CUDA: 1.0 compute capability is not supported - exclude it from ARCH/PTX list are re-run CMake") - endif() + # Check if user specified 1.0/2.1 compute capability: we don't support it + macro(ocv_wipeout_deprecated_cc target_cc) + if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " ${target_cc}") + message(SEND_ERROR "CUDA: ${target_cc} compute capability is not supported - exclude it from ARCH/PTX list and re-run CMake") + endif() + endmacro() + ocv_wipeout_deprecated_cc("1.0") + ocv_wipeout_deprecated_cc("2.1") # NVCC flags to be set set(NVCC_FLAGS_EXTRA "") diff --git a/doc/js_tutorials/js_assets/opencv_logo.jpg b/doc/js_tutorials/js_assets/opencv_logo.jpg index a2854e1e9e..c2bf3a1748 100644 Binary files a/doc/js_tutorials/js_assets/opencv_logo.jpg and b/doc/js_tutorials/js_assets/opencv_logo.jpg differ diff --git a/doc/opencv-logo-small.png b/doc/opencv-logo-small.png index 763ceb2b32..b7e76d27ba 100644 Binary files a/doc/opencv-logo-small.png and b/doc/opencv-logo-small.png differ diff --git a/doc/opencv-logo-white.png b/doc/opencv-logo-white.png index 3c7098459e..a683e3569f 100644 Binary files a/doc/opencv-logo-white.png and b/doc/opencv-logo-white.png differ diff --git a/doc/opencv-logo.png b/doc/opencv-logo.png index 76cc29f6dc..3ed6a233bb 100644 Binary files a/doc/opencv-logo.png and b/doc/opencv-logo.png differ diff --git a/doc/opencv-logo2.png b/doc/opencv-logo2.png index bc71a2ae50..6658e07b31 100644 Binary files a/doc/opencv-logo2.png and b/doc/opencv-logo2.png differ diff --git a/doc/opencv.bib b/doc/opencv.bib index bdfbc8cf1e..975630a18d 100644 --- a/doc/opencv.bib +++ b/doc/opencv.bib @@ -584,6 +584,16 @@ pages = {1033--1040}, publisher = {IEEE} } +@article{YM11, + author = {Yu, Guoshen and Morel, Jean-Michel}, + title = {ASIFT: An Algorithm for Fully Affine Invariant Comparison}, + year = {2011}, + pages = {11--38}, + journal = {Image Processing On Line}, + volume = {1}, + doi = {10.5201/ipol.2011.my-asift}, + url = {http://www.ipol.im/pub/algo/my_affine_sift/} +} @inproceedings{LCS11, author = {Leutenegger, Stefan and Chli, Margarita and Siegwart, Roland Yves}, title = {BRISK: Binary robust invariant scalable keypoints}, diff --git a/doc/opencv.ico b/doc/opencv.ico index 38f033f3b0..c4d2cfd471 100644 Binary files a/doc/opencv.ico and b/doc/opencv.ico differ diff --git a/doc/py_tutorials/py_setup/images/opencv_logo.jpg b/doc/py_tutorials/py_setup/images/opencv_logo.jpg index a2854e1e9e..c2bf3a1748 100644 Binary files a/doc/py_tutorials/py_setup/images/opencv_logo.jpg and b/doc/py_tutorials/py_setup/images/opencv_logo.jpg differ diff --git a/modules/calib3d/src/calibration_handeye.cpp b/modules/calib3d/src/calibration_handeye.cpp index 18561c77fe..37d4e89d78 100644 --- a/modules/calib3d/src/calibration_handeye.cpp +++ b/modules/calib3d/src/calibration_handeye.cpp @@ -712,7 +712,10 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr { Mat m = Mat::eye(4, 4, CV_64FC1); Mat R = m(Rect(0, 0, 3, 3)); - R_gripper2base_[i].convertTo(R, CV_64F); + if(R_gripper2base_[i].size() == Size(3, 3)) + R_gripper2base_[i].convertTo(R, CV_64F); + else + Rodrigues(R_gripper2base_[i], R); Mat t = m(Rect(3, 0, 1, 3)); t_gripper2base_[i].convertTo(t, CV_64F); @@ -727,7 +730,10 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr { Mat m = 
Mat::eye(4, 4, CV_64FC1); Mat R = m(Rect(0, 0, 3, 3)); - R_target2cam_[i].convertTo(R, CV_64F); + if(R_target2cam_[i].size() == Size(3, 3)) + R_target2cam_[i].convertTo(R, CV_64F); + else + Rodrigues(R_target2cam_[i], R); Mat t = m(Rect(3, 0, 1, 3)); t_target2cam_[i].convertTo(t, CV_64F); diff --git a/modules/calib3d/test/test_calibration_hand_eye.cpp b/modules/calib3d/test/test_calibration_hand_eye.cpp index d2cef969b3..848dcf07c2 100644 --- a/modules/calib3d/test/test_calibration_hand_eye.cpp +++ b/modules/calib3d/test/test_calibration_hand_eye.cpp @@ -317,7 +317,10 @@ void CV_CalibrateHandEyeTest::simulateData(RNG& rng, int nPoses, t_gripper2base_noise.at(2,0) += rng.gaussian(0.001); } - R_target2cam.push_back(T_target2cam(Rect(0, 0, 3, 3))); + // test rvec represenation + Mat rvec_target2cam; + cv::Rodrigues(T_target2cam(Rect(0, 0, 3, 3)), rvec_target2cam); + R_target2cam.push_back(rvec_target2cam); t_target2cam.push_back(T_target2cam(Rect(3, 0, 1, 3))); } } diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index ff9fa36232..adbe3727a4 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -1614,7 +1614,9 @@ elements. CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0, double minVal = -DBL_MAX, double maxVal = DBL_MAX); -/** @brief converts NaN's to the given number +/** @brief converts NaNs to the given number +@param a input/output matrix (CV_32F type). +@param val value to convert the NaNs */ CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0); diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 32f59d1a3e..5d8fbc8b84 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -600,6 +600,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + /** + * @brief Detection output layer. + * + * The layer size is: @f$ (1 \times 1 \times N \times 7) @f$ + * where N is [keep_top_k] parameter multiplied by batch size. Each row is: + * [image_id, label, confidence, xmin, ymin, xmax, ymax] + * where image_id is the index of image input in the batch. 
+ */ class CV_EXPORTS DetectionOutputLayer : public Layer { public: diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index f6504b96c7..bc0b413588 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -221,6 +221,10 @@ namespace cv { { cv::dnn::LayerParams activation_param; if (type == "relu") + { + activation_param.type = "ReLU"; + } + else if (type == "leaky") { activation_param.set("negative_slope", 0.1f); activation_param.type = "ReLU"; @@ -862,24 +866,8 @@ namespace cv { } std::string activation = getParam(layer_params, "activation", "linear"); - if (activation == "leaky") - { - setParams.setActivation("relu"); - } - else if (activation == "swish") - { - setParams.setActivation("swish"); - } - else if (activation == "mish") - { - setParams.setActivation("mish"); - } - else if (activation == "logistic") - { - setParams.setActivation("logistic"); - } - else if (activation != "linear") - CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation); + if (activation != "linear") + setParams.setActivation(activation); net->out_channels_vec[layers_counter] = tensor_shape[0]; } diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 2823ee1115..ae7501ea9f 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -114,18 +114,19 @@ public: inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); - CV_Assert(inputs.size() > 0); + CV_Assert((inputs.size() > outputs.size() && blobs.empty()) || + (!inputs.empty() && (blobs.size() == 1 || blobs.size() == 2))); + MatSize weightShape = blobs.empty() ? inputs[1].size : blobs[0].size; - CV_Assert(blobs.size() == 1 || blobs.size() == 2); CV_Assert(inputs[0].dims == outputs[0].dims); - CV_Assert(blobs[0].dims == kernel_size.size() + 2); + CV_Assert(weightShape.dims() == kernel_size.size() + 2); for (int i = 0; i < kernel_size.size(); i++) { - CV_Assert(blobs[0].size[i + 2] == kernel_size[i]); + CV_Assert(weightShape[i + 2] == kernel_size[i]); } const Mat &input = inputs[0]; CV_Assert((input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16S)); - for (size_t i = 0; i < inputs.size(); i++) + for (size_t i = 0; i < outputs.size(); i++) { CV_Assert(inputs[i].type() == input.type()); CV_Assert((inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]); @@ -270,6 +271,7 @@ public: MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE { + CV_Assert(!blobs.empty()); int dims = inpShape.size(); int inpD = dims == 5 ? 
inpShape[2] : 1; int inpH = inpShape[dims - 2]; @@ -296,6 +298,8 @@ public: { if (kernel_size.size() == 3) return preferableTarget == DNN_TARGET_CPU; + if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || preferableTarget != DNN_TARGET_MYRIAD) && blobs.empty()) + return false; return (preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height); } else @@ -305,7 +309,7 @@ public: return (preferableTarget == DNN_TARGET_CPU && backendId == DNN_BACKEND_OPENCV); else if (kernel_size.size() == 2) return backendId == DNN_BACKEND_OPENCV || - backendId == DNN_BACKEND_HALIDE || + (backendId == DNN_BACKEND_HALIDE && !blobs.empty()) || (backendId == DNN_BACKEND_VKCOM && haveVulkan()); else return false; @@ -317,16 +321,16 @@ public: std::vector &outputs, std::vector &internals) const CV_OVERRIDE { - CV_Assert(blobs.size() != 0); - CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]); - CV_Assert(inputs.size() == (size_t)1); + CV_Assert(!blobs.empty() || inputs.size() > 1); + const int* weightShape = blobs.empty() ? &inputs[1][0] : blobs[0].size.p; + CV_Assert(!hasBias() || blobs[1].total() == (size_t)weightShape[0]); internals.clear(); CV_Assert(inputs.size() != 0); std::vector inpShape(inputs[0].begin() + 2, inputs[0].end()); - int outCn = blobs[0].size[0]; + int outCn = weightShape[0]; std::vector outShape; outShape.push_back(inputs[0][0]); outShape.push_back(outCn); @@ -342,10 +346,10 @@ public: getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape); } - int ngroups = inpCn / blobs[0].size[1]; - if (ngroups == 0 || ngroups * blobs[0].size[1] != inpCn) + int ngroups = inpCn / weightShape[1]; + if (ngroups == 0 || ngroups * weightShape[1] != inpCn) CV_Error(Error::StsError, format("Number of input channels should " - "be multiple of %d but got %d", blobs[0].size[1], inpCn)); + "be multiple of %d but got %d", weightShape[1], inpCn)); CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0); outputs.resize(1, outShape); @@ -357,15 +361,15 @@ public: { BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr); - CV_Assert(!blobs.empty()); - const int outCn = blobs[0].size[0]; + std::vector inputs; + inputs_arr.getMatVector(inputs); // prepare weightsMat where each row is aligned and has enough zero padding on the right to // use vectorized (i.e. with intrinsics) loops without tail processing - Mat wm = blobs[0].reshape(1, outCn); + Mat wm = blobs.empty() ? inputs[1].reshape(1, numOutput) : blobs[0].reshape(1, numOutput); if( wm.step1() % VEC_ALIGN != 0 ) { int newcols = (int)alignSize(wm.step1(), VEC_ALIGN); - Mat wm_buffer = Mat(outCn, newcols, wm.type()); + Mat wm_buffer = Mat(numOutput, newcols, wm.type()); Mat wm_padding = wm_buffer.colRange(wm.cols, newcols); wm_padding.setTo(Scalar::all(0.)); Mat wm_aligned = wm_buffer.colRange(0, wm.cols); @@ -373,18 +377,18 @@ public: wm = wm_aligned; } weightsMat = wm; - weightsMultipliers.assign(outCn, 1.0); + weightsMultipliers.assign(numOutput, 1.0); - Mat biasMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat(); - biasvec.resize(outCn+2); + Mat biasMat = hasBias() ? 
blobs[1].reshape(1, numOutput) : Mat(); + biasvec.resize(numOutput+2); if( biasMat.empty() ) { - for(int i = 0; i < outCn; i++ ) + for(int i = 0; i < numOutput; i++ ) biasvec[i] = 0.f; } else { - for(int i = 0; i < outCn; i++ ) + for(int i = 0; i < numOutput; i++ ) biasvec[i] = biasMat.at(i); } #ifdef HAVE_OPENCL @@ -394,7 +398,7 @@ public: bool setActivation(const Ptr& layer) CV_OVERRIDE { - if (!activ.empty() && !layer.empty()) + if ((!activ.empty() && !layer.empty()) || blobs.empty()) return false; activ = layer; @@ -743,37 +747,48 @@ public: virtual Ptr initNgraph(const std::vector > &inputs, const std::vector >& nodes) CV_OVERRIDE { - CV_Assert_N(inputs.size() == 1, nodes.size() == 1); + CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1); auto& ieInpNode = nodes[0].dynamicCast()->node; std::vector dims = ieInpNode->get_shape(); CV_Assert(dims.size() == 4 || dims.size() == 5); + std::shared_ptr ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast()->node : nullptr; const int inpCn = dims[1]; - const int outCn = blobs[0].size[0]; - const int inpGroupCn = blobs[0].size[1]; + const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1]; const int group = inpCn / inpGroupCn; - std::vector kernel_shape = getShape(blobs[0]); + std::vector kernel_shape; if (group != 1) { - kernel_shape[0] /= group; - kernel_shape.insert(kernel_shape.begin(), group); + kernel_shape.push_back(group); } + kernel_shape.push_back(numOutput / group); + kernel_shape.push_back(inpCn / group); + std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape)); - auto ieWeights = std::make_shared(ngraph::element::f32, kernel_shape, blobs[0].data); - if (fusedWeights) + if (nodes.size() == 1) { - if (weightsMat.isContinuous()) + ieWeights = std::make_shared(ngraph::element::f32, kernel_shape, blobs[0].data); + if (fusedWeights) { - ieWeights = std::make_shared(ngraph::element::f32, kernel_shape, weightsMat.data); - } - else - { - Mat newWeights; - Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / outCn); - cvWeights.copyTo(newWeights); - ieWeights = std::make_shared(ngraph::element::f32, kernel_shape, newWeights.data); + if (weightsMat.isContinuous()) + { + ieWeights = std::make_shared(ngraph::element::f32, kernel_shape, weightsMat.data); + } + else + { + Mat newWeights; + Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / numOutput); + cvWeights.copyTo(newWeights); + ieWeights = std::make_shared(ngraph::element::f32, kernel_shape, newWeights.data); + } } } + else + { + auto shape = std::make_shared(ngraph::element::i64, + ngraph::Shape{kernel_shape.size()}, kernel_shape.data()); + ieWeights = std::make_shared(ieWeights, shape, true); + } ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT; if (!padMode.empty()) @@ -798,11 +813,21 @@ public: pad_type); } - if (hasBias() || fusedBias) + if (hasBias() || fusedBias || nodes.size() == 3) { std::vector shape(conv_node->get_shape().size(), 1); - shape[1] = outCn; - auto bias = std::make_shared(ngraph::element::f32, ngraph::Shape(shape), biasvec.data()); + shape[1] = conv_node->get_shape()[1]; + std::shared_ptr bias; + if (nodes.size() == 3) + { + auto bias_shape = std::make_shared(ngraph::element::i64, + ngraph::Shape{shape.size()}, shape.data()); + bias = std::make_shared(nodes[2].dynamicCast()->node, bias_shape, true); + } + else + { + bias = std::make_shared(ngraph::element::f32, ngraph::Shape(shape), biasvec.data()); + } auto conv_bias = std::make_shared(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY); 
return Ptr(new InfEngineNgraphNode(conv_bias)); } @@ -1516,6 +1541,26 @@ public: for (int i = 0; i < inputs.size(); ++i) CV_Assert(inputs[i].u != outputs[0].u); + if (blobs.empty()) + { + size_t n = inputs.size() - 1; + umat_blobs.resize(n); + for (size_t i = 0; i < n; i++) + { + if (use_half) + { + Mat matFP32; + convertFp16(inputs[i + 1], matFP32); + matFP32.copyTo(umat_blobs[i]); + } + else + { + inputs[i + 1].copyTo(umat_blobs[i]); + } + } + inputs.resize(1); + } + if (umat_blobs.empty()) { size_t n = blobs.size(); @@ -1526,7 +1571,7 @@ public: } } - if (convolutionOp.empty()) + if (convolutionOp.empty() || blobs.empty()) { OCL4DNNConvConfig config; config.in_shape = shape(inputs[0]); @@ -1536,7 +1581,7 @@ public: config.stride = stride; config.dilation = dilation; config.group = inputs[0].size[1] / umat_blobs[0].size[1]; - config.bias_term = (hasBias()) ? true : false; + config.bias_term = umat_blobs.size() == 2; config.use_half = use_half; convolutionOp = Ptr >(new OCL4DNNConvSpatial(config)); @@ -1663,16 +1708,37 @@ public: inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); + int outCn = blobs.empty() ? inputs[1].size[0] : blobs[0].size[0]; + // Need to align non-const blobs + if (blobs.empty()) + { + Mat wm = inputs[1].reshape(1, outCn); + if( wm.step1() % VEC_ALIGN != 0 ) + { + wm.copyTo(weightsMat); + if (inputs.size() > 2) + { + Mat biasMat = inputs[2].reshape(1, outCn); + biasMat.col(0).copyTo(biasvec); + biasvec.resize(outCn + 2); + } + else + { + biasvec.resize(outCn + 2, 0); + } + } + } + /*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n", name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3], kernel.width, kernel.height, pad.width, pad.height, stride.width, stride.height, dilation.width, dilation.height);*/ - CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % blobs[0].size[1] == 0, + int inpGroupCn = blobs.empty() ? 
inputs[1].size[1] : blobs[0].size[1]; + CV_Assert_N(inputs.size() >= (size_t)1, inputs[0].size[1] % inpGroupCn == 0, outputs.size() == 1, inputs[0].data != outputs[0].data); - int ngroups = inputs[0].size[1]/blobs[0].size[1]; + int ngroups = inputs[0].size[1] / inpGroupCn; CV_Assert(outputs[0].size[1] % ngroups == 0); - int outCn = blobs[0].size[0]; reluslope.clear(); if( activ ) @@ -1810,11 +1876,11 @@ public: virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { - CV_Assert(inputs.size() == outputs.size()); + CV_Assert(inputs.size() == outputs.size() || inputs.size() == outputs.size() + blobs.size()); int64 flops = 0; int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies()); - for (int i = 0; i < inputs.size(); i++) + for (int i = 0; i < outputs.size(); i++) { flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1); } diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index b17391f638..b9d30bbd62 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -587,7 +587,7 @@ public: } else { - std::vector data = {(size_t)ieInpNode->get_shape()[0], (size_t)blobs[0].size[1]}; + std::vector data = {(int64_t)ieInpNode->get_shape()[0], (int64_t)blobs[0].size[1]}; auto new_shape = std::make_shared(ngraph::element::i64, ngraph::Shape{2}, data.data()); auto inp = std::make_shared(ieInpNode, new_shape, true); diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 943e347ff6..ffc8540a25 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -397,8 +397,9 @@ public: const std::vector >& nodes) CV_OVERRIDE { auto& ieInpNode = nodes[0].dynamicCast()->node; + std::vector order(_order.begin(), _order.end()); auto tr_axes = std::make_shared(ngraph::element::i64, - ngraph::Shape({_order.size()}), _order.data()); + ngraph::Shape({order.size()}), order.data()); auto transpose = std::make_shared(ieInpNode, tr_axes); return Ptr(new InfEngineNgraphNode(transpose)); } diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 0522ff7cfa..6deabb5884 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -167,6 +167,10 @@ public: void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { +#ifdef HAVE_OPENCL + ocl_exec_cache.clear(); +#endif + std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); @@ -221,26 +225,33 @@ public: } #ifdef HAVE_OPENCL - bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + struct OpenCLExecInfo { - std::vector inputs; - std::vector outputs; + std::string kernel_name; + std::string build_opts; + size_t local_size[2]; + size_t global_size[2]; - inputs_.getUMatVector(inputs); - outputs_.getUMatVector(outputs); + OpenCLExecInfo() + { + local_size[0] = local_size[1] = 0; + global_size[0] = global_size[1] = 0; + } + }; + std::vector ocl_exec_cache; + + void ocl_prepare(const std::vector& inputs, const std::vector& outputs) + { + CV_TRACE_FUNCTION(); CV_Assert(outputs.size() == finalSliceRanges.size()); + ocl_exec_cache.resize(outputs.size()); const UMat& input = inputs[0]; - if (input.dims > 5) - { - CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << 
input.dims << ". Fallback to CPU"); - return false; - } + const int dims = input.dims; size_t WSZ = 128; - const int dims = input.dims; const int elemSize = (int)input.elemSize(); String opts0 = cv::format( "-DDIMS=%d -DELEMSIZE=%d", @@ -250,10 +261,11 @@ public: { opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]); } - String kname = cv::format("slice_%d", dims); for (size_t i = 0; i < outputs.size(); i++) { - UMat& output = outputs[i]; + OpenCLExecInfo& ocl = ocl_exec_cache[i]; + + const UMat& output = outputs[i]; const std::vector& range = finalSliceRanges[i]; String opts = opts0; @@ -269,6 +281,8 @@ public: CV_CheckEQ(range[d].size(), (int)output.size[d], ""); } + const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64; + int block_dims = 0; size_t block_size = elemSize; for (int i = dims - 1; i >= 0; --i) @@ -277,12 +291,14 @@ public: break; block_size *= output.size[i]; block_dims++; + if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG) + break; } const size_t total = output.total() * elemSize; size_t num_blocks = total / block_size; - if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64)) + if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)) { // use 1D copy mode opts += cv::format(" -DUSE_COPY_1D=1"); @@ -352,23 +368,98 @@ public: opts += cv::format(" -DWSZ=%d", (int)WSZ); - size_t local[] = { WSZ, 1 }; - size_t global[] = { WSZ, num_blocks }; + std::ostringstream kernel_suffix; + kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size; + kernel_suffix << "__src_"; + for (int d = 0; d < dims; d++) + { + kernel_suffix << input.size[dims - 1 - d] << '_'; + } + kernel_suffix << '_'; + /*for (int d = 0; d < dims; d++) + { + kernel_suffix << input.step[dims - 1 - d] << '_'; + } + kernel_suffix << '_';*/ - ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts); + kernel_suffix << "dst_"; + for (int d = 0; d < dims; d++) + { + kernel_suffix << output.size[dims - 1 - d] << '_'; + } + /*kernel_suffix << '_'; + for (int d = 0; d < dims; d++) + { + kernel_suffix << output.step[dims - 1 - d] << '_'; + }*/ + kernel_suffix << "_slice_"; + for (int d = 0; d < dims; d++) + { + kernel_suffix << range[dims - 1 - d].start << '_'; + } + for (int d = 0; d < dims; d++) + { + kernel_suffix << '_' << range[dims - 1 - d].end; + } + + std::string kernel_suffix_str = kernel_suffix.str(); + opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str()); + + ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str()); + ocl.build_opts = opts; + ocl.local_size[0] = WSZ; + ocl.local_size[1] = 1; + ocl.global_size[0] = WSZ; + ocl.global_size[1] = num_blocks; + } // for outputs.size() + } // ocl_prepare + + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + CV_TRACE_FUNCTION(); + + std::vector inputs; + std::vector outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + CV_Assert(outputs.size() == finalSliceRanges.size()); + + const UMat& input = inputs[0]; + const int dims = input.dims; + if (dims > 5) + { + CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". 
Fallback to CPU"); + return false; + } + + if (ocl_exec_cache.empty()) + { + ocl_prepare(inputs, outputs); + } + CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), ""); + + for (size_t i = 0; i < outputs.size(); i++) + { + const OpenCLExecInfo& ocl = ocl_exec_cache[i]; + + UMat& output = outputs[i]; + + ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts); if (kernel.empty()) return false; bool ret = kernel.args( ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output) ) - .run(2, global, local, false); + .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); if (!ret) return false; } // for outputs.size() return true; - } + } // forward_ocl #endif void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 2e1c185bbe..794d0721e0 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -1003,10 +1003,13 @@ void ONNXImporter::populateNet(Net dstNet) CV_Assert(node_proto.input_size() >= 2); layerParams.type = "Convolution"; for (int j = 1; j < node_proto.input_size(); j++) { - layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j)); + if (constBlobs.find(node_proto.input(j)) != constBlobs.end()) + { + layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j)); + } } - layerParams.set("num_output", layerParams.blobs[0].size[0]); - layerParams.set("bias_term", node_proto.input_size() == 3); + int outCn = layerParams.blobs.empty() ? outShapes[node_proto.input(1)][0] : layerParams.blobs[0].size[0]; + layerParams.set("num_output", outCn); } else if (layer_type == "ConvTranspose") { diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl index d468dbc16a..f32d66a9ca 100644 --- a/modules/dnn/src/opencl/slice.cl +++ b/modules/dnn/src/opencl/slice.cl @@ -48,19 +48,85 @@ global: #define BLOCK_COLS_X4 (BLOCK_COLS / 4) #define BLOCK_COLS_X16 (BLOCK_COLS / 16) -#ifdef USE_COPY_1D - -static inline -__attribute__((always_inline)) -void copy_block_1d( +__attribute__((reqd_work_group_size(WSZ, 1, 1))) +__kernel void +CONCAT(slice_, SLICE_KERNEL_SUFFIX)( __global const uchar* src0, - const uint src_offset, - __global uchar* dst0, - const uint dst_offset + __global uchar* dst0 ) { - __global const uchar* src = src0 + src_offset; - __global uchar* dst = dst0 + dst_offset; + uint block_id = get_global_id(1); + uint dst_offset0 = block_id * BLOCK_SIZE; + uint src_offset0 = 0; + + { // calculate src_offset0 + +#define CALC_SRC_INDEX(dim) \ + { \ + uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \ + CONCAT(idx_, dim) = block_id / plane_sz; \ + block_id = block_id - CONCAT(idx_, dim) * plane_sz; \ + } +#define UPDATE_SRC_OFFSET(dim) \ + src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0); +/* + if (get_global_id(0) == 0 && get_global_id(1) == 0) \ + printf("(%d, %d): @%d src_offset0=%d idx_dim=%d block_id=%d\n", \ + get_global_id(0), get_global_id(1), \ + dim, src_offset0, CONCAT(idx_, dim), block_id \ + ); +*/ + +#if DIMS > 5 +#error "invalid configuration" +#endif +#if DIMS > 4 + uint idx_4 = 0; +#if BLOCK_DIMS <= 4 + CALC_SRC_INDEX(4) +#endif + UPDATE_SRC_OFFSET(4) +#endif +#if DIMS > 3 + uint idx_3 = 0; +#if BLOCK_DIMS <= 3 + CALC_SRC_INDEX(3) +#endif + UPDATE_SRC_OFFSET(3) +#endif +#if DIMS > 2 + uint idx_2 = 0; +#if BLOCK_DIMS <= 2 + 
CALC_SRC_INDEX(2) +#endif + UPDATE_SRC_OFFSET(2) +#endif +#if DIMS > 1 + uint idx_1 = 0; +#if BLOCK_DIMS <= 1 + CALC_SRC_INDEX(1) +#endif + UPDATE_SRC_OFFSET(1) +#endif +#if DIMS > 0 + uint idx_0 = 0; + UPDATE_SRC_OFFSET(0) +#endif + +/* + if (get_global_id(0) == 0) + printf("(%d, %d): src_offset0=%d dst_offset0=%d\n", + get_global_id(0), get_global_id(1), + src_offset0, dst_offset0 + ); +*/ + + } // calculate src_offset0 + +#ifdef USE_COPY_1D + { // copy_block_1d + __global const uchar* src = src0 + src_offset0; + __global uchar* dst = dst0 + dst_offset0; uint processed = 0; @@ -70,8 +136,9 @@ void copy_block_1d( uint i = get_local_id(0) * 16; // uchar16 while (i < BLOCK_COLS_X16 * 16) { - uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ); - idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16)); + uint4 idx0 = (uint4)i; + uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ); + idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16)); uchar16 a0 = vload16(0, src + idx.s0); uchar16 a1 = vload16(0, src + idx.s1); @@ -97,8 +164,9 @@ void copy_block_1d( uint i = get_local_id(0) * 4 + processed; // uchar4 while (i < BLOCK_COLS_X4 * 4) { - uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ); - idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4)); + uint4 idx0 = (uint4)i; + uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ); + idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4)); uchar4 a0 = vload4(0, src + idx.s0); uchar4 a1 = vload4(0, src + idx.s1); @@ -130,19 +198,11 @@ void copy_block_1d( } } #endif -} + } // copy_block_1d -#else // USE_COPY_1D +#else -static inline -__attribute__((always_inline)) -void copy_block_2d( - __global const uchar* src0, - const uint src_offset0, - __global uchar* dst0, - const uint dst_offset0 -) -{ + { // copy_block_2d __global const uchar* src = src0 + src_offset0; __global uchar* dst = dst0 + dst_offset0; @@ -199,85 +259,6 @@ void copy_block_2d( #endif // BLOCK_COLS_FILL_X4 != BLOCK_COLS i += WSZ * 4; } -} - -#endif // USE_COPY_1D - -__kernel void -CONCAT(slice_, DIMS)( - __global const uchar* src, - __global uchar* dst -) -{ - uint block_id = get_global_id(1); - - uint dst_offset = block_id * BLOCK_SIZE; - - uint src_offset = 0; - -#define CALC_SRC_INDEX(dim) \ - { \ - uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \ - CONCAT(idx_, dim) = block_id / plane_sz; \ - block_id = block_id - CONCAT(idx_, dim) * plane_sz; \ - } -#define UPDATE_SRC_OFFSET(dim) \ - src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset); -/* - if (get_global_id(0) == 0 && get_global_id(1) == 0) \ - printf("(%d, %d): @%d src_offset=%d idx_dim=%d block_id=%d\n", \ - get_global_id(0), get_global_id(1), \ - dim, src_offset, CONCAT(idx_, dim), block_id \ - ); -*/ - -#if DIMS > 5 -#error "invalid configuration" -#endif -#if DIMS > 4 - uint idx_4 = 0; -#if BLOCK_DIMS <= 4 - CALC_SRC_INDEX(4) -#endif - UPDATE_SRC_OFFSET(4) -#endif -#if DIMS > 3 - uint idx_3 = 0; -#if BLOCK_DIMS <= 3 - CALC_SRC_INDEX(3) -#endif - UPDATE_SRC_OFFSET(3) -#endif -#if DIMS > 2 - uint idx_2 = 0; -#if BLOCK_DIMS <= 2 - CALC_SRC_INDEX(2) -#endif - UPDATE_SRC_OFFSET(2) -#endif -#if DIMS > 1 - uint idx_1 = 0; -#if BLOCK_DIMS <= 1 - CALC_SRC_INDEX(1) -#endif - UPDATE_SRC_OFFSET(1) -#endif -#if DIMS > 0 - uint idx_0 = 0; - UPDATE_SRC_OFFSET(0) -#endif - -/* - if (get_global_id(0) == 0) - printf("(%d, %d): src_offset=%d dst_offset=%d\n", - get_global_id(0), get_global_id(1), - src_offset, dst_offset - ); -*/ - -#ifdef 
USE_COPY_1D - copy_block_1d(src, src_offset, dst, dst_offset); -#else - copy_block_2d(src, src_offset, dst, dst_offset); + } // copy_block_2d #endif } diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index fb9cc0184b..53886e8e29 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -784,6 +784,11 @@ TEST_P(Test_Darknet_layers, connected) testDarknetLayer("connected", true); } +TEST_P(Test_Darknet_layers, relu) +{ + testDarknetLayer("relu"); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_layers, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 8ea5304c14..cb3313def3 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -1133,6 +1133,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) const Backend backendId = get<0>(GetParam()); const Target targetId = get<1>(GetParam()); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException("No support for async forward"); @@ -1143,9 +1146,8 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) else FAIL() << "Unknown backendId"; - std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt")); - Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin")); + Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); Mat inp = blobFromNPY(_tf("blob.npy")); @@ -1165,7 +1167,10 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) std::vector outLayers = net.getUnconnectedOutLayers(); ASSERT_EQ(net.getLayer(outLayers[0])->name, "output"); - ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution"); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution"); + else + ASSERT_EQ(net.getLayer(outLayers[0])->type, "Add"); } TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) @@ -1173,6 +1178,9 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) const Backend backendId = get<0>(GetParam()); const Target targetId = get<1>(GetParam()); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException("No support for async forward"); @@ -1189,12 +1197,10 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) randu(inputs[0], 0, 255); inputs[0].convertTo(inputs[1], CV_32F); - std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 
"_fp16" : ""; - Mat outs[2]; for (int i = 0; i < 2; ++i) { - Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin")); + Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); net.setPreferableBackend(backendId); net.setPreferableTarget(targetId); net.setInput(inputs[i]); @@ -1210,6 +1216,9 @@ TEST_P(Layer_Test_Convolution_DLDT, multithreading) const Backend backendId = get<0>(GetParam()); const Target targetId = get<1>(GetParam()); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException("No support for async forward"); @@ -1220,9 +1229,8 @@ TEST_P(Layer_Test_Convolution_DLDT, multithreading) else FAIL() << "Unknown backendId"; - std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; - std::string xmlPath = _tf("layer_convolution" + suffix + ".xml"); - std::string binPath = _tf("layer_convolution" + suffix + ".bin"); + std::string xmlPath = _tf("layer_convolution.xml"); + std::string binPath = _tf("layer_convolution.bin"); Net firstNet = readNet(xmlPath, binPath); Net secondNet = readNet(xmlPath, binPath); Mat inp = blobFromNPY(_tf("blob.npy")); @@ -1281,8 +1289,7 @@ TEST_P(Test_DLDT_two_inputs_3dim, as_IR) int secondInpType = get<1>(GetParam()); Target targetId = get<2>(GetParam()); - std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; - Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin")); + Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin")); std::vector inpSize = get<3>(GetParam()); Mat firstInp(3, inpSize.data(), firstInpType); Mat secondInp(3, inpSize.data(), secondInpType); diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index a1480b0e8b..b803f782d0 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -444,12 +444,14 @@ TEST_P(Async, model_optimizer_pipeline_set_and_forward_single) const Backend backendId = get<0>(get<1>(GetParam())); const Target targetId = get<1>(get<1>(GetParam())); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException("No support for async forward"); - const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 
"_fp16" : ""; - const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin"); - const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml"); + const std::string& model = findDataFile("dnn/layers/layer_convolution.bin"); + const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml"); if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API); @@ -503,12 +505,14 @@ TEST_P(Async, model_optimizer_pipeline_set_and_forward_all) const Backend backendId = get<0>(get<1>(GetParam())); const Target targetId = get<1>(get<1>(GetParam())); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException("No support for async forward"); - const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; - const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin"); - const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml"); + const std::string& model = findDataFile("dnn/layers/layer_convolution.bin"); + const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml"); if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API); @@ -677,9 +681,11 @@ TEST_P(Test_Model_Optimizer, forward_two_nets) const Backend backendId = get<0>(GetParam()); const Target targetId = get<1>(GetParam()); - const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; - const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin"); - const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml"); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + + const std::string& model = findDataFile("dnn/layers/layer_convolution.bin"); + const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml"); if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API); @@ -716,12 +722,14 @@ TEST_P(Test_Model_Optimizer, readFromBuffer) const Backend backendId = get<0>(GetParam()); const Target targetId = get<1>(GetParam()); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException("No support for async forward"); - const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 
"_fp16" : ""; - const std::string& weightsFile = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin"); - const std::string& modelFile = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml"); + const std::string& weightsFile = findDataFile("dnn/layers/layer_convolution.bin"); + const std::string& modelFile = findDataFile("dnn/layers/layer_convolution.xml"); if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API); @@ -769,8 +777,11 @@ TEST_P(Test_Model_Optimizer, flexible_inputs) const Backend backendId = get<0>(GetParam()); const Target targetId = get<1>(GetParam()); - const std::string& model = findDataFile("dnn/layers/layer_convolution_fp16.bin"); - const std::string& proto = findDataFile("dnn/layers/layer_convolution_fp16.xml"); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + + const std::string& model = findDataFile("dnn/layers/layer_convolution.bin"); + const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml"); if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API); diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 86dfcae080..22a45286d6 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -114,6 +114,62 @@ TEST_P(Test_ONNX_layers, Convolution) testONNXModels("convolution"); } +TEST_P(Test_ONNX_layers, Convolution_variable_weight) +{ + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH || + backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + String basename = "conv_variable_w"; + Net net = readNetFromONNX(_tf("models/" + basename + ".onnx")); + ASSERT_FALSE(net.empty()); + + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + for (int i = 0; i < 2; i++) + { + Mat input = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_0.npy")); + Mat weights = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_1.npy")); + Mat ref = blobFromNPY(_tf("data/output_" + basename + format("_%d", i) + ".npy")); + + net.setInput(input, "0"); + net.setInput(weights, "1"); + + Mat out = net.forward(); + normAssert(ref, out, "", default_l1, default_lInf); + } +} + +TEST_P(Test_ONNX_layers, Convolution_variable_weight_bias) +{ + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH || + backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + String basename = "conv_variable_wb"; + Net net = readNetFromONNX(_tf("models/" + basename + ".onnx")); + ASSERT_FALSE(net.empty()); + + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + for (int i = 0; i < 2; i++) + { + Mat input = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_0.npy")); + Mat weights = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_1.npy")); + Mat bias = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_2.npy")); + Mat ref = blobFromNPY(_tf("data/output_" + basename + format("_%d", i) + ".npy")); + + 
net.setInput(input, "0"); + net.setInput(weights, "1"); + net.setInput(bias, "bias"); + + Mat out = net.forward(); + normAssert(ref, out, "", default_l1, default_lInf); + } +} + TEST_P(Test_ONNX_layers, Gather) { if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index 08faf42370..2becf11950 100644 --- a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -245,6 +245,31 @@ typedef Feature2D DescriptorExtractor; //! @{ +/** @brief Class for implementing the wrapper which makes detectors and extractors to be affine invariant, +described as ASIFT in @cite YM11 . +*/ +class CV_EXPORTS_W AffineFeature : public Feature2D +{ +public: + /** + @param backend The detector/extractor you want to use as backend. + @param maxTilt The highest power index of tilt factor. 5 is used in the paper as tilt sampling range n. + @param minTilt The lowest power index of tilt factor. 0 is used in the paper. + @param tiltStep Tilt sampling step \f$\delta_t\f$ in Algorithm 1 in the paper. + @param rotateStepBase Rotation sampling step factor b in Algorithm 1 in the paper. + */ + CV_WRAP static Ptr create(const Ptr& backend, + int maxTilt = 5, int minTilt = 0, float tiltStep = 1.4142135623730951f, float rotateStepBase = 72); + + CV_WRAP virtual void setViewParams(const std::vector& tilts, const std::vector& rolls) = 0; + CV_WRAP virtual void getViewParams(std::vector& tilts, std::vector& rolls) const = 0; + CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; +}; + +typedef AffineFeature AffineFeatureDetector; +typedef AffineFeature AffineDescriptorExtractor; + + /** @brief Class for extracting keypoints and computing descriptors using the Scale Invariant Feature Transform (SIFT) algorithm by D. Lowe @cite Lowe04 . */ diff --git a/modules/features2d/src/affine_feature.cpp b/modules/features2d/src/affine_feature.cpp new file mode 100644 index 0000000000..41518d945d --- /dev/null +++ b/modules/features2d/src/affine_feature.cpp @@ -0,0 +1,358 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// This file is based on code issued with the following license. +/********************************************************************* +* Software License Agreement (BSD License) +* +* Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +* Copyright (C) 2008-2013, Willow Garage Inc., all rights reserved. +* Copyright (C) 2013, Evgeny Toropov, all rights reserved. +* Third party copyrights are property of their respective owners. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials provided +* with the distribution. +* * The name of the copyright holders may not be used to endorse +* or promote products derived from this software without specific +* prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +*********************************************************************/ + +/* + Guoshen Yu, Jean-Michel Morel, ASIFT: An Algorithm for Fully Affine + Invariant Comparison, Image Processing On Line, 1 (2011), pp. 11–38. + https://doi.org/10.5201/ipol.2011.my-asift + */ + +#include "precomp.hpp" +#include +namespace cv { + +class AffineFeature_Impl CV_FINAL : public AffineFeature +{ +public: + explicit AffineFeature_Impl(const Ptr& backend, + int maxTilt, int minTilt, float tiltStep, float rotateStepBase); + + int descriptorSize() const CV_OVERRIDE + { + return backend_->descriptorSize(); + } + + int descriptorType() const CV_OVERRIDE + { + return backend_->descriptorType(); + } + + int defaultNorm() const CV_OVERRIDE + { + return backend_->defaultNorm(); + } + + void detectAndCompute(InputArray image, InputArray mask, std::vector& keypoints, + OutputArray descriptors, bool useProvidedKeypoints=false) CV_OVERRIDE; + + void setViewParams(const std::vector& tilts, const std::vector& rolls) CV_OVERRIDE; + void getViewParams(std::vector& tilts, std::vector& rolls) const CV_OVERRIDE; + +protected: + void splitKeypointsByView(const std::vector& keypoints_, + std::vector< std::vector >& keypointsByView) const; + + const Ptr backend_; + int maxTilt_; + int minTilt_; + float tiltStep_; + float rotateStepBase_; + + // Tilt factors. + std::vector tilts_; + // Roll factors. 
+ std::vector rolls_; + +private: + AffineFeature_Impl(const AffineFeature_Impl &); // copy disabled + AffineFeature_Impl& operator=(const AffineFeature_Impl &); // assign disabled +}; + +AffineFeature_Impl::AffineFeature_Impl(const Ptr& backend, + int maxTilt, int minTilt, float tiltStep, float rotateStepBase) + : backend_(backend), maxTilt_(maxTilt), minTilt_(minTilt), tiltStep_(tiltStep), rotateStepBase_(rotateStepBase) +{ + int i = minTilt_; + if( i == 0 ) + { + tilts_.push_back(1); + rolls_.push_back(0); + i++; + } + float tilt = 1; + for( ; i <= maxTilt_; i++ ) + { + tilt *= tiltStep_; + float rotateStep = rotateStepBase_ / tilt; + int rollN = cvFloor(180.0f / rotateStep); + if( rollN * rotateStep == 180.0f ) + rollN--; + for( int j = 0; j <= rollN; j++ ) + { + tilts_.push_back(tilt); + rolls_.push_back(rotateStep * j); + } + } +} + +void AffineFeature_Impl::setViewParams(const std::vector& tilts, + const std::vector& rolls) +{ + CV_Assert(tilts.size() == rolls.size()); + tilts_ = tilts; + rolls_ = rolls; +} + +void AffineFeature_Impl::getViewParams(std::vector& tilts, + std::vector& rolls) const +{ + tilts = tilts_; + rolls = rolls_; +} + +void AffineFeature_Impl::splitKeypointsByView(const std::vector& keypoints_, + std::vector< std::vector >& keypointsByView) const +{ + for( size_t i = 0; i < keypoints_.size(); i++ ) + { + const KeyPoint& kp = keypoints_[i]; + CV_Assert( kp.class_id >= 0 && kp.class_id < (int)tilts_.size() ); + keypointsByView[kp.class_id].push_back(kp); + } +} + +class skewedDetectAndCompute : public ParallelLoopBody +{ +public: + skewedDetectAndCompute( + const std::vector& _tilts, + const std::vector& _rolls, + std::vector< std::vector >& _keypointsCollection, + std::vector& _descriptorCollection, + const Mat& _image, + const Mat& _mask, + const bool _do_keypoints, + const bool _do_descriptors, + const Ptr& _backend) + : tilts(_tilts), + rolls(_rolls), + keypointsCollection(_keypointsCollection), + descriptorCollection(_descriptorCollection), + image(_image), + mask(_mask), + do_keypoints(_do_keypoints), + do_descriptors(_do_descriptors), + backend(_backend) {} + + void operator()( const cv::Range& range ) const CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + + const int begin = range.start; + const int end = range.end; + + for( int a = begin; a < end; a++ ) + { + Mat warpedImage, warpedMask; + Matx23f pose, invPose; + affineSkew(tilts[a], rolls[a], warpedImage, warpedMask, pose); + invertAffineTransform(pose, invPose); + + std::vector wKeypoints; + Mat wDescriptors; + if( !do_keypoints ) + { + const std::vector& keypointsInView = keypointsCollection[a]; + if( keypointsInView.size() == 0 ) // when there are no keypoints in this affine view + continue; + + std::vector pts_, pts; + KeyPoint::convert(keypointsInView, pts_); + transform(pts_, pts, pose); + wKeypoints.resize(keypointsInView.size()); + for( size_t wi = 0; wi < wKeypoints.size(); wi++ ) + { + wKeypoints[wi] = keypointsInView[wi]; + wKeypoints[wi].pt = pts[wi]; + } + } + backend->detectAndCompute(warpedImage, warpedMask, wKeypoints, wDescriptors, !do_keypoints); + if( do_keypoints ) + { + // KeyPointsFilter::runByPixelsMask( wKeypoints, warpedMask ); + if( wKeypoints.size() == 0 ) + { + keypointsCollection[a].clear(); + continue; + } + std::vector pts_, pts; + KeyPoint::convert(wKeypoints, pts_); + transform(pts_, pts, invPose); + + keypointsCollection[a].resize(wKeypoints.size()); + for( size_t wi = 0; wi < wKeypoints.size(); wi++ ) + { + keypointsCollection[a][wi] = wKeypoints[wi]; + 
keypointsCollection[a][wi].pt = pts[wi]; + keypointsCollection[a][wi].class_id = a; + } + } + if( do_descriptors ) + wDescriptors.copyTo(descriptorCollection[a]); + } + } +private: + void affineSkew(float tilt, float phi, + Mat& warpedImage, Mat& warpedMask, Matx23f& pose) const + { + int h = image.size().height; + int w = image.size().width; + Mat rotImage; + + Mat mask0; + if( mask.empty() ) + mask0 = Mat(h, w, CV_8UC1, 255); + else + mask0 = mask; + pose = Matx23f(1,0,0, + 0,1,0); + + if( phi == 0 ) + image.copyTo(rotImage); + else + { + phi = phi * (float)CV_PI / 180; + float s = std::sin(phi); + float c = std::cos(phi); + Matx22f A(c, -s, s, c); + Matx corners(0, 0, (float)w, 0, (float)w,(float)h, 0, (float)h); + Mat tf(corners * A.t()); + Mat tcorners; + tf.convertTo(tcorners, CV_32S); + Rect rect = boundingRect(tcorners); + h = rect.height; w = rect.width; + pose = Matx23f(c, -s, -(float)rect.x, + s, c, -(float)rect.y); + warpAffine(image, rotImage, pose, Size(w, h), INTER_LINEAR, BORDER_REPLICATE); + } + if( tilt == 1 ) + warpedImage = rotImage; + else + { + float s = 0.8f * sqrt(tilt * tilt - 1); + GaussianBlur(rotImage, rotImage, Size(0, 0), s, 0.01); + resize(rotImage, warpedImage, Size(0, 0), 1.0/tilt, 1.0, INTER_NEAREST); + pose(0, 0) /= tilt; + pose(0, 1) /= tilt; + pose(0, 2) /= tilt; + } + if( phi != 0 || tilt != 1 ) + warpAffine(mask0, warpedMask, pose, warpedImage.size(), INTER_NEAREST); + } + + + const std::vector& tilts; + const std::vector& rolls; + std::vector< std::vector >& keypointsCollection; + std::vector& descriptorCollection; + const Mat& image; + const Mat& mask; + const bool do_keypoints; + const bool do_descriptors; + const Ptr& backend; +}; + +void AffineFeature_Impl::detectAndCompute(InputArray _image, InputArray _mask, + std::vector& keypoints, + OutputArray _descriptors, + bool useProvidedKeypoints) +{ + CV_TRACE_FUNCTION(); + + bool do_keypoints = !useProvidedKeypoints; + bool do_descriptors = _descriptors.needed(); + Mat image = _image.getMat(), mask = _mask.getMat(); + Mat descriptors; + + if( (!do_keypoints && !do_descriptors) || _image.empty() ) + return; + + std::vector< std::vector > keypointsCollection(tilts_.size()); + std::vector< Mat > descriptorCollection(tilts_.size()); + + if( do_keypoints ) + keypoints.clear(); + else + splitKeypointsByView(keypoints, keypointsCollection); + + parallel_for_(Range(0, (int)tilts_.size()), skewedDetectAndCompute(tilts_, rolls_, keypointsCollection, descriptorCollection, + image, mask, do_keypoints, do_descriptors, backend_)); + + if( do_keypoints ) + for( size_t i = 0; i < keypointsCollection.size(); i++ ) + { + const std::vector& keys = keypointsCollection[i]; + keypoints.insert(keypoints.end(), keys.begin(), keys.end()); + } + + if( do_descriptors ) + { + _descriptors.create((int)keypoints.size(), backend_->descriptorSize(), backend_->descriptorType()); + descriptors = _descriptors.getMat(); + int iter = 0; + for( size_t i = 0; i < descriptorCollection.size(); i++ ) + { + const Mat& descs = descriptorCollection[i]; + if( descs.empty() ) + continue; + Mat roi(descriptors, Rect(0, iter, descriptors.cols, descs.rows)); + descs.copyTo(roi); + iter += descs.rows; + } + } +} + + +Ptr AffineFeature::create(const Ptr& backend, + int maxTilt, int minTilt, float tiltStep, float rotateStepBase) +{ + CV_Assert(minTilt < maxTilt); + CV_Assert(tiltStep > 0); + CV_Assert(rotateStepBase > 0); + return makePtr(backend, maxTilt, minTilt, tiltStep, rotateStepBase); +} + +String AffineFeature::getDefaultName() const +{ + 
return (Feature2D::getDefaultName() + ".AffineFeature"); +} + +} // namespace diff --git a/modules/features2d/test/test_affine_feature.cpp b/modules/features2d/test/test_affine_feature.cpp new file mode 100644 index 0000000000..f40f21ed8d --- /dev/null +++ b/modules/features2d/test/test_affine_feature.cpp @@ -0,0 +1,185 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "test_precomp.hpp" + +// #define GENERATE_DATA // generate data in debug mode + +namespace opencv_test { namespace { + +#ifndef GENERATE_DATA +static bool isSimilarKeypoints( const KeyPoint& p1, const KeyPoint& p2 ) +{ + const float maxPtDif = 1.f; + const float maxSizeDif = 1.f; + const float maxAngleDif = 2.f; + const float maxResponseDif = 0.1f; + + float dist = (float)cv::norm( p1.pt - p2.pt ); + return (dist < maxPtDif && + fabs(p1.size - p2.size) < maxSizeDif && + abs(p1.angle - p2.angle) < maxAngleDif && + abs(p1.response - p2.response) < maxResponseDif && + (p1.octave & 0xffff) == (p2.octave & 0xffff) // do not care about sublayers and class_id + ); +} +#endif + +TEST(Features2d_AFFINE_FEATURE, regression) +{ + Mat image = imread(cvtest::findDataFile("features2d/tsukuba.png")); + string xml = cvtest::TS::ptr()->get_data_path() + "asift/regression_cpp.xml.gz"; + ASSERT_FALSE(image.empty()); + + Mat gray; + cvtColor(image, gray, COLOR_BGR2GRAY); + + // Default ASIFT generates too large descriptors. This test uses small maxTilt to suppress the size of testdata. + Ptr ext = AffineFeature::create(SIFT::create(), 2, 0, 1.4142135623730951f, 144.0f); + Mat mpt, msize, mangle, mresponse, moctave, mclass_id; +#ifdef GENERATE_DATA + // calculate + vector calcKeypoints; + Mat calcDescriptors; + ext->detectAndCompute(gray, Mat(), calcKeypoints, calcDescriptors, false); + + // create keypoints XML + FileStorage fs(xml, FileStorage::WRITE); + ASSERT_TRUE(fs.isOpened()) << xml; + std::cout << "Creating keypoints XML..." 
<< std::endl; + + mpt = Mat(calcKeypoints.size(), 2, CV_32F); + msize = Mat(calcKeypoints.size(), 1, CV_32F); + mangle = Mat(calcKeypoints.size(), 1, CV_32F); + mresponse = Mat(calcKeypoints.size(), 1, CV_32F); + moctave = Mat(calcKeypoints.size(), 1, CV_32S); + mclass_id = Mat(calcKeypoints.size(), 1, CV_32S); + + for( size_t i = 0; i < calcKeypoints.size(); i++ ) + { + const KeyPoint& key = calcKeypoints[i]; + mpt.at(i, 0) = key.pt.x; + mpt.at(i, 1) = key.pt.y; + msize.at(i, 0) = key.size; + mangle.at(i, 0) = key.angle; + mresponse.at(i, 0) = key.response; + moctave.at(i, 0) = key.octave; + mclass_id.at(i, 0) = key.class_id; + } + + fs << "keypoints_pt" << mpt; + fs << "keypoints_size" << msize; + fs << "keypoints_angle" << mangle; + fs << "keypoints_response" << mresponse; + fs << "keypoints_octave" << moctave; + fs << "keypoints_class_id" << mclass_id; + + // create descriptor XML + fs << "descriptors" << calcDescriptors; + fs.release(); +#else + const float badCountsRatio = 0.01f; + const float badDescriptorDist = 1.0f; + const float maxBadKeypointsRatio = 0.15f; + const float maxBadDescriptorRatio = 0.15f; + + // read keypoints + vector validKeypoints; + Mat validDescriptors; + FileStorage fs(xml, FileStorage::READ); + ASSERT_TRUE(fs.isOpened()) << xml; + + fs["keypoints_pt"] >> mpt; + ASSERT_EQ(mpt.type(), CV_32F); + fs["keypoints_size"] >> msize; + ASSERT_EQ(msize.type(), CV_32F); + fs["keypoints_angle"] >> mangle; + ASSERT_EQ(mangle.type(), CV_32F); + fs["keypoints_response"] >> mresponse; + ASSERT_EQ(mresponse.type(), CV_32F); + fs["keypoints_octave"] >> moctave; + ASSERT_EQ(moctave.type(), CV_32S); + fs["keypoints_class_id"] >> mclass_id; + ASSERT_EQ(mclass_id.type(), CV_32S); + + validKeypoints.resize(mpt.rows); + for( int i = 0; i < (int)validKeypoints.size(); i++ ) + { + validKeypoints[i].pt.x = mpt.at(i, 0); + validKeypoints[i].pt.y = mpt.at(i, 1); + validKeypoints[i].size = msize.at(i, 0); + validKeypoints[i].angle = mangle.at(i, 0); + validKeypoints[i].response = mresponse.at(i, 0); + validKeypoints[i].octave = moctave.at(i, 0); + validKeypoints[i].class_id = mclass_id.at(i, 0); + } + + // read descriptors + fs["descriptors"] >> validDescriptors; + fs.release(); + + // calc and compare keypoints + vector calcKeypoints; + ext->detectAndCompute(gray, Mat(), calcKeypoints, noArray(), false); + + float countRatio = (float)validKeypoints.size() / (float)calcKeypoints.size(); + ASSERT_LT(countRatio, 1 + badCountsRatio) << "Bad keypoints count ratio."; + ASSERT_GT(countRatio, 1 - badCountsRatio) << "Bad keypoints count ratio."; + + int badPointCount = 0, commonPointCount = max((int)validKeypoints.size(), (int)calcKeypoints.size()); + for( size_t v = 0; v < validKeypoints.size(); v++ ) + { + int nearestIdx = -1; + float minDist = std::numeric_limits::max(); + float angleDistOfNearest = std::numeric_limits::max(); + + for( size_t c = 0; c < calcKeypoints.size(); c++ ) + { + if( validKeypoints[v].class_id != calcKeypoints[c].class_id ) + continue; + float curDist = (float)cv::norm( calcKeypoints[c].pt - validKeypoints[v].pt ); + if( curDist < minDist ) + { + minDist = curDist; + nearestIdx = (int)c; + angleDistOfNearest = abs( calcKeypoints[c].angle - validKeypoints[v].angle ); + } + else if( curDist == minDist ) // the keypoints whose positions are same but angles are different + { + float angleDist = abs( calcKeypoints[c].angle - validKeypoints[v].angle ); + if( angleDist < angleDistOfNearest ) + { + nearestIdx = (int)c; + angleDistOfNearest = angleDist; + } + } + } + if( 
nearestIdx == -1 || !isSimilarKeypoints( validKeypoints[v], calcKeypoints[nearestIdx] ) ) + badPointCount++; + } + float badKeypointsRatio = (float)badPointCount / (float)commonPointCount; + std::cout << "badKeypointsRatio: " << badKeypointsRatio << std::endl; + ASSERT_LT( badKeypointsRatio , maxBadKeypointsRatio ) << "Bad accuracy!"; + + // Calc and compare descriptors. This uses validKeypoints for extraction. + Mat calcDescriptors; + ext->detectAndCompute(gray, Mat(), validKeypoints, calcDescriptors, true); + + int dim = validDescriptors.cols; + int badDescriptorCount = 0; + L1 distance; + + for( int i = 0; i < (int)validKeypoints.size(); i++ ) + { + float dist = distance( validDescriptors.ptr(i), calcDescriptors.ptr(i), dim ); + if( dist > badDescriptorDist ) + badDescriptorCount++; + } + float badDescriptorRatio = (float)badDescriptorCount / (float)validKeypoints.size(); + std::cout << "badDescriptorRatio: " << badDescriptorRatio << std::endl; + ASSERT_LT( badDescriptorRatio, maxBadDescriptorRatio ) << "Too many descriptors mismatched."; +#endif +} + +}} // namespace diff --git a/modules/flann/include/opencv2/flann.hpp b/modules/flann/include/opencv2/flann.hpp index 293990752b..9013ae4308 100644 --- a/modules/flann/include/opencv2/flann.hpp +++ b/modules/flann/include/opencv2/flann.hpp @@ -191,8 +191,28 @@ public: KDTreeIndexParams( int trees = 4 ); }; @endcode + - **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed + will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers + are picked among the points without further refinement of their position. + This algorithm fits both floating, integer and binary vectors. : + @code + struct HierarchicalClusteringIndexParams : public IndexParams + { + HierarchicalClusteringIndexParams( + int branching = 32, + flann_centers_init_t centers_init = CENTERS_RANDOM, + int trees = 4, + int leaf_size = 100); + + }; + @endcode - **KMeansIndexParams** When passing an object of this type the index constructed will be a - hierarchical k-means tree. : + hierarchical k-means tree (one tree by default), dividing each set of points into n clusters + whose barycenters are refined iteratively. + Note that this algorithm has been extended to the support of binary vectors as an alternative + to LSH when knn search speed is the criterium. It will also outperform LSH when processing + directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values + for most of the dimensions. It is recommended to set more than one tree with binary data. : @code struct KMeansIndexParams : public IndexParams { @@ -201,6 +221,13 @@ public: int iterations = 11, flann_centers_init_t centers_init = CENTERS_RANDOM, float cb_index = 0.2 ); + + KMeansIndexParams( + int branching, + int iterations, + flann_centers_init_t centers_init, + float cb_index, + int trees ); }; @endcode - **CompositeIndexParams** When using a parameters object of this type the index created @@ -219,7 +246,8 @@ public: - **LshIndexParams** When using a parameters object of this type the index created uses multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd - International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007) : + International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007). 
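A short usage sketch for the index types documented above: a hierarchical k-means index built with an explicit tree count, and a multi-probe LSH index for binary descriptors. The five-argument KMeansIndexParams constructor is the overload documented in this hunk; the concrete parameter values and matrix names below are placeholders, not recommendations from the patch.

// Usage sketch only; descriptor matrices are assumed to come from a detector
// such as SIFT (CV_32F) or ORB/BRISK (CV_8U).
#include <opencv2/flann.hpp>

void buildFlannIndexes(const cv::Mat& floatDescriptors, const cv::Mat& binaryDescriptors)
{
    // Hierarchical k-means tree; the trailing argument is the tree count
    // provided by the extended constructor documented above.
    cv::flann::Index kmeansIndex(
        floatDescriptors,
        cv::flann::KMeansIndexParams(32, 11, cvflann::FLANN_CENTERS_RANDOM, 0.2f, 4));

    // Multi-probe LSH, intended for binary descriptors.
    cv::flann::Index lshIndex(
        binaryDescriptors,
        cv::flann::LshIndexParams(6, 12, 1),
        cvflann::FLANN_DIST_HAMMING);

    // HierarchicalClusteringIndexParams(branching, centers_init, trees, leaf_size)
    // can be passed the same way for float, integer or binary data.

    cv::Mat indices, dists;
    kmeansIndex.knnSearch(floatDescriptors.row(0), indices, dists, 5,
                          cv::flann::SearchParams(32));
}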
+ This algorithm is designed for binary vectors. : @code struct LshIndexParams : public IndexParams { diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h index 9d01644aad..b7a650ff00 100644 --- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h +++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h @@ -404,34 +404,16 @@ public: */ virtual ~HierarchicalClusteringIndex() { - free_elements(); - if (root!=NULL) { delete[] root; } if (indices!=NULL) { + free_indices(); delete[] indices; } } - - /** - * Release the inner elements of indices[] - */ - void free_elements() - { - if (indices!=NULL) { - for(int i=0; i::max)(); @@ -367,6 +382,13 @@ public: } cb_index_ = 0.4f; + root_ = new KMeansNodePtr[trees_]; + indices_ = new int*[trees_]; + + for (int i=0; i(); - std::memset(root_, 0, sizeof(KMeansNode)); + for (int i=0; i(); + std::memset(root_[i], 0, sizeof(KMeansNode)); - if(is_kdtree_distance::val || is_vector_space_distance::val) - { - computeNodeStatistics(root_, indices_, (unsigned int)size_); - computeClustering(root_, indices_, (int)size_, branching_,0); - } - else - { - computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_); - computeBitfieldClustering(root_, indices_, (int)size_, branching_,0); + if(is_kdtree_distance::val || is_vector_space_distance::val) { + computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_); + computeClustering(root_[i], indices_[i], (int)size_, branching_,0); + } + else { + computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_); + computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0); + } } } @@ -456,35 +481,43 @@ public: save_value(stream, iterations_); save_value(stream, memoryCounter_); save_value(stream, cb_index_); - save_value(stream, *indices_, (int)size_); - - save_tree(stream, root_); + save_value(stream, trees_); + for (int i=0; i& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE { - int maxChecks = get_param(searchParams,"checks",32); + const int maxChecks = get_param(searchParams,"checks",32); if (maxChecks==FLANN_CHECKS_UNLIMITED) { - findExactNN(root_, result, vec); + findExactNN(root_[0], result, vec); } else { // Priority queue storing intermediate branches in the best-bin-first search Heap* heap = new Heap((int)size_); int checks = 0; - findNN(root_, result, vec, checks, maxChecks, heap); + for (int i=0; i= maxChecks) && result.full()) + break; + } BranchSt branch; while (heap->popMin(branch) && (checkspivot), (int)veclen_); if (node->childs==NULL) { - int indices_offset = (int)(node->indices - indices_); + int indices_offset = (int)(node->indices - indices_[num]); save_value(stream, indices_offset); } else { for(int i=0; ichilds[i]); + save_tree(stream, node->childs[i], num); } } } - void load_tree(FILE* stream, KMeansNodePtr& node) + void load_tree(FILE* stream, KMeansNodePtr& node, int num) { node = pool_.allocate(); load_value(stream, *node); @@ -636,12 +672,12 @@ private: if (node->childs==NULL) { int indices_offset; load_value(stream, indices_offset); - node->indices = indices_ + indices_offset; + node->indices = indices_[num] + indices_offset; } else { node->childs = pool_.allocate(branching_); for(int i=0; ichilds[i]); + load_tree(stream, node->childs[i], num); } } } @@ -660,6 +696,32 @@ private: } } + void free_centers() + { + if (root_ != NULL) { + for(int i=0; ichilds==NULL) { - if (checks>=maxChecks) { - if 
(result.full()) return; + if ((checks>=maxChecks) && result.full()) { + return; } checks += node->size; for (int i=0; isize; ++i) { @@ -1397,6 +1497,9 @@ private: /** The branching factor used in the hierarchical k-means clustering */ int branching_; + /** Number of kmeans trees (default is one) */ + int trees_; + /** Maximum number of iterations to use when performing k-means clustering */ int iterations_; @@ -1432,12 +1535,12 @@ private: /** * The root node in the tree. */ - KMeansNodePtr root_; + KMeansNodePtr* root_; /** * Array of indices to vectors in the dataset. */ - int* indices_; + int** indices_; /** * The distance diff --git a/modules/imgcodecs/src/grfmt_jpeg2000.cpp b/modules/imgcodecs/src/grfmt_jpeg2000.cpp index f4bb09718d..0f80d89c8d 100644 --- a/modules/imgcodecs/src/grfmt_jpeg2000.cpp +++ b/modules/imgcodecs/src/grfmt_jpeg2000.cpp @@ -378,7 +378,7 @@ bool Jpeg2KDecoder::readComponent8u( uchar *data, void *_buffer, for( y = 0; y < yend - ystart; ) { - jas_seqent_t* pix_row = &jas_matrix_get( buffer, y / ystep, 0 ); + jas_seqent_t* pix_row = jas_matrix_getref( buffer, y / ystep, 0 ); uchar* dst = data + (y - yoffset) * step - xoffset; if( xstep == 1 ) @@ -444,7 +444,7 @@ bool Jpeg2KDecoder::readComponent16u( unsigned short *data, void *_buffer, for( y = 0; y < yend - ystart; ) { - jas_seqent_t* pix_row = &jas_matrix_get( buffer, y / ystep, 0 ); + jas_seqent_t* pix_row = jas_matrix_getref( buffer, y / ystep, 0 ); ushort* dst = data + (y - yoffset) * step - xoffset; if( xstep == 1 ) diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index 6c22ea4f8f..4ac0d9db50 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -2310,7 +2310,7 @@ CV_EXPORTS_W void warpAffine( InputArray src, OutputArray dst, const Scalar& borderValue = Scalar()); /** @example samples/cpp/warpPerspective_demo.cpp -An example program shows using cv::findHomography and cv::warpPerspective for image warping +An example program shows using cv::getPerspectiveTransform and cv::warpPerspective for image warping */ /** @brief Applies a perspective transformation to an image. diff --git a/modules/stitching/include/opencv2/stitching.hpp b/modules/stitching/include/opencv2/stitching.hpp index 016e7d8023..f6e7f70172 100644 --- a/modules/stitching/include/opencv2/stitching.hpp +++ b/modules/stitching/include/opencv2/stitching.hpp @@ -272,7 +272,7 @@ public: @param pano Final pano. @return Status code. 
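Marking composePanorama(images, pano) with CV_WRAP above makes the overload callable from the bindings; the registration-then-compositing flow it enables looks like this in C++ (file names are placeholders), and the Python tests added below exercise the same sequence through cv.Stitcher.

// Two-step stitching sketch: estimate the camera transforms once, then compose.
#include <opencv2/imgcodecs.hpp>
#include <opencv2/stitching.hpp>
#include <vector>

int main()
{
    std::vector<cv::Mat> images = { cv::imread("a1.png"), cv::imread("a2.png") };

    cv::Ptr<cv::Stitcher> stitcher = cv::Stitcher::create(cv::Stitcher::PANORAMA);

    cv::Mat pano;
    if (stitcher->estimateTransform(images) != cv::Stitcher::OK)
        return 1;                               // registration failed
    if (stitcher->composePanorama(images, pano) != cv::Stitcher::OK)
        return 1;                               // compositing failed

    return cv::imwrite("pano.png", pano) ? 0 : 1;
}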
*/ - Status composePanorama(InputArrayOfArrays images, OutputArray pano); + CV_WRAP Status composePanorama(InputArrayOfArrays images, OutputArray pano); /** @overload */ CV_WRAP Status stitch(InputArrayOfArrays images, OutputArray pano); diff --git a/modules/stitching/misc/python/test/test_stitching.py b/modules/stitching/misc/python/test/test_stitching.py index 89c701f018..719f0583f2 100644 --- a/modules/stitching/misc/python/test/test_stitching.py +++ b/modules/stitching/misc/python/test/test_stitching.py @@ -19,6 +19,7 @@ class stitching_test(NewOpenCVTests): self.assertAlmostEqual(pano.shape[0], 685, delta=100, msg="rows: %r" % list(pano.shape)) self.assertAlmostEqual(pano.shape[1], 1025, delta=100, msg="cols: %r" % list(pano.shape)) + class stitching_detail_test(NewOpenCVTests): def test_simple(self): @@ -82,5 +83,37 @@ class stitching_detail_test(NewOpenCVTests): timelapser = cv.detail.Timelapser_createDefault(cv.detail.Timelapser_CROP); self.assertIsNotNone(timelapser) + +class stitching_compose_panorama_test_no_args(NewOpenCVTests): + + def test_simple(self): + + img1 = self.get_sample('stitching/a1.png') + img2 = self.get_sample('stitching/a2.png') + + stitcher = cv.Stitcher.create(cv.Stitcher_PANORAMA) + + stitcher.estimateTransform((img1, img2)) + + result, _ = stitcher.composePanorama() + + assert result == 0 + + +class stitching_compose_panorama_args(NewOpenCVTests): + + def test_simple(self): + + img1 = self.get_sample('stitching/a1.png') + img2 = self.get_sample('stitching/a2.png') + + stitcher = cv.Stitcher.create(cv.Stitcher_PANORAMA) + + stitcher.estimateTransform((img1, img2)) + result, _ = stitcher.composePanorama((img1, img2)) + + assert result == 0 + + if __name__ == '__main__': NewOpenCVTests.bootstrap() diff --git a/samples/cpp/asift.cpp b/samples/cpp/asift.cpp new file mode 100644 index 0000000000..568954058d --- /dev/null +++ b/samples/cpp/asift.cpp @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace cv; + +static void help(char** argv) +{ + cout + << "This is a sample usage of AffineFeature detector/extractor.\n" + << "And this is a C++ version of samples/python/asift.py\n" + << "Usage: " << argv[0] << "\n" + << " [ --feature= ] # Feature to use.\n" + << " [ --flann ] # use Flann-based matcher instead of bruteforce.\n" + << " [ --maxlines= ] # The maximum number of lines in visualizing the matching result.\n" + << " [ --image1= ]\n" + << " [ --image2= ] # Path to images to compare." 
+ << endl; +} + +static double timer() +{ + return getTickCount() / getTickFrequency(); +} + +int main(int argc, char** argv) +{ + vector fileName; + cv::CommandLineParser parser(argc, argv, + "{help h ||}" + "{feature|brisk|}" + "{flann||}" + "{maxlines|50|}" + "{image1|aero1.jpg|}{image2|aero3.jpg|}"); + if (parser.has("help")) + { + help(argv); + return 0; + } + string feature = parser.get("feature"); + bool useFlann = parser.has("flann"); + int maxlines = parser.get("maxlines"); + fileName.push_back(samples::findFile(parser.get("image1"))); + fileName.push_back(samples::findFile(parser.get("image2"))); + if (!parser.check()) + { + parser.printErrors(); + cout << "See --help (or missing '=' between argument name and value?)" << endl; + return 1; + } + + Mat img1 = imread(fileName[0], IMREAD_GRAYSCALE); + Mat img2 = imread(fileName[1], IMREAD_GRAYSCALE); + if (img1.empty()) + { + cerr << "Image " << fileName[0] << " is empty or cannot be found" << endl; + return 1; + } + if (img2.empty()) + { + cerr << "Image " << fileName[1] << " is empty or cannot be found" << endl; + return 1; + } + + Ptr backend; + Ptr matcher; + + if (feature == "sift") + { + backend = SIFT::create(); + if (useFlann) + matcher = DescriptorMatcher::create("FlannBased"); + else + matcher = DescriptorMatcher::create("BruteForce"); + } + else if (feature == "orb") + { + backend = ORB::create(); + if (useFlann) + matcher = makePtr(makePtr(6, 12, 1)); + else + matcher = DescriptorMatcher::create("BruteForce-Hamming"); + } + else if (feature == "brisk") + { + backend = BRISK::create(); + if (useFlann) + matcher = makePtr(makePtr(6, 12, 1)); + else + matcher = DescriptorMatcher::create("BruteForce-Hamming"); + } + else + { + cerr << feature << " is not supported. See --help" << endl; + return 1; + } + + cout << "extracting with " << feature << "..." << endl; + Ptr ext = AffineFeature::create(backend); + vector kp1, kp2; + Mat desc1, desc2; + + ext->detectAndCompute(img1, Mat(), kp1, desc1); + ext->detectAndCompute(img2, Mat(), kp2, desc2); + cout << "img1 - " << kp1.size() << " features, " + << "img2 - " << kp2.size() << " features" + << endl; + + cout << "matching with " << (useFlann ? "flann" : "bruteforce") << "..." << endl; + double start = timer(); + // match and draw + vector< vector > rawMatches; + vector p1, p2; + vector distances; + matcher->knnMatch(desc1, desc2, rawMatches, 2); + // filter_matches + for (size_t i = 0; i < rawMatches.size(); i++) + { + const vector& m = rawMatches[i]; + if (m.size() == 2 && m[0].distance < m[1].distance * 0.75) + { + p1.push_back(kp1[m[0].queryIdx].pt); + p2.push_back(kp2[m[0].trainIdx].pt); + distances.push_back(m[0].distance); + } + } + vector status; + vector< pair > pointPairs; + Mat H = findHomography(p1, p2, status, RANSAC); + int inliers = 0; + for (size_t i = 0; i < status.size(); i++) + { + if (status[i]) + { + pointPairs.push_back(make_pair(p1[i], p2[i])); + distances[inliers] = distances[i]; + // CV_Assert(inliers <= (int)i); + inliers++; + } + } + distances.resize(inliers); + + cout << "execution time: " << fixed << setprecision(2) << (timer()-start)*1000 << " ms" << endl; + cout << inliers << " / " << status.size() << " inliers/matched" << endl; + + cout << "visualizing..." 
<< endl; + vector indices(inliers); + cv::sortIdx(distances, indices, SORT_EVERY_ROW+SORT_ASCENDING); + + // explore_match + int h1 = img1.size().height; + int w1 = img1.size().width; + int h2 = img2.size().height; + int w2 = img2.size().width; + Mat vis = Mat::zeros(max(h1, h2), w1+w2, CV_8U); + img1.copyTo(Mat(vis, Rect(0, 0, w1, h1))); + img2.copyTo(Mat(vis, Rect(w1, 0, w2, h2))); + cvtColor(vis, vis, COLOR_GRAY2BGR); + + vector corners(4); + corners[0] = Point2f(0, 0); + corners[1] = Point2f((float)w1, 0); + corners[2] = Point2f((float)w1, (float)h1); + corners[3] = Point2f(0, (float)h1); + vector icorners; + perspectiveTransform(corners, corners, H); + transform(corners, corners, Matx23f(1,0,(float)w1,0,1,0)); + Mat(corners).convertTo(icorners, CV_32S); + polylines(vis, icorners, true, Scalar(255,255,255)); + + for (int i = 0; i < min(inliers, maxlines); i++) + { + int idx = indices[i]; + const Point2f& pi1 = pointPairs[idx].first; + const Point2f& pi2 = pointPairs[idx].second; + circle(vis, pi1, 2, Scalar(0,255,0), -1); + circle(vis, pi2 + Point2f((float)w1,0), 2, Scalar(0,255,0), -1); + line(vis, pi1, pi2 + Point2f((float)w1,0), Scalar(0,255,0)); + } + if (inliers > maxlines) + cout << "only " << maxlines << " inliers are visualized" << endl; + imshow("affine find_obj", vis); + + // Mat vis2 = Mat::zeros(max(h1, h2), w1+w2, CV_8U); + // Mat warp1; + // warpPerspective(img1, warp1, H, Size(w1, h1)); + // warp1.copyTo(Mat(vis2, Rect(0, 0, w1, h1))); + // img2.copyTo(Mat(vis2, Rect(w1, 0, w2, h2))); + // imshow("warped", vis2); + + waitKey(); + cout << "done" << endl; + return 0; +} diff --git a/samples/cpp/warpPerspective_demo.cpp b/samples/cpp/warpPerspective_demo.cpp index 4a9069f5d2..947abd4359 100644 --- a/samples/cpp/warpPerspective_demo.cpp +++ b/samples/cpp/warpPerspective_demo.cpp @@ -8,7 +8,6 @@ #include "opencv2/imgproc.hpp" #include "opencv2/imgcodecs.hpp" #include "opencv2/highgui.hpp" -#include "opencv2/calib3d.hpp" #include using namespace std; @@ -36,6 +35,7 @@ Mat warping(Mat image, Size warped_image_size, vector< Point2f> srcPoints, vecto String windowTitle = "Perspective Transformation Demo"; String labels[4] = { "TL","TR","BR","BL" }; vector< Point2f> roi_corners; +vector< Point2f> midpoints(4); vector< Point2f> dst_corners(4); int roiIndex = 0; bool dragging; @@ -99,21 +99,26 @@ int main(int argc, char** argv) imshow( windowTitle, image ); + midpoints[0] = (roi_corners[0] + roi_corners[1]) / 2; + midpoints[1] = (roi_corners[1] + roi_corners[2]) / 2; + midpoints[2] = (roi_corners[2] + roi_corners[3]) / 2; + midpoints[3] = (roi_corners[3] + roi_corners[0]) / 2; + dst_corners[0].x = 0; dst_corners[0].y = 0; - dst_corners[1].x = (float)std::max(norm(roi_corners[0] - roi_corners[1]), norm(roi_corners[2] - roi_corners[3])); + dst_corners[1].x = (float)norm(midpoints[1] - midpoints[3]); dst_corners[1].y = 0; - dst_corners[2].x = (float)std::max(norm(roi_corners[0] - roi_corners[1]), norm(roi_corners[2] - roi_corners[3])); - dst_corners[2].y = (float)std::max(norm(roi_corners[1] - roi_corners[2]), norm(roi_corners[3] - roi_corners[0])); + dst_corners[2].x = dst_corners[1].x; + dst_corners[2].y = (float)norm(midpoints[0] - midpoints[2]); dst_corners[3].x = 0; - dst_corners[3].y = (float)std::max(norm(roi_corners[1] - roi_corners[2]), norm(roi_corners[3] - roi_corners[0])); + dst_corners[3].y = dst_corners[2].y; Size warped_image_size = Size(cvRound(dst_corners[2].x), cvRound(dst_corners[2].y)); - Mat H = findHomography(roi_corners, dst_corners); //get homography + Mat M = 
getPerspectiveTransform(roi_corners, dst_corners); Mat warped_image; - warpPerspective(original_image, warped_image, H, warped_image_size); // do perspective transformation + warpPerspective(original_image, warped_image, M, warped_image_size); // do perspective transformation imshow("Warped Image", warped_image); } diff --git a/samples/data/opencv-logo-white.png b/samples/data/opencv-logo-white.png index 3c7098459e..a683e3569f 100644 Binary files a/samples/data/opencv-logo-white.png and b/samples/data/opencv-logo-white.png differ diff --git a/samples/data/opencv-logo.png b/samples/data/opencv-logo.png index bc71a2ae50..995bdc0fa6 100644 Binary files a/samples/data/opencv-logo.png and b/samples/data/opencv-logo.png differ diff --git a/samples/winrt/ImageManipulations/assets/StoreLogo.png b/samples/winrt/ImageManipulations/assets/StoreLogo.png index af64bf00ad..0fb23ff2cf 100644 Binary files a/samples/winrt/ImageManipulations/assets/StoreLogo.png and b/samples/winrt/ImageManipulations/assets/StoreLogo.png differ diff --git a/samples/winrt/ImageManipulations/assets/opencv-logo-150.png b/samples/winrt/ImageManipulations/assets/opencv-logo-150.png index ea685d651a..8f447ad30c 100644 Binary files a/samples/winrt/ImageManipulations/assets/opencv-logo-150.png and b/samples/winrt/ImageManipulations/assets/opencv-logo-150.png differ diff --git a/samples/winrt/ImageManipulations/assets/opencv-logo-30.png b/samples/winrt/ImageManipulations/assets/opencv-logo-30.png index efaf5468a1..449be5858e 100644 Binary files a/samples/winrt/ImageManipulations/assets/opencv-logo-30.png and b/samples/winrt/ImageManipulations/assets/opencv-logo-30.png differ diff --git a/samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png b/samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png index af64bf00ad..0fb23ff2cf 100644 Binary files a/samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png and b/samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png differ diff --git a/samples/winrt/JavaScript/images/logo.scale-100.png b/samples/winrt/JavaScript/images/logo.scale-100.png index ea685d651a..8f447ad30c 100644 Binary files a/samples/winrt/JavaScript/images/logo.scale-100.png and b/samples/winrt/JavaScript/images/logo.scale-100.png differ diff --git a/samples/winrt/JavaScript/images/smalllogo.scale-100.png b/samples/winrt/JavaScript/images/smalllogo.scale-100.png index efaf5468a1..f2d1877348 100644 Binary files a/samples/winrt/JavaScript/images/smalllogo.scale-100.png and b/samples/winrt/JavaScript/images/smalllogo.scale-100.png differ diff --git a/samples/winrt/JavaScript/images/windows-sdk.png b/samples/winrt/JavaScript/images/windows-sdk.png index af64bf00ad..0fb23ff2cf 100644 Binary files a/samples/winrt/JavaScript/images/windows-sdk.png and b/samples/winrt/JavaScript/images/windows-sdk.png differ diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png index ea685d651a..8f447ad30c 100644 Binary files a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png and b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png differ diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png index efaf5468a1..449be5858e 100644 Binary files a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png and 
b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png differ diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png index af64bf00ad..0fb23ff2cf 100644 Binary files a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png and b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png differ
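The warpPerspective_demo change above replaces findHomography with getPerspectiveTransform: with exactly four point correspondences the perspective map is determined exactly, so a least-squares/RANSAC estimator adds nothing. A minimal sketch of the swapped-in call (corner coordinates are made up for illustration):

// getPerspectiveTransform() computes the exact 3x3 mapping from four
// correspondences; findHomography() is only needed when there are more points
// and possible outliers.
#include <opencv2/imgproc.hpp>
#include <vector>

cv::Mat rectifyQuad(const cv::Mat& src)
{
    std::vector<cv::Point2f> roiCorners = {
        {78, 92}, {420, 60}, {450, 380}, {60, 400}};   // TL, TR, BR, BL in the source image
    std::vector<cv::Point2f> dstCorners = {
        {0, 0}, {400, 0}, {400, 320}, {0, 320}};       // target rectangle

    cv::Mat M = cv::getPerspectiveTransform(roiCorners, dstCorners);
    cv::Mat warped;
    cv::warpPerspective(src, warped, M, cv::Size(400, 320));
    return warped;
}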