Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2025-12-06 12:19:50 +01:00 · 2020-05-06 19:52:58 +00:00 · 2020-05-06 19:52:58 +00:00 · 09799402f9
commit 09799402f9
parent c722625f28 5813d2439c
10 changed files with 229 additions and 67 deletions
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@ -101,7 +101,7 @@ TEST_F(fisheyeTest, projectPoints)
    EXPECT_MAT_NEAR(distorted0, distorted2, 1e-10);
 }

-TEST_F(fisheyeTest, DISABLED_undistortImage)
+TEST_F(fisheyeTest, undistortImage)
 {
    cv::Matx33d theK = this->K;
    cv::Mat theD = cv::Mat(this->D);
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@ -229,6 +229,10 @@ namespace cv {
                    {
                        activation_param.type = "Swish";
                    }
+                    else if (type == "mish")
+                    {
+                        activation_param.type = "Mish";
+                    }
                    else if (type == "logistic")
                    {
                        activation_param.type = "Sigmoid";
@ -436,7 +440,7 @@ namespace cv {
                    fused_layer_names.push_back(last_layer);
                }

-                void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors, float thresh, float nms_threshold)
+                void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors, float thresh, float nms_threshold, float scale_x_y)
                {
                    cv::dnn::LayerParams region_param;
                    region_param.name = "Region-name";
@ -449,6 +453,7 @@ namespace cv {
                    region_param.set<bool>("logistic", true);
                    region_param.set<float>("thresh", thresh);
                    region_param.set<float>("nms_threshold", nms_threshold);
+                    region_param.set<float>("scale_x_y", scale_x_y);

                    std::vector<float> usedAnchors(numAnchors * 2);
                    for (int i = 0; i < numAnchors; ++i)
@ -786,6 +791,7 @@ namespace cv {
                        int num_of_anchors = getParam<int>(layer_params, "num", -1);
                        float thresh = getParam<float>(layer_params, "thresh", 0.2);
                        float nms_threshold = getParam<float>(layer_params, "nms_threshold", 0.4);
+                        float scale_x_y = getParam<float>(layer_params, "scale_x_y", 1.0);

                        std::string anchors_values = getParam<std::string>(layer_params, "anchors", std::string());
                        CV_Assert(!anchors_values.empty());
@ -798,7 +804,7 @@ namespace cv {
                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());

                        setParams.setPermute(false);
-                        setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold);
+                        setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold, scale_x_y);
                    }
                    else {
                        CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type);
@ -813,6 +819,10 @@ namespace cv {
                    {
                        setParams.setActivation("swish");
                    }
+                    else if (activation == "mish")
+                    {
+                        setParams.setActivation("mish");
+                    }
                    else if (activation == "logistic")
                    {
                        setParams.setActivation("logistic");
@ -935,8 +945,8 @@ namespace cv {
                    }

                    std::string activation = getParam<std::string>(layer_params, "activation", "linear");
-                    if(activation == "leaky" || activation == "swish" || activation == "logistic")
-                        ++cv_layers_counter;  // For ReLU, Swish, Sigmoid
+                    if(activation == "leaky" || activation == "swish" || activation == "mish" || activation == "logistic")
+                        ++cv_layers_counter;  // For ReLU, Swish, Mish, Sigmoid

                    if(!darknet_layers_counter)
                        tensor_shape.resize(1);
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@ -69,7 +69,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
 {
 public:
    int coords, classes, anchors, classfix;
-    float thresh, nmsThreshold;
+    float thresh, nmsThreshold, scale_x_y;
    bool useSoftmax, useLogistic;
 #ifdef HAVE_OPENCL
    UMat blob_umat;
@ -88,6 +88,7 @@ public:
        useSoftmax = params.get<bool>("softmax", false);
        useLogistic = params.get<bool>("logistic", false);
        nmsThreshold = params.get<float>("nms_threshold", 0.4);
+        scale_x_y = params.get<float>("scale_x_y", 1.0); // Yolov4

        CV_Assert(nmsThreshold >= 0.);
        CV_Assert(coords == 4);
@ -302,8 +303,10 @@ public:
                            if (classfix == -1 && scale < .5) scale = 0;  // if(t0 < 0.5) t0 = 0;
                            int box_index = index_sample_offset + index * cell_size;

-                            dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols;
-                            dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows;
+                            float x_tmp = (logistic_activate(srcData[box_index + 0]) - 0.5f) * scale_x_y + 0.5f;
+                            float y_tmp = (logistic_activate(srcData[box_index + 1]) - 0.5f) * scale_x_y + 0.5f;
+                            dstData[box_index + 0] = (x + x_tmp) / cols;
+                            dstData[box_index + 1] = (y + y_tmp) / rows;
                            dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
                            dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;

@ -471,6 +474,8 @@ public:
        auto shape_3d = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{boxes_shape.size()}, boxes_shape.data());

        ngraph::Shape box_broad_shape{1, (size_t)anchors, (size_t)h, (size_t)w};
+        auto scale_x_y_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &scale_x_y);
+        auto shift_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{0.5});

        std::shared_ptr<ngraph::Node> box_x;
        {
@ -478,6 +483,9 @@ public:
            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, cols});
            box_x = std::make_shared<ngraph::op::v1::StridedSlice>(input2d, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
            box_x = std::make_shared<ngraph::op::Sigmoid>(box_x);
+            box_x = std::make_shared<ngraph::op::v1::Subtract>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
+            box_x = std::make_shared<ngraph::op::v1::Multiply>(box_x, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
+            box_x = std::make_shared<ngraph::op::v1::Add>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
            box_x = std::make_shared<ngraph::op::v1::Reshape>(box_x, shape_3d, true);

            std::vector<float> x_indices(w * h * anchors);
@ -504,6 +512,9 @@ public:
            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{2, cols});
            box_y = std::make_shared<ngraph::op::v1::StridedSlice>(input2d, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
            box_y = std::make_shared<ngraph::op::Sigmoid>(box_y);
+            box_y = std::make_shared<ngraph::op::v1::Subtract>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
+            box_y = std::make_shared<ngraph::op::v1::Multiply>(box_y, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
+            box_y = std::make_shared<ngraph::op::v1::Add>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
            box_y = std::make_shared<ngraph::op::v1::Reshape>(box_y, shape_3d, true);

            std::vector<float> y_indices(h * anchors);
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -172,18 +172,19 @@ public:
        CV_Assert(inputs.size() == 1);
        const MatSize& inpShape = inputs[0].size;

+        finalSliceRanges = sliceRanges;
        if (sliceRanges.empty())
        {
            // Divide input blob on equal parts by axis.
            int outAxisSize = inpShape[axis] / outputs.size();
-            sliceRanges.resize(outputs.size(),
-                               std::vector<Range>(axis + 1, Range::all()));
+            finalSliceRanges.resize(outputs.size(),
+                                    std::vector<Range>(axis + 1, Range::all()));
            int prevSlice = 0;
            for (int i = 0; i < outputs.size(); ++i)
            {
-                sliceRanges[i][axis].start = prevSlice;
-                sliceRanges[i][axis].end = sliceRanges[i][axis].start + outAxisSize;
-                prevSlice = sliceRanges[i][axis].end;
+                finalSliceRanges[i][axis].start = prevSlice;
+                finalSliceRanges[i][axis].end = finalSliceRanges[i][axis].start + outAxisSize;
+                prevSlice = finalSliceRanges[i][axis].end;
            }
        }
        else
@ -191,16 +192,16 @@ public:

        for (int i = 0; i < outputs.size(); ++i)
        {
-            CV_Assert(sliceRanges[i].size() <= inpShape.dims());
+            CV_Assert(finalSliceRanges[i].size() <= inpShape.dims());
            // Fill the rest of ranges.
-            for (int j = sliceRanges[i].size(); j < inpShape.dims(); ++j)
+            for (int j = finalSliceRanges[i].size(); j < inpShape.dims(); ++j)
            {
-                sliceRanges[i].push_back(Range::all());
+                finalSliceRanges[i].push_back(Range::all());
            }
            // Clamp.
-            for (int j = 0; j < sliceRanges[i].size(); ++j)
+            for (int j = 0; j < finalSliceRanges[i].size(); ++j)
            {
-                sliceRanges[i][j] = clamp(sliceRanges[i][j], inpShape[j]);
+                finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]);
            }
        }
    }
@ -241,8 +242,8 @@ public:
            kernel.set(idx++, (int)(rows * cols));
            kernel.set(idx++, (int)inpMat.size[3]);
            kernel.set(idx++, (int)cols);
-            kernel.set(idx++, (int)sliceRanges[i][2].start);
-            kernel.set(idx++, (int)sliceRanges[i][3].start);
+            kernel.set(idx++, (int)finalSliceRanges[i][2].start);
+            kernel.set(idx++, (int)finalSliceRanges[i][3].start);
            kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
            bool ret = kernel.run(1, global, local, false);
            if (!ret)
@ -266,10 +267,10 @@ public:
        outputs_arr.getMatVector(outputs);

        const Mat& inpMat = inputs[0];
-        CV_Assert(outputs.size() == sliceRanges.size());
+        CV_Assert(outputs.size() == finalSliceRanges.size());
        for (size_t i = 0; i < outputs.size(); i++)
        {
-            inpMat(sliceRanges[i]).copyTo(outputs[i]);
+            inpMat(finalSliceRanges[i]).copyTo(outputs[i]);
        }
    }

@ -278,11 +279,11 @@ public:
 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
-        CV_Assert_N(sliceRanges.size() == 1, inputs.size() <= 2);
+        CV_Assert_N(finalSliceRanges.size() == 1, inputs.size() <= 2);

        std::vector<size_t> axes, offsets, dims;
        int from, to, step;
-        int numDims = sliceRanges[0].size();
+        int numDims = finalSliceRanges[0].size();
        if (preferableTarget == DNN_TARGET_MYRIAD)
        {
            from = axis;
@ -298,8 +299,8 @@ public:
        for (int i = from; i != to; i += step)
        {
            axes.push_back(i);
-            offsets.push_back(sliceRanges[0][i].start);
-            dims.push_back(sliceRanges[0][i].size());
+            offsets.push_back(finalSliceRanges[0][i].start);
+            dims.push_back(finalSliceRanges[0][i].size());
        }

        InferenceEngine::Builder::Layer ieLayer(name);
@ -315,7 +316,7 @@ public:
        {
            std::vector<size_t> outShape(numDims);
            for (int i = 0; i < numDims; ++i)
-                outShape[i] = sliceRanges[0][i].size();
+                outShape[i] = finalSliceRanges[0][i].size();

            ieLayer.getInputPorts()[1].setParameter("type", "weights");

@ -338,13 +339,13 @@ public:
    {
        CV_Assert_N(nodes.size() <= 2);
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        CV_Assert(sliceRanges[0].size() == ieInpNode->get_shape().size());
+        CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size());

        std::vector<int64_t> offsets, dims;
-        for (int i = 0; i < sliceRanges[0].size(); ++i)
+        for (int i = 0; i < finalSliceRanges[0].size(); ++i)
        {
-            offsets.push_back(sliceRanges[0][i].start);
-            dims.push_back(sliceRanges[0][i].end);
+            offsets.push_back(finalSliceRanges[0][i].start);
+            dims.push_back(finalSliceRanges[0][i].end);
        }

        auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
@ -384,6 +385,10 @@ public:
    }
 #endif

+
+protected:
+    // The actual non-negative ranges derived from @p sliceRanges; they depend on the input size.
+    std::vector<std::vector<Range> > finalSliceRanges;
 };

 class CropLayerImpl CV_FINAL : public SliceLayerImpl
@ -447,18 +452,18 @@ public:
                offset_final[i] = offset[i - start_axis];
        }

-        sliceRanges.resize(1);
-        sliceRanges[0].resize(dims);
+        finalSliceRanges.resize(1);
+        finalSliceRanges[0].resize(dims);
        for (int i = 0; i < start_axis; i++)
        {
-            sliceRanges[0][i] = Range(0, inpBlob.size[i]);
+            finalSliceRanges[0][i] = Range(0, inpBlob.size[i]);
        }
        for (int i = start_axis; i < dims; i++)
        {
            if (offset_final[i] < 0 || offset_final[i] + inpSzBlob.size[i] > inpBlob.size[i])
                CV_Error(Error::StsBadArg, "invalid crop parameters or blob sizes");

-            sliceRanges[0][i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size[i]);
+            finalSliceRanges[0][i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size[i]);
        }
    }

--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@ -549,6 +549,11 @@ TEST_P(Test_Darknet_layers, upsample)
    testDarknetLayer("upsample");
 }

+// Runs the Darknet layer check used by the neighbouring tests on the "mish" test case.
+// NOTE(review): the meaning of the second argument (true) is defined by testDarknetLayer,
+// which is not visible in this hunk - confirm before changing it.
+TEST_P(Test_Darknet_layers, mish)
+{
+    testDarknetLayer("mish", true);
+}
+
 TEST_P(Test_Darknet_layers, avgpool_softmax)
 {
    testDarknetLayer("avgpool_softmax");
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -1791,4 +1791,38 @@ TEST_P(Layer_Test_Resize, change_input)

 INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Resize, dnnBackendsAndTargets());

+// Checks that a Slice layer configured once through "begin"/"end" parameters keeps
+// working when the same network is fed inputs of different sizes.
+typedef testing::TestWithParam<tuple<Backend, Target> > Layer_Test_Slice;
+TEST_P(Layer_Test_Slice, variable_input_shape)
+{
+    int backendId = get<0>(GetParam());
+    int targetId = get<1>(GetParam());
+
+    // Four begin/end entries; with these values the test expects the layer output
+    // to equal its input (see normAssert below).
+    int begin[] = {0, 0, 0, 0};
+    int end[] = {-1, -1, -1, -1};
+
+    // Network consisting of a single Slice layer.
+    Net net;
+    LayerParams lp;
+    lp.type = "Slice";
+    lp.name = "testLayer";
+    lp.set("begin", DictValue::arrayInt<int*>(&begin[0], 4));
+    lp.set("end", DictValue::arrayInt<int*>(&end[0], 4));
+    net.addLayerToPrev(lp.name, lp.type, lp);
+
+    // Two passes through the same net: a 4x5 (rows x cols) image first, then a 5x6 one.
+    for (int i = 0; i < 2; ++i)
+    {
+        Mat inp(4 + i, 5 + i, CV_8UC1);
+        randu(inp, 0, 255);
+        inp = blobFromImage(inp);
+
+        net.setInput(inp);
+        net.setPreferableBackend(backendId);
+        net.setPreferableTarget(targetId);
+        Mat out = net.forward();
+
+        // The output must match the input blob on every pass.
+        normAssert(out, inp);
+    }
+}
+
+// Run the test for every backend/target pair returned by dnnBackendsAndTargets().
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Slice, dnnBackendsAndTargets());
+
 }} // namespace
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -559,7 +559,7 @@ public:

 TEST_P(Test_ONNX_nets, Alexnet)
 {
-#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32))
    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
 #else
    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
@ -623,7 +623,7 @@ TEST_P(Test_ONNX_nets, Googlenet)

 TEST_P(Test_ONNX_nets, CaffeNet)
 {
-#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32))
    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
 #else
    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
@ -639,7 +639,7 @@ TEST_P(Test_ONNX_nets, CaffeNet)

 TEST_P(Test_ONNX_nets, RCNN_ILSVRC13)
 {
-#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32))
    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
 #else
    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@ -55,19 +55,19 @@

 namespace
 {
-static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
+static void calcScharrDeriv(const cv::Mat& src, cv::Mat& dst)
 {
    using namespace cv;
    using cv::detail::deriv_type;
    int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth();
    CV_Assert(depth == CV_8U);
    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
-    parallel_for_(Range(0, rows), cv::detail::SharrDerivInvoker(src, dst), cv::getNumThreads());
+    parallel_for_(Range(0, rows), cv::detail::ScharrDerivInvoker(src, dst), cv::getNumThreads());
 }

 }//namespace

-void cv::detail::SharrDerivInvoker::operator()(const Range& range) const
+void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
 {
    using cv::detail::deriv_type;
    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn;
@ -801,7 +801,7 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si
                deriv.create(sz.height + winSize.height*2, sz.width + winSize.width*2, derivType);

            Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height));
-            calcSharrDeriv(thisLevel, derivI);
+            calcScharrDeriv(thisLevel, derivI);

            if(derivBorder != BORDER_TRANSPARENT)
                copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder|BORDER_ISOLATED);
@ -1382,7 +1382,7 @@ void SparsePyrLKOpticalFlowImpl::calc( InputArray _prevImg, InputArray _nextImg,
            Mat _derivI( imgSize.height + winSize.height*2,
                imgSize.width + winSize.width*2, derivIBuf.type(), derivIBuf.ptr() );
            derivI = _derivI(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));
-            calcSharrDeriv(prevPyr[level * lvlStep1], derivI);
+            calcScharrDeriv(prevPyr[level * lvlStep1], derivI);
            copyMakeBorder(derivI, _derivI, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_CONSTANT|BORDER_ISOLATED);
        }
        else
--- a/modules/video/src/lkpyramid.hpp
+++ b/modules/video/src/lkpyramid.hpp
@ -7,9 +7,9 @@ namespace detail

    typedef short deriv_type;

-    struct SharrDerivInvoker : ParallelLoopBody
+    struct ScharrDerivInvoker : ParallelLoopBody
    {
-        SharrDerivInvoker(const Mat& _src, const Mat& _dst)
+        ScharrDerivInvoker(const Mat& _src, const Mat& _dst)
            : src(_src), dst(_dst)
        { }

--- a/samples/dnn/text_detection.cpp
+++ b/samples/dnn/text_detection.cpp
@ -1,3 +1,20 @@
+/*
+    Text detection model: https://github.com/argman/EAST
+    Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
+
+    Text recognition model taken from here: https://github.com/meijieru/crnn.pytorch
+    How to convert from pth to onnx:
+    Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
+
+    import torch
+    import models.crnn as crnn
+
+    model = crnn.CRNN(32, 1, 37, 256)
+    model.load_state_dict(torch.load('crnn.pth'))
+    dummy_input = torch.randn(1, 1, 32, 100)
+    torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
+*/
+
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/dnn.hpp>
@ -8,21 +25,26 @@ using namespace cv::dnn;
 const char* keys =
    "{ help  h     | | Print help message. }"
    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
-    "{ model m     | | Path to a binary .pb file contains trained network.}"
+    "{ model m     | | Path to a binary .pb file contains trained detector network.}"
+    "{ ocr         | | Path to a binary .pb or .onnx file contains trained recognition network.}"
    "{ width       | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }"
    "{ height      | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }"
    "{ thr         | 0.5 | Confidence threshold. }"
    "{ nms         | 0.4 | Non-maximum suppression threshold. }";

-void decode(const Mat& scores, const Mat& geometry, float scoreThresh,
-            std::vector<RotatedRect>& detections, std::vector<float>& confidences);
+void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh,
+                         std::vector<RotatedRect>& detections, std::vector<float>& confidences);
+
+void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result);
+
+void decodeText(const Mat& scores, std::string& text);

 int main(int argc, char** argv)
 {
    // Parse command line arguments.
    CommandLineParser parser(argc, argv, keys);
    parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
-                  "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
+                 "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
@ -33,7 +55,8 @@ int main(int argc, char** argv)
    float nmsThreshold = parser.get<float>("nms");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
-    String model = parser.get<String>("model");
+    String modelDecoder = parser.get<String>("model");
+    String modelRecognition = parser.get<String>("ocr");

    if (!parser.check())
    {
@ -41,17 +64,19 @@ int main(int argc, char** argv)
        return 1;
    }

-    CV_Assert(!model.empty());
+    CV_Assert(!modelDecoder.empty());

-    // Load network.
-    Net net = readNet(model);
+    // Load networks.
+    Net detector = readNet(modelDecoder);
+    Net recognizer;
+
+    if (!modelRecognition.empty())
+        recognizer = readNet(modelRecognition);

    // Open a video file or an image file or a camera stream.
    VideoCapture cap;
-    if (parser.has("input"))
-        cap.open(parser.get<String>("input"));
-    else
-        cap.open(0);
+    bool openSuccess = parser.has("input") ? cap.open(parser.get<String>("input")) : cap.open(0);
+    CV_Assert(openSuccess);

    static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
    namedWindow(kWinName, WINDOW_NORMAL);
@ -62,6 +87,7 @@ int main(int argc, char** argv)
    outNames[1] = "feature_fusion/concat_3";

    Mat frame, blob;
+    TickMeter tickMeter;
    while (waitKey(1) < 0)
    {
        cap >> frame;
@ -72,8 +98,10 @@ int main(int argc, char** argv)
        }

        blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), Scalar(123.68, 116.78, 103.94), true, false);
-        net.setInput(blob);
-        net.forward(outs, outNames);
+        detector.setInput(blob);
+        tickMeter.start();
+        detector.forward(outs, outNames);
+        tickMeter.stop();

        Mat scores = outs[0];
        Mat geometry = outs[1];
@ -81,43 +109,64 @@ int main(int argc, char** argv)
        // Decode predicted bounding boxes.
        std::vector<RotatedRect> boxes;
        std::vector<float> confidences;
-        decode(scores, geometry, confThreshold, boxes, confidences);
+        decodeBoundingBoxes(scores, geometry, confThreshold, boxes, confidences);

        // Apply non-maximum suppression procedure.
        std::vector<int> indices;
        NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);

-        // Render detections.
        Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight);
+
+        // Render text.
        for (size_t i = 0; i < indices.size(); ++i)
        {
            RotatedRect& box = boxes[indices[i]];

            Point2f vertices[4];
            box.points(vertices);
+
            for (int j = 0; j < 4; ++j)
            {
                vertices[j].x *= ratio.x;
                vertices[j].y *= ratio.y;
            }
+
+            if (!modelRecognition.empty())
+            {
+                Mat cropped;
+                fourPointsTransform(frame, vertices, cropped);
+
+                cvtColor(cropped, cropped, cv::COLOR_BGR2GRAY);
+
+                Mat blobCrop = blobFromImage(cropped, 1.0/127.5, Size(), Scalar::all(127.5));
+                recognizer.setInput(blobCrop);
+
+                tickMeter.start();
+                Mat result = recognizer.forward();
+                tickMeter.stop();
+
+                std::string wordRecognized = "";
+                decodeText(result, wordRecognized);
+                putText(frame, wordRecognized, vertices[1], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255));
+            }
+
            for (int j = 0; j < 4; ++j)
                line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1);
        }

        // Put efficiency information.
-        std::vector<double> layersTimes;
-        double freq = getTickFrequency() / 1000;
-        double t = net.getPerfProfile(layersTimes) / freq;
-        std::string label = format("Inference time: %.2f ms", t);
+        std::string label = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
+
+        tickMeter.reset();
    }
    return 0;
 }

-void decode(const Mat& scores, const Mat& geometry, float scoreThresh,
-            std::vector<RotatedRect>& detections, std::vector<float>& confidences)
+void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh,
+                         std::vector<RotatedRect>& detections, std::vector<float>& confidences)
 {
    detections.clear();
    CV_Assert(scores.dims == 4); CV_Assert(geometry.dims == 4); CV_Assert(scores.size[0] == 1);
@ -159,3 +208,51 @@ void decode(const Mat& scores, const Mat& geometry, float scoreThresh,
        }
    }
 }
+
+// Warps the quadrilateral described by `vertices` in `frame` into a fixed-size
+// patch (width 100, height 32) written to `result`.
+// The four source points are mapped, in order, to the bottom-left, top-left,
+// top-right and bottom-right corners of the patch.
+void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result)
+{
+    const Size patchSize(100, 32);
+    const float lastCol = static_cast<float>(patchSize.width - 1);
+    const float lastRow = static_cast<float>(patchSize.height - 1);
+
+    Point2f corners[4];
+    corners[0] = Point2f(0.f, lastRow);
+    corners[1] = Point2f(0.f, 0.f);
+    corners[2] = Point2f(lastCol, 0.f);
+    corners[3] = Point2f(lastCol, lastRow);
+
+    // 3x3 perspective transform taking the source quadrilateral onto the patch corners.
+    const Mat perspective = getPerspectiveTransform(vertices, corners);
+    warpPerspective(frame, result, perspective, patchSize);
+}
+
+// Greedy decoding of `scores`: for every row take the column index of the maximum,
+// map index k (1..36) to the k-th alphabet character and anything else to a
+// separator, then append to `text` each non-separator symbol that differs from
+// the symbol of the previous row. `text` is appended to, not cleared.
+void decodeText(const Mat& scores, std::string& text)
+{
+    static const std::string alphabet = "0123456789abcdefghijklmnopqrstuvwxyz";
+    const char separator = '-';
+
+    const int steps = scores.size[0];
+    Mat perStep = scores.reshape(1, steps);
+
+    std::vector<char> symbols;
+    symbols.reserve(steps);
+
+    for (int r = 0; r < perStep.rows; ++r)
+    {
+        Point best;
+        minMaxLoc(perStep.row(r), 0, 0, 0, &best);
+        const bool inAlphabet = best.x > 0 && static_cast<size_t>(best.x) <= alphabet.size();
+        symbols.push_back(inAlphabet ? alphabet[best.x - 1] : separator);
+    }
+
+    for (size_t k = 0; k < symbols.size(); ++k)
+    {
+        if (symbols[k] == separator)
+            continue;
+        // Collapse a run of identical consecutive symbols into one character.
+        if (k > 0 && symbols[k - 1] == symbols[k])
+            continue;
+        text += symbols[k];
+    }
+}