命令

Question

我这里有一个程序，用于对包含文本的图像进行二值化。程序中有一个可选的裁剪功能，它会先检测文本轮廓再进行裁剪。但在某些情况下，它并没有检测到所有的文本轮廓。

如果使用-d参数，程序会在输出图像中将文本轮廓绘制为矩形而不进行裁剪

文本轮廓检测（以及矩形绘制）的逻辑位于 detect_text_box 函数中。

命令

/var/txtbin /var/in.png -d /var/out.png

代码

/*
 *  Compile
 *  # g++ txtbin.cpp -o txtbin `pkg-config opencv --cflags --libs`
 *
 *  Get opencv version
 *  # pkg-config --modversion opencv
 *
 *  Run
 *  # ./txtbin input.jpg output.png
 */

#include "string"
#include "fstream"
#include "/var/bin/opencv/include/opencv2/opencv.hpp"
//#include "/usr/include/opencv2/opencv.hpp"
#include "/usr/include/boost/tuple/tuple.hpp"

using namespace std;
using namespace cv;
using namespace boost;

/**
 * Builds a per-block map of Img and turns it into a full-size image stored in Res.
 *
 * For every block whose standard deviation exceeds `contrast`, the block mean is recorded in a
 * low-resolution map (all other cells stay 0). That map is thresholded into a mask, a shrunken
 * 8-bit copy of the image is inpainted under the mask, and the result is resized back to the
 * size of Img and rescaled by 1/255.
 *
 * @param Img        input image (converted internally to CV_32FC1)
 *                   NOTE(review): the x255 / x(1/255) conversions suggest values in [0, 1] — confirm
 * @param Res        output image, CV_32FC1, same size as Img
 * @param blockSide  set greater for larger fonts in image and vice versa
 * @param contrast   set smaller for lower contrast image
 */
void CalcBlockMeanVariance(Mat& Img, Mat& Res, float blockSide=21, float contrast=0.01){
    Mat source32f;
    Img.convertTo(source32f, CV_32FC1);

    // One cell per block; cells that the loops below never visit keep the value 0.
    Res = Mat::zeros(Img.rows / blockSide, Img.cols / blockSide, CV_32FC1);

    Scalar blockMean, blockStdDev;

    for(int row = 0; row < Img.rows - blockSide; row += blockSide){
        for(int col = 0; col < Img.cols - blockSide; col += blockSide){
            Mat block = source32f(Range(row, row + blockSide + 1), Range(col, col + blockSide + 1));
            meanStdDev(block, blockMean, blockStdDev);

            // Keep the mean only where the block shows enough variation.
            const bool hasContrast = blockStdDev[0] > contrast;
            Res.at<float>(row / blockSide, col / blockSide) = hasContrast ? blockMean[0] : 0;
        }
    }

    // Shrink the image to the resolution of the block map.
    Mat shrunk;
    resize(source32f, shrunk, Res.size());

    // Cells with a recorded mean above 0.02 become the inpainting mask.
    Mat blockMask;
    threshold(Res, blockMask, 0.02, 1.0, THRESH_BINARY);

    shrunk.convertTo(shrunk, CV_8UC1, 255);
    blockMask.convertTo(blockMask, CV_8UC1);

    Mat filled;
    inpaint(shrunk, blockMask, filled, 5, INPAINT_TELEA);

    // Back to the original resolution and to floating point.
    resize(filled, Res, Img.size());
    Res.convertTo(Res, CV_32FC1, 1.0 / 255.0);
}

/**
 * Locates text-like regions in the image stored at `input` and returns the box enclosing them.
 *
 * The image is downsampled twice with pyrDown before processing, so every coordinate measured
 * on the small image is multiplied by 4 to map it back to the original image.
 *
 * @param input          path of the image to analyse (it is read again with imread)
 * @param res            image that receives the debug rectangles when draw_contours is true
 * @param draw_contours  when true, draws each accepted region and the enclosing box onto res
 * @return (x, y, width, height) of the enclosing box in original-image pixels, clamped to the
 *         image bounds; the whole image when no region passes the filter
 */
tuple<int, int, int, int> detect_text_box(string input, Mat& res, bool draw_contours=false){
    Mat large = imread(input);

    // An unreadable file gives an empty Mat, which pyrDown cannot process. Report the whole of
    // `res` so that a caller which crops keeps everything.
    if(large.empty()){
        return make_tuple(0, 0, res.cols, res.rows);
    }

    bool test_output = false;

    // Extremes of the accepted regions, in small-image coordinates.
    int
        top = large.rows,
        bottom = 0,
        left = large.cols,
        right = 0;

    // Becomes true once at least one region passes the filter below.
    bool found = false;

    Mat rgb;
    // downsample and use it for processing
    pyrDown(large, rgb);
    pyrDown(rgb, rgb);
    Mat small;
    cvtColor(rgb, small, CV_BGR2GRAY);
    // morphological gradient
    Mat grad;
    Mat morphKernel = getStructuringElement(MORPH_ELLIPSE, Size(3, 3));
    morphologyEx(small, grad, MORPH_GRADIENT, morphKernel);
    // binarize
    Mat bw;
    threshold(grad, bw, 0.0, 255.0, THRESH_BINARY | THRESH_OTSU);
    // connect horizontally oriented regions
    Mat connected;
    morphKernel = getStructuringElement(MORPH_RECT, Size(9, 1));
    morphologyEx(bw, connected, MORPH_CLOSE, morphKernel);
    // find contours
    Mat mask = Mat::zeros(bw.size(), CV_8UC1);
    vector<vector<Point> > contours;
    vector<Vec4i> hierarchy;
    findContours(connected, contours, hierarchy, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE, Point(0, 0));

    Scalar color = Scalar(0, 255, 0);
    Scalar color2 = Scalar(0, 0, 255);
    int thickness = 2;

    // filter contours
    // hierarchy[idx][0] is followed until it turns negative. When no contour was found there is
    // no hierarchy[0] to read, so the walk must not start at all.
    for(int idx = contours.empty() ? -1 : 0; idx >= 0; idx = hierarchy[idx][0]){
        Rect rect = boundingRect(contours[idx]);
        Mat maskROI(mask, rect);
        maskROI = Scalar(0, 0, 0);
        // fill the contour
        drawContours(mask, contours, idx, Scalar(255, 255, 255), CV_FILLED);
        // ratio of non-zero pixels in the filled region
        double r = (double)countNonZero(maskROI) / (rect.width * rect.height);

        // assume at least 25% of the area is filled if it contains text
        if (r > 0.25 &&
        (rect.height > 8 && rect.width > 8) // constraints on region size
        // these two conditions alone are not very robust. better to use something
        //like the number of significant peaks in a horizontal projection as a third condition
        ){
            found = true;

            if(draw_contours){
                rectangle(res, Rect(rect.x * 4, rect.y * 4, rect.width * 4, rect.height * 4), color, thickness);
            }

            if(test_output){
                rectangle(rgb, rect, color, thickness);
            }

            // Grow the enclosing box so that it covers this region.
            const int rect_bottom = rect.y + rect.height;
            const int rect_right = rect.x + rect.width;

            if(rect.y < top){
                top = rect.y;
            }
            if(rect_bottom > bottom){
                bottom = rect_bottom;
            }
            if(rect.x < left){
                left = rect.x;
            }
            if(rect_right > right){
                right = rect_right;
            }
        }
    }

    if(test_output){
        if(found){
            rectangle(rgb, Point(left, top), Point(right, bottom), color2, thickness);
        }
        imwrite(string("test_text_contours.jpg"), rgb);
    }

    // Without any accepted region the extremes are still at their initial values, which would
    // give a negative width and height. Fall back to the whole image instead.
    if(!found){
        return make_tuple(0, 0, large.cols, large.rows);
    }

    if(draw_contours){
        rectangle(res, Point(left * 4, top * 4), Point(right * 4, bottom * 4), color2, thickness);
    }

    // Map back to original-image pixels. pyrDown rounds odd sizes up, so the scaled right and
    // bottom edges can land a few pixels outside the original image; clamp them so the box is
    // always a valid region of it.
    const int box_x = left * 4;
    const int box_y = top * 4;
    int box_right = right * 4;
    int box_bottom = bottom * 4;

    if(box_right > large.cols){
        box_right = large.cols;
    }
    if(box_bottom > large.rows){
        box_bottom = large.rows;
    }

    return make_tuple(box_x, box_y, box_right - box_x, box_bottom - box_y);
}

int main(int argc, char* argv[]){
    string input;
    string output = "output.png";

    int
        width = 0,
        height = 0,
        blockside = 9;

    bool
        crop = false,
        draw = false;

    float margin = 0;

    cout << "OpenCV version: " << CV_VERSION << endl;

    //  Return error if arguments are missing
    if(argc < 3){
        cerr << "\nUsage: txtbin input [options] output\n\n"
            "Options:\n"
            "\t-w <number>          -- set max width (keeps aspect ratio)\n"
            "\t-h <number>          -- set max height (keeps aspect ratio)\n"
            "\t-c                   -- crop text content contour\n"
            "\t-m <number>          -- add margins (number in %)\n"
            "\t-b <number>          -- set blockside\n"
            "\t-d                   -- draw text content contours (debugging)\n" << endl;
        return 1;
    }

    //  Parse arguments
    for(int i = 1; i < argc; i++){
        if(i == 1){
            input = string(argv[i]);

            //  Return error if input file is invalid
            ifstream stream(input.c_str());
            if(!stream.good()){
                cerr << "Error: Input file is invalid!" << endl;
                return 1;
            }
        }
        else if(string(argv[i]) == "-w"){
            width = atoi(argv[++i]);
        }
        else if(string(argv[i]) == "-h"){
            height = atoi(argv[++i]);
        }
        else if(string(argv[i]) == "-c"){
            crop = true;
        }
        else if(string(argv[i]) == "-m"){
            margin = atoi(argv[++i]);
        }
        else if(string(argv[i]) == "-b"){
            blockside = atoi(argv[++i]);
        }
        else if(string(argv[i]) == "-d"){
            draw = true;
        }
        else if(i == argc - 1){
            output = string(argv[i]);
        }
    }

    Mat Img = imread(input, CV_LOAD_IMAGE_GRAYSCALE);
    Mat res;
    Img.convertTo(Img, CV_32FC1, 1.0 / 255.0);
    CalcBlockMeanVariance(Img, res, blockside);
    res = 1.0 - res;
    res = Img + res;
    threshold(res, res, 0.85, 1, THRESH_BINARY);

    int
        txt_x,
        txt_y,
        txt_width,
        txt_height;

    if(crop || draw){
        tie(txt_x, txt_y, txt_width, txt_height) = detect_text_box(input, res, draw);
    }

    if(crop){
        //res = res(Rect(txt_x, txt_y, txt_width, txt_height)).clone();
        res = res(Rect(txt_x, txt_y, txt_width, txt_height));
    }

    if(margin){
        int border = res.cols * margin / 100;
        copyMakeBorder(res, res, border, border, border, border, BORDER_CONSTANT, Scalar(255, 255, 255));
    }

    float
        width_input = res.cols,
        height_input = res.rows;

    bool resized = false;

    //  Downscale image
    if(width > 0 && width_input > width){
        float scale = width_input / width;
        width_input /= scale;
        height_input /= scale;
        resized = true;
    }
    if(height > 0 && height_input > height){
        float scale = height_input / height;
        width_input /= scale;
        height_input /= scale;
        resized = true;
    }
    if(resized){
        resize(res, res, Size(round(width_input), round(height_input)));
    }

    imwrite(output, res * 255);

    return 0;
}

图像1输入

图像1输出

图像2输入

图像2输出

更新

我把你的代码放在一个类中但是出错了

该类名为XYcut，在以下代码中，我得到一个编译错误

// NOTE(review): this is the call that triggers the compiler error quoted after it. The lambda
// captures only max_distance2, yet it calls XYcut::ed2; the error message indicates that ed2 is
// a non-static member, so the call needs `this`. Likely fixes: add `this` to the capture list,
// or declare ed2 `static` in the class (its definition only uses its two arguments) — confirm
// against the class declaration, which is not shown here.
int n_labels = partition(filteredRects, labels, [max_distance2](const cv::Rect& lhs, const cv::Rect& rhs){
    // Two rectangles are grouped when the top-left corner of one lies within max_distance
    // (compared as squared distances) of the top-right corner of the other.
    if(XYcut::ed2(lhs.tl(), cv::Point(rhs.br().x, rhs.tl().y)) < max_distance2){
        return true;
    }
    if(XYcut::ed2(rhs.tl(), cv::Point(lhs.br().x, lhs.tl().y)) < max_distance2){
        return true;
    }
    return false;
});

错误

 error: ‘this’ was not captured for this lambda function
   if(XYcut::ed2(lhs.tl(), cv::Point(rhs.br().x, rhs.tl().y)) < max_distance2){

如何在这个 lambda 中引用 XYcut 类的 ed2 方法？

类和方法

这是方法

// Squared Euclidean distance between two points (no square root is taken).
int XYcut::ed2(const cv::Point& lhs, const cv::Point& rhs){
    const int dx = lhs.x - rhs.x;
    const int dy = lhs.y - rhs.y;
    return dx * dx + dy * dy;
}

Answer 1

我只是想提出一种不同的方法。它基于XY-Cut算法，并且由于您的文本是轴对齐的，因此效果非常好。

在输入图像上，计算XY-Cut，并获取边界框：

您会看到正确识别字符组，但不能识别整个字词。所以，首先我们删除非常小的矩形，这只是噪音：

然后我们将非常接近的矩形分组。您可以将cv::partition与适当的谓词一起使用：

现在每个单词都有一个边界框。您最终可以获得每行的边界框。在第二张图片上，您将获得：

这是我使用的代码：

#include <iostream>
#include <vector>

#include <opencv2\opencv.hpp>
using namespace std;
using namespace cv;

Mat3b dbg;

/**
 * Horizontal-projection cut: splits `roi` into horizontal bands of rows that contain at least
 * one non-zero pixel.
 *
 * The ROI and every band are also drawn onto the global debug image `dbg`.
 *
 * @param src  single-channel image; non-zero pixels count as content
 * @param roi  region of src to split
 * @return the bands in src coordinates, clipped to roi; an empty vector when the ROI cannot be
 *         split any further (the single band equals roi) or contains no content at all
 */
vector<Rect> XYCut_projH(const Mat1b& src, Rect roi)
{
    rectangle(dbg, roi, Scalar(255, 0, 0));

    // Collapse each row of the ROI to its maximum: the entry is non-zero iff the row has content.
    Mat1b projH;
    reduce(src(roi), projH, 1, CV_REDUCE_MAX);

    // Alternating start / end row indices (end exclusive) of consecutive runs of content rows.
    vector<int> coords;
    bool bOut = true;

    for (int i = 0; i < projH.rows; ++i)
    {
        if (bOut && projH(i) > 0)
        {
            coords.push_back(i);
            bOut = false;
        }
        else if (!bOut && projH(i) == 0)
        {
            coords.push_back(i);
            bOut = true;
        }
    }

    // A run that reaches the last row is closed at the end of the ROI.
    if (!bOut)
    {
        coords.push_back(projH.rows);
    }

    vector<Rect> rects;

    // `i + 1 < size()` rather than `i < size() - 1`: the subtraction is unsigned and wraps around
    // for an empty vector (blank ROI), which would read coords out of bounds.
    for (size_t i = 0; i + 1 < coords.size(); i += 2)
    {
        Rect r(0, coords[i], src.cols, coords[i + 1] - coords[i]);
        // Move the band from ROI-relative to src coordinates and clip it to the ROI.
        r = (r + roi.tl()) & roi;
        rects.push_back(r);

        rectangle(dbg, r, Scalar(0, 255, 0));
    }

    // A single band identical to the ROI means this cut made no progress.
    if ((rects.size() == 1) && (rects[0] == roi))
    {
        return vector<Rect>();
    }

    return rects;
}

/**
 * Vertical-projection cut: splits `roi` into vertical bands of columns that contain at least
 * one non-zero pixel.
 *
 * The ROI and every band are also drawn onto the global debug image `dbg`.
 *
 * @param src  single-channel image; non-zero pixels count as content
 * @param roi  region of src to split
 * @return the bands in src coordinates, clipped to roi; an empty vector when the ROI cannot be
 *         split any further (the single band equals roi) or contains no content at all
 */
vector<Rect> XYCut_projV(const Mat1b& src, Rect roi)
{
    rectangle(dbg, roi, Scalar(255, 0, 0));

    // Collapse each column of the ROI to its maximum: non-zero iff the column has content.
    Mat1b projV;
    reduce(src(roi), projV, 0, CV_REDUCE_MAX);

    // Alternating start / end column indices (end exclusive) of consecutive runs of content.
    vector<int> coords;
    bool bOut = true;

    for (int i = 0; i < projV.cols; ++i)
    {
        if (bOut && projV(i) > 0)
        {
            coords.push_back(i);
            bOut = false;
        }
        else if (!bOut && projV(i) == 0)
        {
            coords.push_back(i);
            bOut = true;
        }
    }

    // A run that reaches the last column is closed at the end of the ROI.
    if (!bOut)
    {
        coords.push_back(projV.cols);
    }

    vector<Rect> rects;

    // `i + 1 < size()` rather than `i < size() - 1`: the subtraction is unsigned and wraps around
    // for an empty vector (blank ROI), which would read coords out of bounds.
    for (size_t i = 0; i + 1 < coords.size(); i += 2)
    {
        Rect r(coords[i], 0, coords[i + 1] - coords[i], src.rows);
        // Move the band from ROI-relative to src coordinates and clip it to the ROI.
        r = (r + roi.tl()) & roi;
        rects.push_back(r);

        rectangle(dbg, r, Scalar(0, 255, 0));
    }

    // A single band identical to the ROI means this cut made no progress.
    if ((rects.size() == 1) && (rects[0] == roi))
    {
        return vector<Rect>();
    }

    return rects;
}

void XYCut_step(const Mat1b& src, Rect roi, vector<Rect>& rects, bool bAlternate)
{
    vector<Rect> step;
    if (bAlternate)
    {
        step = XYCut_projH(src, roi);

        if (step.empty())
        {
            rects.push_back(roi);
            return;
        }
    }
    else
    {
        step = XYCut_projV(src, roi);

        if (step.empty())
        {
            rects.push_back(roi);
            return;
        }
    }

    for (int i = 0; i < step.size(); ++i)
    {
        XYCut_step(src, step[i], rects, !bAlternate);
    }
}

void XYCut(const Mat1b& src, vector<Rect>& rects)
{
    bool bAlternate = true;
    Rect roi(0, 0, src.cols, src.rows);

    XYCut_step(src, roi, rects, bAlternate);
}

// Squared Euclidean distance between two points (no square root is taken).
int ed2(const Point& lhs, const Point& rhs)
{
    const int dx = lhs.x - rhs.x;
    const int dy = lhs.y - rhs.y;
    return dx * dx + dy * dy;
}

int main()
{
    // Load image
    Mat1b img = imread("path_to_image", IMREAD_GRAYSCALE);
    cvtColor(img, dbg, COLOR_GRAY2BGR);

    // invert image, if needed
    img = ~img;

    // Apply XY Cut
    vector<Rect> rects;
    XYCut(img, rects);

    // Show XY results
    Mat3b xyres;
    cvtColor(img, xyres, COLOR_GRAY2BGR);
    for (int i = 0; i < rects.size(); ++i)
    {
        rectangle(xyres, rects[i], Scalar(0, 0, 255), 2);
    }

    //imshow("XY-Cut Result", xyres);
    //waitKey(1);

    // Remove small bounding boxes (noise)
    int min_area = 10;
    vector<Rect> filteredRects;
    for (const auto& r : rects)
    {
        if (r.area() > min_area)
        {
            filteredRects.push_back(r);
        }
    }

    // Show Filtered results
    Mat3b filtres;
    cvtColor(img, filtres, COLOR_GRAY2BGR);
    for (int i = 0; i < filteredRects.size(); ++i)
    {
        rectangle(filtres, filteredRects[i], Scalar(255, 0, 0), 2);
    }

    //imshow("Filtered Result", filtres);
    //waitKey(1);

    // Group near rectangles
    int max_distance = 10;

    vector<int> labels;
    int max_distance2 = max_distance*max_distance;
    int n_labels = partition(filteredRects, labels, [max_distance2](const Rect& lhs, const Rect& rhs)
    {
        if (ed2(lhs.tl(), Point(rhs.br().x, rhs.tl().y)) < max_distance2) { return true; }
        if (ed2(rhs.tl(), Point(lhs.br().x, lhs.tl().y)) < max_distance2) { return true; }
        return false;
    });

    // Make a bounding box for rects grouped together
    vector<vector<Point>> pts(n_labels);
    for (int i = 0; i < filteredRects.size(); ++i)
    {
        pts[labels[i]].push_back(filteredRects[i].tl());
        pts[labels[i]].push_back(filteredRects[i].br());
    }

    // Show Grouped results
    vector<Rect> groupedRects(n_labels);
    for (int i = 0; i < pts.size(); ++i)
    {
        groupedRects[i] = boundingRect(pts[i]);
    }


    // Show Grouped results
    Mat3b groupres;
    cvtColor(img, groupres, COLOR_GRAY2BGR);
    for (int i = 0; i < groupedRects.size(); ++i)
    {
        rectangle(groupres, groupedRects[i], Scalar(0, 255, 0), 2);
    }


    //imshow("Grouped Result", groupres);
    //waitKey(1);




    return 0;
}

检测图像中的文本轮廓

命令

码

图像1输入

图像1输出

图像2输入

图像2输出

更新

错误

类和方法

1 个答案: