Improving OCR text detection/segmentation in natural images with OpenCV

Posted: 2017-08-30 17:26:39

Tags: python c++ opencv ocr tesseract

I am working on an OCR text detection and recognition problem. After trying Text Scene Detection and Recognition with OpenCV 3 with the OpenCV and Tesseract libraries on some of my images, I made a few observations (by manually checking the detection output, the segmentation output and the recognition output for each image), and I am looking for ways to improve the results by changing the code.

First, the images are black and white to begin with (mostly grey, and the text is either black or white), and the text can be a mix of digits and alphabetic characters.

1 - Text is not detected when the text intensity differs too much from the background intensity.

2 - When the image contains, for example, "63H", it is detected correctly, but if there is more space between the characters (e.g. "63 H") it is not detected at all.

3 - When the different text areas are not perfectly aligned (the image is at a different angle than another image of the same form whose text is detected correctly), one part of the text is detected and the other part is not (e.g. text = "12548 58A": "12548" is the first part, "58A" is the second part).

I tried to analyse the results, and I suspect the cause of these observations lies in the ERGrouping stage (I am using the exhaustive-search grouping algorithm; grouping arbitrarily oriented text gives me even worse results). If so, how can it be improved?
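
For reference, these are the two grouping modes exposed by OpenCV's text module (a minimal sketch of the two erGrouping() calls; trained_classifier_erGrouping.xml is the grouping classifier shipped with the opencv_contrib text sample data):

    // Exhaustive-search grouping: fast, but restricted to horizontally aligned text.
    erGrouping(image, channels, regions, nm_region_groups, nm_boxes,
               ERGROUPING_ORIENTATION_HORIZ);

    // Single-linkage-clustering grouping: handles arbitrary orientations, but needs
    // the trained grouping classifier and a minimum group probability threshold.
    erGrouping(image, channels, regions, nm_region_groups, nm_boxes,
               ERGROUPING_ORIENTATION_ANY, "trained_classifier_erGrouping.xml", 0.5);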

Can anyone help me better understand the cause of these observations? (Unfortunately, I cannot share the images because they are confidential.)

What modifications can I make to the classifier parameters to improve the results (I am using the default classifier files provided with OpenCV)? For reference, the signature of the first-stage filter factory is:

createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
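
As a starting point, one could make the first stage more permissive so that weaker candidate regions survive to the grouping stage. This is only a sketch with illustrative values, not tuned recommendations:

    // Sketch: more permissive NM1/NM2 stages (illustrative values, not recommendations).
    // A smaller thresholdDelta scans more threshold levels; lower minProbability and
    // minProbabilityDiff values keep more candidate regions alive.
    Ptr<ERFilter> er_filter1 = createERFilterNM1(
            loadClassifierNM1("trained_classifierNM1.xml"),
            1,        // thresholdDelta (finer steps than the 8 used in the code below)
            0.0001f,  // minArea (allow smaller regions)
            0.13f,    // maxArea (the default)
            0.1f,     // minProbability (keep weaker candidates)
            true,     // nonMaxSuppression
            0.05f);   // minProbabilityDiff
    Ptr<ERFilter> er_filter2 = createERFilterNM2(
            loadClassifierNM2("trained_classifierNM2.xml"),
            0.3f);    // minProbability (the library default, vs. the 0.5 used below)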

What other approaches could I try? (I am open on both the library and the language (Python/C++).) My current code follows:

// TextRecognition.cpp : defines the entry point for the console application.
//
/*
 * textdetection.cpp
 *
 */

#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <fstream>

#include <iostream>

using namespace std;
using namespace cv;
using namespace cv::text;

//Calculate edit distance between two words
size_t edit_distance(const string& A, const string& B);
size_t min(size_t x, size_t y, size_t z);
bool   isRepetitive(const string& s);
bool   sort_by_lenght(const string &a, const string &b);
//Draw ER's in an image via floodFill
void   er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);

//Perform text detection and recognition and evaluate results using edit distance
int main(int argc, char* argv[])
{
    cout << endl << argv[0] << endl << endl;
    cout << "Text Detection and Recognition: " << endl;
    cout << "algorithm described in:" << endl;
    cout << "Real-Time Scene Text Localization and Recognition" << endl << endl;

    Mat image;
    // image  = imread("C:\\scene-text-recognition\\scenetext01.jpg");

    if (argc > 1)
        image = imread(argv[1]);
    else
    {
        cout << "    Usage: " << argv[0] << " <input_image> [<gt_word1> ... <gt_wordN>]" << endl;
        return(0);
    }


    cout << "IMG_W=" << image.cols << endl;
    cout << "IMG_H=" << image.rows << endl;


    /*Text Detection*/

    // Extract channels to be processed individually
    vector<Mat> channels;

    Mat grey;
    cvtColor(image,grey,COLOR_BGR2GRAY); // imread() loads images in BGR order

    // Notice here we are only using grey channel, see textdetection.cpp for example with more channels
    channels.push_back(grey);
    channels.push_back(255-grey);
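    // (Illustrative alternative) computeNMChannels() from the same module extracts
    // several channels (R, G, B, lightness and gradient magnitude) as done in
    // textdetection.cpp; extra channels may help when the text/background contrast
    // is unusual, at the cost of extra processing time.
    //computeNMChannels(image, channels);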

    double t_d = (double)getTickCount();
    // Create ERFilter objects with the 1st and 2nd stage default classifiers
    //createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
    Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015f,0.13f,0.2f,true,0.2f);
    Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
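    // Note: the values above differ from the library defaults (thresholdDelta=1,
    // minArea=0.00025, minProbability=0.4 for NM1 and 0.3 for NM2,
    // minProbabilityDiff=0.1).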

    vector<vector<ERStat> > regions(channels.size());
    // Apply the default cascade classifier to each independent channel (could be done in parallel)
    for (int c=0; c<(int)channels.size(); c++)
    {
        er_filter1->run(channels[c], regions[c]);
        er_filter2->run(channels[c], regions[c]);
    }
    cout << "TIME_REGION_DETECTION = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl;

    // Build a visualization of all accepted extremal regions (one pass per channel,
    // OR-ed together, with channels after the first drawn at half intensity)
    Mat out_img_decomposition= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
    vector<Vec2i> tmp_group;
    for (int i=0; i<(int)regions.size(); i++)
    {
        for (int j=0; j<(int)regions[i].size();j++)
        {
            tmp_group.push_back(Vec2i(i,j));
        }
        Mat tmp= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
        er_draw(channels, regions, tmp_group, tmp);
        if (i > 0)
            tmp = tmp / 2;
        out_img_decomposition = out_img_decomposition | tmp;
        tmp_group.clear();
    }

    double t_g = (double)getTickCount();
    // Detect character groups
    vector< vector<Vec2i> > nm_region_groups;
    vector<Rect> nm_boxes;
    erGrouping(image, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
    //erGrouping(image, channels, regions, nm_region_groups, nm_boxes,ERGROUPING_ORIENTATION_ANY,"trained_classifier_erGrouping.xml",0.5);
    cout << "TIME_GROUPING = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl;

0 Answers:

No answers