我正在研究OCR文本检测和识别问题,在使用Opencv和Tesseract库在某些图像上尝试Text Scene Detection and Recognition with OpenCV 3后,我做了一些评论(检查检测图像输出,分割输出和识别输出后手动图片),我正在寻找如何通过更改代码来改善结果。
首先,图像是原始的黑白(主要是灰色,文本是黑色还是白色),文本可以是数字和字母字符的组合。
1 - 当文本颜色强度与背景颜色强度相差太大时,不会检测到文本。
2-当图像包含例如“63H”时,它被正确检测到,但是如果它们之间有更多的空间(例如:“63 H”)则根本检测不到它。
3 - 当不同的文本区域未完全对齐时（图像拍摄角度与同一表单上另一处能被正确检测的文本不同），只检测到文本的一部分，另一部分未被检测到（例如 text = “12548 58A”：“12548”是第一部分，“58A”是第二部分）。
我试图分析结果，并认为这些现象背后的原因可能出在 ERGrouping 阶段（当前使用的是穷举搜索（Exhaustive Search）分组算法；改用面向任意方向文本的分组算法反而会得到更糟糕的结果）。如果确实如此，该如何改进呢？
有没有人能帮助我更好地理解上述现象背后的原因？（不幸的是，我无法分享图片，因为它们是保密的。）
我可以对分类器参数进行哪些修改以改善结果(我使用的是opencv提供的默认分类器文件)?
createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
我还可以尝试其他哪些方法？（库和语言的选择都是开放的（Python / C++）。） // TextRecognition.cpp : définit le point d'entrée pour l'application console. // /* * textdetection.cpp *
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <fstream>
#include <iostream>
using namespace std;
using namespace cv;
using namespace cv::text;
//Calculate edit distance between two words
size_t edit_distance(const string& A, const string& B);
size_t min(size_t x, size_t y, size_t z);
bool isRepetitive(const string& s);
bool sort_by_lenght(const string &a, const string &b);
//Draw ER's in an image via floodFill
void er_draw(vector<Mat> &channels, vector<vector<ERStat> > ®ions, vector<Vec2i> group, Mat& segmentation);
//Perform text detection and recognition and evaluate results using edit distance
int main(int argc, char* argv[])
{
cout << endl << argv[0] << endl << endl;
cout << "Text Detection and Recognition: " << endl;
cout << "algorithm described in:" << endl;
cout << "Real-Time Scene Text Localization and Recognition" << endl << endl;
Mat image;
// image = imread("C:\\scene-text-recognition\\scenetext01.jpg");
if(argc>1)
image = imread(argv[1]);
else
{
cout << " Usage: " << argv[0] << " <input_image> [<gt_word1> ... <gt_wordN>]" << endl;
return(0);
}
cout << "IMG_W=" << image.cols << endl;
cout << "IMG_H=" << image.rows << endl;
/*Text Detection*/
// Extract channels to be processed individually
vector<Mat> channels;
Mat grey;
cvtColor(image,grey,COLOR_RGB2GRAY);
// Notice here we are only using grey channel, see textdetection.cpp for example with more channels
channels.push_back(grey);
channels.push_back(255-grey);
double t_d = (double)getTickCount();
// Create ERFilter objects with the 1st and 2nd stage default classifiers
//createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015f,0.13f,0.2f,true,0.2f);
Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
vector<vector<ERStat> > regions(channels.size());
// Apply the default cascade classifier to each independent channel (could be done in parallel)
for (int c=0; c<(int)channels.size(); c++)
{
er_filter1->run(channels[c], regions[c]);
er_filter2->run(channels[c], regions[c]);
}
cout << "TIME_REGION_DETECTION = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl;
Mat out_img_decomposition= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
vector<Vec2i> tmp_group;
for (int i=0; i<(int)regions.size(); i++)
{
for (int j=0; j<(int)regions[i].size();j++)
{
tmp_group.push_back(Vec2i(i,j));
}
Mat tmp= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
er_draw(channels, regions, tmp_group, tmp);
if (i > 0)
tmp = tmp / 2;
out_img_decomposition = out_img_decomposition | tmp;
tmp_group.clear();
}
double t_g = (double)getTickCount();
// Detect character groups
vector< vector<Vec2i> > nm_region_groups;
vector<Rect> nm_boxes;
erGrouping(image, channels, regions, nm_region_groups, nm_boxes,ERGROUPING_ORIENTATION_HORIZ);
//erGrouping(image, channels, regions, nm_region_groups, nm_boxes,ERGROUPING_ORIENTATION_ANY,"trained_classifier_erGrouping.xml",0.5);
cout << "TIME_GROUPING = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl;