我可以通过命令提示符使用新训练的tessedata(版本3.02)获得正确的OCR输出,但我希望在带有DLL ref的C#代码中使用相同的输出。我已尝试使用tessnet2_32.dll引用但是它抛出了异常所以如何使用或访问tesseract 3.02版本训练的tessedata使用DLL引用通过C#代码?
答案 0 :(得分:1)
To access or use tesseract 3.02 trained data we have to create separate wrapper class like below.
using System;
using System.IO;
using System.Diagnostics;
using System.Drawing;
/// <summary>
/// Summary description for TesseractOCR
/// </summary>
///
namespace tesseractThree
{
public class TesseractOCR
{
public TesseractOCR()
{
//
// TODO: Add constructor logic here
//
}
private string commandpath;
private string outpath;
private string tmppath;
public TesseractOCR(string commandpath)
{
this.commandpath = commandpath;
tmppath = System.Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData) + @"\out.tif";
outpath = System.Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData) + @"\out.txt";
}
public string analyze(string filename,string lang,bool noLine)
{
string args = filename + " " + outpath.Replace(".txt", "");
ProcessStartInfo startinfo;
if (noLine == true)
{
startinfo = new ProcessStartInfo(commandpath, args + " -l " + lang + " -psm 6");
}
else
{
startinfo = new ProcessStartInfo(commandpath, args + " -l " + lang);
}
startinfo.CreateNoWindow = true;
startinfo.UseShellExecute = false;
Process.Start(startinfo).WaitForExit();
string ret = "";
using (StreamReader r = new StreamReader(outpath))
{
string content = r.ReadToEnd();
ret = content;
}
File.Delete(outpath);
return ret;
}
public string OCRFromBitmap(Bitmap bmp,string lang,bool noLine)
{
bmp.Save(tmppath, System.Drawing.Imaging.ImageFormat.Tiff);
string ret = analyze(tmppath,lang,noLine);
File.Delete(tmppath);
return ret;
}
/* public string OCRFromFile(string filename)
{
return analyze(filename);
}*/
}
}
//Usage of this class
string lang = "enc";
Bitmap b = new Bitmap(@"D:\Image\enc.test_font.exp0.tif");
TesseractOCR ocr = new TesseractOCR(@"C:\Program Files\Tesseract-OCR\tesseract.exe");
string result = ocr.OCRFromBitmap(b, lang,true);
Label1.Text = result;
OR Refer below link for more details.
https://gist.github.com/yatt/915443
答案 1 :(得分:0)
那是针对Tesseract 2.04的。您需要一个与3.02版本兼容的.NET wrapper。
答案 2 :(得分:0)
使用tesseractengine3.dll我们可以使用tesseract v3.02训练数据,如下所示。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using tesseract;
using System.Drawing;
using System.IO;
public enum TesseractEngineMode : int
{
/// <summary>
/// Run Tesseract only - fastest
/// </summary>
TESSERACT_ONLY = 0,
/// <summary>
/// Run Cube only - better accuracy, but slower
/// </summary>
CUBE_ONLY = 1,
/// <summary>
/// Run both and combine results - best accuracy
/// </summary>
TESSERACT_CUBE_COMBINED = 2,
/// <summary>
/// Specify this mode when calling init_*(),
/// to indicate that any of the above modes
/// should be automatically inferred from the
/// variables in the language-specific config,
/// command-line configs, or if not specified
/// in any of the above should be set to the
/// default OEM_TESSERACT_ONLY.
/// </summary>
DEFAULT = 3
}
public enum TesseractPageSegMode : int
{
/// <summary>
/// Fully automatic page segmentation
/// </summary>
PSM_AUTO = 0,
/// <summary>
/// Assume a single column of text of variable sizes
/// </summary>
PSM_SINGLE_COLUMN = 1,
/// <summary>
/// Assume a single uniform block of text (Default)
/// </summary>
PSM_SINGLE_BLOCK = 2,
/// <summary>
/// Treat the image as a single text line
/// </summary>
PSM_SINGLE_LINE = 3,
/// <summary>
/// Treat the image as a single word
/// </summary>
PSM_SINGLE_WORD = 4,
/// <summary>
/// Treat the image as a single character
/// </summary>
PSM_SINGLE_CHAR = 5
}
public partial class importDLL : System.Web.UI.Page
{
private TesseractProcessor m_tesseract = null;
//private const string m_path = @"..\..\data\";
private const string m_path = @"D:\tessdata-3.02\";
private const string m_lang = "eng";
protected void Page_Load(object sender, EventArgs e)
{
var image = System.Drawing.Image.FromFile(@"D:\Image\Capture1T.tif");
m_tesseract = new TesseractProcessor();
bool succeed = m_tesseract.Init(m_path, m_lang, (int)TesseractEngineMode.DEFAULT);
if (!succeed)
{
}
m_tesseract.SetVariable("tessedit_pageseg_mode", ((int)TesseractPageSegMode.PSM_SINGLE_LINE).ToString());
m_tesseract.Clear();
m_tesseract.ClearAdaptiveClassifier();
string outValue= m_tesseract.Apply(image);
Response.Write(outValue);
}
}