是否有任何pdf文本提取器api可以从pdf中提取阿拉伯语文本?
我使用的是itextpdf api,它在提取英语文本时效果很好,但无法提取阿拉伯语文本。这是我从pdf中提取文本的代码:
/**
 * Extracts the text of every page of the PDF at {@code path}.
 *
 * <p>Each page's text is trimmed and followed by a single {@code "\n"}.
 *
 * @param path location of the PDF, passed straight to {@code PdfReader}
 * @return the concatenated text of all pages
 * @throws IOException if the document cannot be opened or read
 */
private String extractPDF(String path) throws IOException {
    PdfReader reader = new PdfReader(path);
    try {
        StringBuilder parsedText = new StringBuilder();
        int pageCount = reader.getNumberOfPages();
        // Pages are addressed starting at 1, hence the 1..pageCount range.
        for (int page = 1; page <= pageCount; page++) {
            parsedText.append(PdfTextExtractor.getTextFromPage(reader, page).trim()).append("\n");
        }
        return parsedText.toString();
    } finally {
        // Release the reader even when extraction of a page fails.
        reader.close();
    }
}
这是输入pdf:arabic.pdf
更新:
我能够提取阿拉伯语文本,但它不会保留行的顺序,这是我的代码:
/**
 * Extracts the text of every page of the PDF at {@code name} and runs it
 * through {@code Bidi.BidiText} with a start level of 1.
 *
 * <p>The reordering is applied to each line separately. Passing a whole page
 * at once reverses the complete string, line breaks included, so the lines
 * would come out in reverse order.
 *
 * @param name location of the PDF, passed straight to {@code PdfReader}
 * @return the reordered text of all pages, with the original line order kept
 * @throws IOException if the document cannot be opened or read
 */
private String extractPDF(String name) throws IOException {
    PdfReader reader = new PdfReader(name);
    try {
        StringBuilder text = new StringBuilder();
        int pageCount = reader.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++) {
            String data = PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy());
            // Limit -1 keeps trailing empty lines so no line break is lost.
            String[] lines = data.split("\n", -1);
            for (int j = 0; j < lines.length; j++) {
                if (j > 0) {
                    text.append('\n');
                }
                text.append(Bidi.BidiText(lines[j], 1).getText());
            }
        }
        return text.toString();
    } finally {
        // The reader was previously never closed.
        reader.close();
    }
}
pdf文字是:
بسماللهالرحمنالرحيم
السلامعليكمورحمةاللهوبركاته
سبحانالله
输出是:
سبحانالله
السلامعليكمورحمةاللهوبركاته
بسماللهالرحمنالرحيم
这是我的BidiText方法代码:
/**
 * Reorders {@code str} for display by assigning a bidirectional type and an
 * embedding level to each character and then reversing runs of characters
 * by level. The characters {@code '<'} and {@code '>'} are dropped from the
 * result.
 *
 * @param str        the text to reorder
 * @param startLevel the base embedding level (odd levels are treated as
 *                   right-to-left); pass {@code -1} to derive it from the
 *                   share of R/AL/AN characters in {@code str}
 * @return a {@code BidiResult} holding the reordered text and a flag that is
 *         {@code false} for empty input or when the derived direction is
 *         right-to-left, and {@code true} otherwise
 */
public static BidiResult BidiText(String str, int startLevel)
{
    boolean isLtr = true;
    int strLength = str.length();
    if (strLength == 0)
    {
        return new BidiResult(str, false);
    }

    // Classify every character and count the right-to-left ones.
    char[] chars = new char[strLength];
    String[] types = new String[strLength];
    int numBidi = 0;
    for (int i = 0; i < strLength; ++i)
    {
        char charCode = str.charAt(i);
        chars[i] = charCode;
        String charType = "L";
        if (charCode <= 0x00ff)
        {
            charType = BaseTypes[charCode];
        }
        else if (0x0590 <= charCode && charCode <= 0x05f4)
        {
            charType = "R";
        }
        else if (0x0600 <= charCode && charCode <= 0x06ff)
        {
            // The Arabic table is indexed by the low byte of the code unit.
            charType = ArabicTypes[charCode & 0xff];
        }
        else if (0x0700 <= charCode && charCode <= 0x08AC)
        {
            charType = "AL";
        }
        if (charType.equals("R") || charType.equals("AL") || charType.equals("AN"))
        {
            numBidi++;
        }
        types[i] = charType;
    }

    // Nothing right-to-left: the input is already in display order.
    if (numBidi == 0)
    {
        return new BidiResult(str, true);
    }

    if (startLevel == -1)
    {
        // Floating-point share of right-to-left characters. Integer division
        // of strLength by numBidi could never be below 0.3.
        if (((double) numBidi / strLength) < 0.3)
        {
            startLevel = 0;
        }
        else
        {
            isLtr = false;
            startLevel = 1;
        }
    }

    int[] levels = new int[strLength];
    for (int i = 0; i < strLength; ++i)
    {
        levels[i] = startLevel;
    }

    // Direction implied by the base level; also used as the type assumed
    // before the start and after the end of the text.
    String e = IsOdd(startLevel) ? "R" : "L";
    String sor = e;
    String eor = sor;

    // NSM takes the type of the preceding character.
    String lastType = sor;
    for (int i = 0; i < strLength; ++i)
    {
        if (types[i].equals("NSM"))
        {
            types[i] = lastType;
        }
        else
        {
            lastType = types[i];
        }
    }

    // EN becomes AN when the last strong type seen was AL.
    lastType = sor;
    for (int i = 0; i < strLength; ++i)
    {
        String t = types[i];
        if (t.equals("EN"))
        {
            types[i] = (lastType.equals("AL")) ? "AN" : "EN";
        }
        else if (t.equals("R") || t.equals("L") || t.equals("AL"))
        {
            lastType = t;
        }
    }

    // AL is treated as R from here on.
    for (int i = 0; i < strLength; ++i)
    {
        if (types[i].equals("AL"))
        {
            types[i] = "R";
        }
    }

    // A single ES between two EN becomes EN; a single CS between two numbers
    // of the same type takes that type.
    for (int i = 1; i < strLength - 1; ++i)
    {
        if (types[i].equals("ES") && types[i - 1].equals("EN") && types[i + 1].equals("EN"))
        {
            types[i] = "EN";
        }
        if (types[i].equals("CS") && (types[i - 1].equals("EN") || types[i - 1].equals("AN"))
                && types[i + 1].equals(types[i - 1]))
        {
            types[i] = types[i - 1];
        }
    }

    // Runs of ET adjacent to an EN become EN.
    for (int i = 0; i < strLength; ++i)
    {
        if (types[i].equals("EN"))
        {
            // do before
            for (int j = i - 1; j >= 0; --j)
            {
                if (!types[j].equals("ET"))
                {
                    break;
                }
                types[j] = "EN";
            }
            // do after: walk forward over the whole trailing ET run
            for (int j = i + 1; j < strLength; ++j)
            {
                if (!types[j].equals("ET"))
                {
                    break;
                }
                types[j] = "EN";
            }
        }
    }

    // Remaining separators, terminators and whitespace become neutral.
    for (int i = 0; i < strLength; ++i)
    {
        String t = types[i];
        if (t.equals("WS") || t.equals("ES") || t.equals("ET") || t.equals("CS"))
        {
            types[i] = "ON";
        }
    }

    // EN becomes L when the last strong type seen was L.
    lastType = sor;
    for (int i = 0; i < strLength; ++i)
    {
        String t = types[i];
        if (t.equals("EN"))
        {
            types[i] = (lastType.equals("L")) ? "L" : "EN";
        }
        else if (t.equals("R") || t.equals("L"))
        {
            lastType = t;
        }
    }

    // A run of neutrals takes the direction of its surroundings when both
    // sides agree (anything that is not L counts as R).
    for (int i = 0; i < strLength; ++i)
    {
        if (types[i].equals("ON"))
        {
            int end = FindUnequal(types, i + 1, "ON");
            String before = sor;
            if (i > 0)
            {
                before = types[i - 1];
            }
            String after = eor;
            // NOTE(review): FindUnequal is not visible here; if it returns the
            // index of the first non-ON entry, types[end] rather than
            // types[end + 1] would be the type following the run - confirm.
            if (end + 1 < strLength)
            {
                after = types[end + 1];
            }
            if (!before.equals("L"))
            {
                before = "R";
            }
            if (!after.equals("L"))
            {
                after = "R";
            }
            if (before.equals(after))
            {
                SetValues(types, i, end, before);
            }
            i = end - 1; // reset to end (-1 so next iteration is ok)
        }
    }

    // Any neutral still unresolved takes the base direction.
    for (int i = 0; i < strLength; ++i)
    {
        if (types[i].equals("ON"))
        {
            types[i] = e;
        }
    }

    // Raise the level of characters whose type differs from the direction
    // of their current level.
    for (int i = 0; i < strLength; ++i)
    {
        String t = types[i];
        if (IsEven(levels[i]))
        {
            if (t.equals("R"))
            {
                levels[i] += 1;
            }
            else if (t.equals("AN") || t.equals("EN"))
            {
                levels[i] += 2;
            }
        }
        else
        {
            if (t.equals("L") || t.equals("AN") || t.equals("EN"))
            {
                levels[i] += 1;
            }
        }
    }

    // Find the range of levels that takes part in the reversal.
    int highestLevel = -1;
    int lowestOddLevel = 99;
    for (int i = 0; i < levels.length; ++i)
    {
        int level = levels[i];
        if (highestLevel < level)
        {
            highestLevel = level;
        }
        if (lowestOddLevel > level && IsOdd(level))
        {
            lowestOddLevel = level;
        }
    }

    // From the highest level down to the lowest odd one, reverse every
    // contiguous run of characters at that level or above.
    for (int level = highestLevel; level >= lowestOddLevel; --level)
    {
        int start = -1;
        for (int i = 0; i < levels.length; ++i)
        {
            if (levels[i] < level)
            {
                if (start >= 0)
                {
                    chars = ReverseValues(chars, start, i);
                    start = -1;
                }
            }
            else if (start < 0)
            {
                start = i;
            }
        }
        if (start >= 0)
        {
            chars = ReverseValues(chars, start, levels.length);
        }
    }

    // Assemble the output, skipping '<' and '>'.
    StringBuilder result = new StringBuilder(chars.length);
    for (int i = 0; i < chars.length; ++i)
    {
        char ch = chars[i];
        if (ch != '<' && ch != '>')
        {
            result.append(ch);
        }
    }
    return new BidiResult(result.toString(), isLtr);
}
答案 0 :(得分:0)
您的示例PDF根本不包含任何文本,它只包含嵌入的文本位图图像。
谈论“从PDF中提取文本”(以及“文本提取器API”和PdfTextExtractor类等等)时,通常意味着在PDF中查找文本绘制指令(PDF查看器使用嵌入在PDF中或在当前系统上可用的字体程序来显示这些文本),并从其字符串参数和字体编码定义确定其文本内容。
在您的情况下,没有这样的文本绘图说明,只有位图绘制指令和位图本身,文档中的文本提取将返回一个空字符串。
要检索文档中显示的文本,您必须寻找OCR(光学字符识别)解决方案。如果OCR解决方案不直接支持PDF而只支持位图格式,PDF库(如iText)可以帮助您提取嵌入的位图图像以转发到OCR解决方案。
如果您还有使用文本绘制指令(而不是位图)显示阿拉伯文本、并且包含足够编码信息的PDF文档,您可能需要使用类似this answer中提出的方法对iText的文本提取输出进行后处理,正如Amedee在对您的问题的评论中所指出的那样。(是的,它是用C#编写的,但是很容易移植到Java。)