我们部门接手了一批使用Apache PDFBox 1.8.x或更早版本的代码,目前正在尝试将其迁移到Apache PDFBox 2.0.x。代码中有些地方使用了TextNormalize,但我在2.0的javadoc中找不到任何提及它的内容,在《Migration to PDFBox 2.0.0》迁移指南中也没有找到相关说明。
我找不到任何关于如何修改这个类或替换TextNormalize的信息。有没有人能就如何在Apache PDFBox 2.0中替换TextNormalize给出建议?
基本上,我们是在自己的类(该类继承自PDFStreamEngine)的构造函数中创建一个TextNormalize对象,而唯一用到这个对象的地方就是下面合并/插入变音符号的代码:
/**
 * Merges a single-character diacritic TextPosition into this TextPosition.
 * This is to be used only for cases where a diacritic overlaps an existing
 * TextPosition. In a graphical display the two could be overlaid, but for
 * text extraction they have to be merged into one string. Use the
 * contains() method to test if two objects overlap.
 *
 * The method walks the characters of this TextPosition from the start,
 * tracking each character's horizontal extent from the per-character widths,
 * and inserts the diacritic next to the first character whose extent matches
 * one of the overlap cases below. A diacritic whose text is longer than one
 * character is ignored. If this TextPosition holds more characters than
 * widths (e.g. one glyph that maps to several characters), the scan stops at
 * the last character that has a width and the diacritic is ignored instead
 * of reading past the end of the widths array.
 *
 * @param diacritic TextPosition to merge into the current TextPosition.
 * @param normalize Instance of TextNormalize class to be used to normalize diacritic
 */
public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
{
    // Only single-character diacritics are merged.
    if (diacritic.getCharacter().length() > 1)
    {
        return;
    }

    float diacXStart = diacritic.getXDirAdj();
    float diacXEnd = diacXStart + diacritic.widths[0];
    float currCharXStart = getXDirAdj();
    int strLen = str.length();

    for (int i = 0; i < strLen; i++)
    {
        // The string can be longer than the widths array; without this guard
        // widths[i] below would throw ArrayIndexOutOfBoundsException.
        if (i >= widths.length)
        {
            return;
        }

        float currCharXEnd = currCharXStart + widths[i];

        if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
        {
            /*
             * The diacritic overlaps both the current character and the
             * previous one. Attach it to whichever it covers by the larger
             * fraction of that character's width; with no previous character
             * it goes on the current one.
             */
            int target = i;
            if (i > 0)
            {
                float overlapCurrent = (diacXEnd - currCharXStart) / widths[i];
                float overlapPrevious = (currCharXStart - diacXStart) / widths[i - 1];
                target = (overlapCurrent >= overlapPrevious) ? i : i - 1;
            }
            insertDiacritic(target, diacritic, normalize);
            return;
        }
        else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
        {
            // The diacritic completely covers this character, so we assume
            // this is the character it belongs to.
            insertDiacritic(i, diacritic, normalize);
            return;
        }
        else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
        {
            // The diacritic is completely contained within this character's
            // width, so it modifies this character.
            insertDiacritic(i, diacritic, normalize);
            return;
        }
        else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1))
        {
            // Last character of the TextPosition: attach the diacritic here.
            insertDiacritic(i, diacritic, normalize);
            return;
        }

        // No match for this character; advance to the start of the next one.
        currCharXStart += widths[i];
    }
}
和
/**
 * Inserts the normalized diacritic into the str of this TextPosition, next
 * to the character at index i, and rebuilds the widths array with one extra
 * entry for it. The inserted diacritic is given a width of 0, and the base
 * character keeps its original width.
 *
 * @param i index in str of the character the diacritic is attached to
 * @param diacritic The diacritic TextPosition
 * @param normalize Instance of TextNormalize class to be used to normalize diacritic
 */
private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
{
    /* The diacritic is added to the right or left of the character
     * depending on the direction of the character. Note that this
     * is only required because the text is currently stored in
     * presentation order and not in logical order.
     */
    char base = str.charAt(i);
    int dir = Character.getDirectionality(base);
    boolean rightToLeft = (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);

    // The builder never leaves this method, so the synchronized
    // StringBuffer is unnecessary.
    StringBuilder buf = new StringBuilder(str.length() + 1);
    buf.append(str, 0, i);

    // Everything before index i keeps its position and width.
    float[] widths2 = new float[widths.length + 1];
    System.arraycopy(widths, 0, widths2, 0, i);

    if (rightToLeft)
    {
        // Right-to-left base character: the diacritic goes before it.
        buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
        widths2[i] = 0;
        buf.append(base);
        widths2[i + 1] = widths[i];
    }
    else
    {
        // Any other direction: the diacritic goes after the base character.
        buf.append(base);
        widths2[i] = widths[i];
        buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
        widths2[i + 1] = 0;
    }

    // Copy the rest of the string and its widths, shifted by one slot.
    buf.append(str, i + 1, str.length());
    System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);

    // The fields are only replaced once both new values are fully built.
    str = buf.toString();
    widths = widths2;
}