在Apache PDFBox 2.0中替换TextNormalize?

时间:2016-10-25 22:34:45

标签: pdfbox

我们部门接手了一些使用Apache PDFBox 1.8.x或更早版本的代码,我们正在尝试将其迁移到Apache PDFBox 2.0.x。代码的某些部分使用了TextNormalize,但我在2.0的javadoc中找不到任何提及它的内容,在“Migration to PDFBox 2.0.0”指南中也没有找到相关说明。

我似乎找不到任何关于该类有何变化或如何替换TextNormalize的信息。有没有人对如何在Apache PDFBox 2.0中替换它有任何建议?

大体上,我们在自己的类(该类扩展了PDFStreamEngine)的构造函数中创建一个TextNormalize对象,而唯一用到它的地方就是下面这段合并/插入变音符号的代码。

/**
 * Merge a single character TextPosition into the current object.
 * This is to be used only for cases where we have a diacritic that
 * overlaps an existing TextPosition.  In a graphical display, we could
 * overlay them, but for text extraction we need to merge them. Use the
 * contains() method to test if two objects overlap.
 *
 * <p>The characters of this TextPosition are scanned left to right, using the
 * per-character widths to compute each character's horizontal extent. The
 * diacritic is inserted next to the first character whose extent matches one
 * of the overlap cases below; at most one insertion is performed. A diacritic
 * whose text is longer than one character is ignored.
 *
 * <p>Only characters that have an entry in {@code widths} are considered. If
 * {@code str} is longer than {@code widths}, the characters without a width
 * are skipped and the diacritic may be dropped instead of raising an
 * ArrayIndexOutOfBoundsException.
 *
 * @param diacritic TextPosition to merge into the current TextPosition.
 * @param normalize Instance of TextNormalize class to be used to normalize diacritic
 */
public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
{
    if (diacritic.getCharacter().length() > 1)
    {
        return;
    }

    float diacXStart = diacritic.getXDirAdj();
    float diacXEnd = diacXStart + diacritic.widths[0];

    float currCharXStart = getXDirAdj();

    int strLen = str.length();
    boolean wasAdded = false;

    // Bound the scan by widths.length as well as strLen: every iteration reads
    // widths[i], so a str that is longer than widths would otherwise overrun
    // the array. NOTE(review): presumably this happens when one glyph (e.g. a
    // ligature) is decoded to several characters - confirm against the caller.
    for (int i = 0; i < strLen && i < widths.length && !wasAdded; i++)
    {
        float currCharXEnd = currCharXStart + widths[i];

        /*
         * This is the case where there is an overlap of the diacritic character with
         * the current character and the previous character. If no previous character,
         * just append the diacritic after the current one.
         */
        if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
        {
            if (i == 0)
            {
                insertDiacritic(i, diacritic, normalize);
            }
            else
            {
                // Fraction of the current character's width covered by the diacritic.
                float distanceOverlapping1 = diacXEnd - currCharXStart;
                float percentage1 = distanceOverlapping1 / widths[i];

                // Fraction of the previous character's width covered by the diacritic.
                float distanceOverlapping2 = currCharXStart - diacXStart;
                float percentage2 = distanceOverlapping2 / widths[i - 1];

                // Attach to whichever character is covered more; ties go to the current one.
                if (percentage1 >= percentage2)
                {
                    insertDiacritic(i, diacritic, normalize);
                }
                else
                {
                    insertDiacritic(i - 1, diacritic, normalize);
                }
            }
            wasAdded = true;
        }
        // diacritic completely covers this character and therefore we assume that
        // this is the character the diacritic belongs to
        else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
        {
            insertDiacritic(i, diacritic, normalize);
            wasAdded = true;
        }
        // Otherwise, the diacritic modifies this character because it is completely
        // contained by the character width
        else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
        {
            insertDiacritic(i, diacritic, normalize);
            wasAdded = true;
        }
        /*
         * Last character in the TextPosition so we add diacritic to the end
         */
        else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1))
        {
            insertDiacritic(i, diacritic, normalize);
            wasAdded = true;
        }
        /*
         * Couldn't find anything useful so we go to the next character in the
         * TextPosition
         */
        currCharXStart += widths[i];
    }
}

/**
 * Inserts the diacritic TextPosition to the str of this TextPosition
 * and updates the widths array to include the extra character width.
 *
 * <p>The normalized diacritic is placed before the character at index
 * {@code i} when that character has a right-to-left directionality, and
 * after it otherwise. The inserted diacritic is given a width of 0; the
 * widths of all existing characters are preserved.
 *
 * @param i index in str of the character the diacritic is attached to
 * @param diacritic The diacritic TextPosition
 * @param normalize Instance of TextNormalize class to be used to normalize diacritic
 */
private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
{
    /* we add the diacritic to the right or left of the character
     * depending on the direction of the character.  Note that this
     * is only required because the text is currently stored in
     * presentation order and not in logical order.
     */
    int dir = Character.getDirectionality(str.charAt(i));
    boolean rightToLeft = dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT
            || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
            || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
            || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE;

    // The buffer never leaves this method, so the unsynchronized StringBuilder suffices.
    StringBuilder buf = new StringBuilder(str.length() + 1);

    // Everything before index i is copied unchanged.
    buf.append(str, 0, i);

    float[] widths2 = new float[widths.length + 1];
    System.arraycopy(widths, 0, widths2, 0, i);

    if (rightToLeft)
    {
        // Diacritic first (zero width), then the base character with its original width.
        buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
        widths2[i] = 0;
        buf.append(str.charAt(i));
        widths2[i + 1] = widths[i];
    }
    else
    {
        // Base character with its original width, then the diacritic (zero width).
        buf.append(str.charAt(i));
        widths2[i] = widths[i];
        buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
        widths2[i + 1] = 0;
    }

    // Get the rest of the string; its widths shift one slot to the right.
    buf.append(str.substring(i + 1));
    System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);

    str = buf.toString();
    widths = widths2;
}

0 个答案:

没有答案