Question

我通过覆盖 PDFTextStripper 中的 processTextPosition 方法将 PDF 转换为 HTML。目前我是把全部文本转换成单个 .html 文件，但我希望每个 PDF 页面都生成各自的 .html 文件。有没有办法通过 processTextPosition 方法来实现这一点？我的代码如下，其中 "f" 是输出文件的路径，processTextPosition 是从其子类 PDFText2Html 中调用的：

/**
 * Handles a single piece of positioned text reported by the PDF content stream.
 *
 * <p>The method does two things in order:
 * <ol>
 *   <li>Appends an HTML fragment for the text to the file {@code f} (via
 *       {@code compare}), remembering the last seen Y position in the field
 *       {@code b}.</li>
 *   <li>Optionally drops text that overlaps an identical, already-seen
 *       character, then stores the text in the matching list of
 *       {@code charactersByArticle}, merging diacritics with their base
 *       character.</li>
 * </ol>
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition( TextPosition text )
{
    boolean showCharacter = true;
    PositionWrapper p = new PositionWrapper(text);
    TextPosition t = p.getTextPosition();

    // Track the writer opened by THIS call in a local. The original closed the
    // field bw unconditionally in finally, which threw a NullPointerException
    // when createNewFile()/FileWriter failed before bw was ever assigned.
    BufferedWriter writer = null;
    try
    {
        f.createNewFile();
        writer = new BufferedWriter(new FileWriter(f, true));
        // compare(...) writes through the bw field, so publish the writer there.
        bw = writer;

        float a = t.getTextPos().getYPosition();
        b = compare(a, b, t);
        System.out.println(b);
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
    finally
    {
        if (writer != null)
        {
            try
            {
                // close() flushes first, and still releases the file handle
                // if that flush fails.
                writer.close();
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
        }
    }

    if( suppressDuplicateOverlappingText )
    {
        showCharacter = false;
        String textCharacter = text.getCharacter();
        float textX = text.getX();
        float textY = text.getY();

        // Positions already recorded for this exact character string: X -> set of Y.
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
        if( sameTextCharacters == null )
        {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put( textCharacter, sameTextCharacters );
        }

        // Two occurrences count as the same glyph when both coordinates lie
        // within a third of the average character width of each other.
        boolean suppressCharacter = false;
        float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
        SortedMap<Float, TreeSet<Float>> xMatches =
            sameTextCharacters.subMap(textX - tolerance, textX + tolerance);
        for( TreeSet<Float> xMatch : xMatches.values() )
        {
            SortedSet<Float> yMatches =
                xMatch.subSet(textY - tolerance, textY + tolerance);
            if( !yMatches.isEmpty() )
            {
                suppressCharacter = true;
                break;
            }
        }
        if( !suppressCharacter )
        {
            // First time this character is seen near this position: record it.
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if( ySet == null )
            {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put( textX, ySet );
            }
            ySet.add( textY );
            showCharacter = true;
        }
    }

    if( showCharacter )
    {
        // Candidate list indexes, in decreasing order of preference; -1 means "not set".
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if( shouldSeparateByBeads )
        {
            // Stop scanning as soon as a definite index has been found.
            for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
            {
                PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
                if( bead != null )
                {
                    PDRectangle rect = bead.getRectangle();
                    if( rect.contains( x, y ) )
                    {
                        // Inside bead i: odd slot i*2+1.
                        foundArticleDivisionIndex = i*2+1;
                    }
                    // NOTE(review): the variable name says "left AND above" but the
                    // test uses ||, which also makes the two branches below
                    // unreachable. Kept as in the original; confirm against upstream.
                    else if( (x < rect.getLowerLeftX() ||
                            y < rect.getUpperRightY()) &&
                            notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                    {
                        // Outside bead i: even slot i*2.
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
                    }
                    else if( x < rect.getLowerLeftX() &&
                            notFoundButFirstLeftArticleDivisionIndex == -1)
                    {
                        notFoundButFirstLeftArticleDivisionIndex = i*2;
                    }
                    else if( y < rect.getUpperRightY() &&
                            notFoundButFirstAboveArticleDivisionIndex == -1)
                    {
                        notFoundButFirstAboveArticleDivisionIndex = i*2;
                    }
                }
                else
                {
                    foundArticleDivisionIndex = 0;
                }
            }
        }
        else
        {
            foundArticleDivisionIndex = 0;
        }

        // Pick the best candidate; fall back to the last list when nothing matched.
        int articleDivisionIndex = -1;
        if( foundArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = foundArticleDivisionIndex;
        }
        else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        }
        else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        }
        else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        }
        else
        {
            articleDivisionIndex = charactersByArticle.size()-1;
        }

        List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get( articleDivisionIndex );

        if( textList.isEmpty() )
        {
            textList.add(text);
        }
        else
        {
            TextPosition previousTextPosition = textList.get(textList.size()-1);
            // This text is a diacritic sitting on the previous character:
            // fold it into the previous entry instead of adding a new one.
            if( text.isDiacritic() && previousTextPosition.contains(text) )
            {
                previousTextPosition.mergeDiacritic(text, normalize);
            }
            // The previous entry was the diacritic: merge it into this text
            // and replace it in the list.
            else if( previousTextPosition.isDiacritic() && text.contains(previousTextPosition) )
            {
                text.mergeDiacritic(previousTextPosition, normalize);
                textList.remove(textList.size()-1);
                textList.add(text);
            }
            else
            {
                textList.add(text);
            }
        }
    }
}

我的 compare 方法的代码如下：

/**
 * Writes one piece of text to {@code bw} as an HTML {@code <span>} and
 * returns the Y position to remember for the next call.
 *
 * <p>When {@code a} differs from {@code b} a {@code <br>} is written before
 * the span.
 *
 * @param a Y position of the text being written
 * @param b Y position remembered from the previous call
 * @param t the text whose font, size, position and characters are emitted
 * @return {@code a}, the new "previous" Y position
 * @throws IOException if writing to {@code bw} fails
 */
private float compare(float a, float b, TextPosition t) throws IOException
{
    StringBuilder html = new StringBuilder();
    // Exact float comparison, as in the original: any change in Y inserts a break.
    if (a != b)
    {
        html.append("<br>");
    }
    // Each CSS declaration is terminated with ';' and the property is spelled
    // "font-family"; the original ran the declarations together
    // ("fontfamily:Xwidth:Yleft:Z"), which browsers discard as invalid.
    html.append("<span style=\"font-size:").append(t.getFontSizeInPt()).append("pt;")
        .append("font-family:'").append(escapeHtmlText(t.getFont().getBaseFont())).append("';")
        .append("width:").append(t.getWidth()).append("pt;")
        .append("left:").append(t.getTextPos().getXPosition()).append("pt;")
        .append("top:").append(t.getTextPos().getYPosition()).append("pt;")
        .append("\">")
        // Text comes straight from the PDF, so markup characters must be escaped.
        .append(escapeHtmlText(t.getCharacter()))
        .append("</span>");
    bw.write(html.toString());
    return a;
}

/** Escapes the characters that are significant in HTML text and attribute values. */
private static String escapeHtmlText(String raw)
{
    if (raw == null)
    {
        return "";
    }
    StringBuilder out = new StringBuilder(raw.length() + 8);
    for (int i = 0; i < raw.length(); i++)
    {
        char c = raw.charAt(i);
        switch (c)
        {
            case '&':  out.append("&amp;");  break;
            case '<':  out.append("&lt;");   break;
            case '>':  out.append("&gt;");   break;
            case '"':  out.append("&quot;"); break;
            case '\'': out.append("&#39;");  break;
            default:   out.append(c);
        }
    }
    return out.toString();
}

使用 PDFBox 将 PDF 按页转换为 HTML

0 个答案: