我通过重写PDFTextStripper中的processTextPosition方法将PDF转换为HTML。目前我是把整个文本转换成单个.html文件,但我希望每个PDF页面都生成一个单独的.html文件。有没有办法通过processTextPosition方法来实现?我的代码如下,其中"f"是输出文件的路径,processTextPosition从其子类PDFText2Html
中调用
/**
 * Handles a single piece of text reported by the stripper.
 *
 * <p>The method does two independent things:
 * <ol>
 *   <li>Appends an HTML fragment for the text to the file {@code f} by
 *       opening an appending writer, publishing it through the {@code bw}
 *       field and delegating to {@code compare}. The value returned by
 *       {@code compare} is stored in the {@code b} field and printed to
 *       standard output.</li>
 *   <li>Unless the text is suppressed as an overlapping duplicate, stores it
 *       in the list of {@code charactersByArticle} selected from the thread
 *       beads in {@code pageArticles}, merging diacritics with the adjacent
 *       text position.</li>
 * </ol>
 *
 * <p>I/O failures are printed with {@code printStackTrace()} and do not stop
 * the second step.
 *
 * <p>NOTE(review): all text is appended to the same file {@code f}, whatever
 * page it comes from. Writing one file per page would presumably require
 * switching {@code f} when the page changes (for example from a
 * startPage/endPage override) - confirm against the PDFBox version in use.
 *
 * @param text the text position to process
 */
protected void processTextPosition( TextPosition text )
{
    boolean showCharacter = true;
    PositionWrapper p = new PositionWrapper(text);
    TextPosition t = p.getTextPosition();

    // Only the writer opened by THIS call is closed in the finally block.
    // Closing the bw field unconditionally threw a NullPointerException when
    // opening the file failed on the first call (bw still null), and touched
    // an already-closed writer on later calls.
    BufferedWriter writer = null;
    try
    {
        f.createNewFile();
        writer = new BufferedWriter(new FileWriter(f, true));
        // compare() writes through the bw field, so publish the writer there.
        bw = writer;
        float a = t.getTextPos().getYPosition();
        b = compare(a, b, t);
        System.out.println(b);
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
    finally
    {
        if (writer != null)
        {
            try
            {
                // close() flushes buffered output before releasing the file.
                writer.close();
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
        }
    }

    if(suppressDuplicateOverlappingText)
    {
        showCharacter = false;
        String textCharacter = text.getCharacter();
        float textX = text.getX();
        float textY = text.getY();
        // Positions already seen for this exact string: x -> set of y values.
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
        if(sameTextCharacters == null )
        {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put( textCharacter, sameTextCharacters );
        }

        // The text is suppressed when the same string was already seen within
        // "tolerance" of this position on both axes; tolerance is a third of
        // the average width of one character of the string.
        boolean suppressCharacter = false;
        float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
        SortedMap<Float, TreeSet<Float>> xMatches =
            sameTextCharacters.subMap(textX - tolerance, textX + tolerance);
        for(TreeSet<Float> xMatch : xMatches.values())
        {
            SortedSet<Float> yMatches =
                xMatch.subSet(textY - tolerance , textY + tolerance);
            if (!yMatches.isEmpty())
            {
                suppressCharacter = true;
                break;
            }
        }

        if( !suppressCharacter )
        {
            // First occurrence at this position: remember it and show it.
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null)
            {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put( textX, ySet );
            }
            ySet.add( textY );
            showCharacter = true;
        }
    }

    if( showCharacter )
    {
        // Candidate indexes into charactersByArticle; -1 means "not set".
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if( shouldSeparateByBeads )
        {
            // The loop stops as soon as a definite index has been found.
            for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
            {
                PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
                if( bead != null )
                {
                    PDRectangle rect = bead.getRectangle();
                    if(rect.contains( x, y ) )
                    {
                        // Inside bead i: odd index i*2+1.
                        foundArticleDivisionIndex = i*2+1;
                    }
                    else if( (x < rect.getLowerLeftX() ||
                              y < rect.getUpperRightY()) &&
                             notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                    {
                        // Outside bead i: the fallbacks use the even index i*2.
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
                    }
                    else if( x < rect.getLowerLeftX() &&
                             notFoundButFirstLeftArticleDivisionIndex == -1)
                    {
                        notFoundButFirstLeftArticleDivisionIndex = i*2;
                    }
                    else if( y < rect.getUpperRightY() &&
                             notFoundButFirstAboveArticleDivisionIndex == -1)
                    {
                        notFoundButFirstAboveArticleDivisionIndex = i*2;
                    }
                }
                else
                {
                    foundArticleDivisionIndex = 0;
                }
            }
        }
        else
        {
            foundArticleDivisionIndex = 0;
        }

        // Pick the first candidate that was set, in priority order; when none
        // was set, fall back to the last list in charactersByArticle.
        int articleDivisionIndex = -1;
        if( foundArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = foundArticleDivisionIndex;
        }
        else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        }
        else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        }
        else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
        {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        }
        else
        {
            articleDivisionIndex = charactersByArticle.size()-1;
        }

        List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get( articleDivisionIndex );
        if(textList.isEmpty())
        {
            textList.add(text);
        }
        else
        {
            TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
            // If this text is a diacritic contained in the previous position,
            // merge it into the previous one instead of adding it.
            if(text.isDiacritic() && previousTextPosition.contains(text))
            {
                previousTextPosition.mergeDiacritic(text, normalize);
            }
            /* If the previous TextPosition was the diacritic, merge it into this
             * one and remove it from the list. */
            else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
            {
                text.mergeDiacritic(previousTextPosition, normalize);
                textList.remove(textList.size()-1);
                textList.add(text);
            }
            else
            {
                textList.add(text);
            }
        }
    }
}
我的compare方法的代码如下:
/**
 * Writes the HTML span for one text position to the {@code bw} writer,
 * preceded by {@code <br>} when the Y position differs from the previous one.
 *
 * @param a the Y position of the current text
 * @param b the Y position recorded for the previous text
 * @param t the text position whose font, geometry and characters are written
 * @return {@code a}, which the caller stores as the new "previous" Y position
 * @throws IOException if writing to {@code bw} fails
 */
private float compare(float a, float b, TextPosition t) throws IOException
{
    // Exact float comparison is kept from the original: any change in Y
    // position is treated as the start of a new line.
    if (a != b)
    {
        bw.write("<br>");
    }
    bw.write(buildSpan(t));
    return a;
}

/**
 * Builds the {@code <span>} element for one text position. Each CSS
 * declaration is terminated with ';' and lengths carry the "pt" unit; the
 * original string ran declarations together and misspelled font-family.
 */
private String buildSpan(TextPosition t)
{
    StringBuilder span = new StringBuilder();
    span.append("<span style=\"font-size:").append(t.getFontSizeInPt()).append("pt;");
    span.append("font-family:").append(escapeHtmlText(t.getFont().getBaseFont())).append(';');
    span.append("width:").append(t.getWidth()).append("pt;");
    span.append("left:").append(t.getTextPos().getXPosition()).append("pt;");
    span.append("top:").append(t.getTextPos().getYPosition()).append("pt;\">");
    // Text from the PDF may contain markup characters, so escape it.
    span.append(escapeHtmlText(t.getCharacter()));
    span.append("</span>");
    return span.toString();
}

/** Escapes the characters that are significant in HTML text and quoted attributes. */
private static String escapeHtmlText(String raw)
{
    if (raw == null)
    {
        return "";
    }
    StringBuilder escaped = new StringBuilder(raw.length() + 8);
    for (int i = 0; i < raw.length(); i++)
    {
        char c = raw.charAt(i);
        switch (c)
        {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            default:
                escaped.append(c);
                break;
        }
    }
    return escaped.toString();
}