Question

我在使用 PDF Clown 时遇到这样一个需求：如果某些关键字是其他关键字的子串（或与之部分匹配），那么较短关键字的高亮必须被覆盖，只高亮完整的（较长的）关键字。例如在下面的 Map 中，关键字 ETS 是关键字 Just. ETS 和 Test. ETS 的子串。预期结果是：只高亮完整的关键字（如 Just. ETS、Test. ETS）并显示它们各自的弹出值，而不是高亮关键字 ETS 及其弹出值。附件：原始 PDF（ActualPdf）、实际结果 PDF（actual result pdf）以及 jar 路径（jar path）。

// Keyword -> popup text. "ETS" is a substring of the two longer keys.
Map<String, String> map = new HashMap<String, String>();
        map.put("ETS" , "Loss");
        map.put("Just. ETS" , "Net ");
        map.put("Test. ETS" , "Profit");

（注意：1. 如果文件中已经高亮了较长的关键字，则包含在该关键字内的较短关键字不应再被高亮。2. 如果较短的关键字已经被高亮，且它包含在某个较长的关键字内，则应高亮较长的关键字，并忽略/取消较短关键字的高亮。）

    import java.awt.Color;
    import java.awt.Desktop;
    import java.awt.geom.Rectangle2D;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.UnsupportedEncodingException;
    import java.net.URL;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.TimeUnit;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import java.io.File;
    import org.pdfclown.documents.Page;
    import org.pdfclown.documents.contents.ITextString;
    import org.pdfclown.documents.contents.TextChar;
    import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
    import org.pdfclown.documents.interaction.annotations.TextMarkup;
    import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;

    import org.pdfclown.files.SerializationModeEnum;
    import org.pdfclown.util.math.Interval;
    import org.pdfclown.util.math.geom.Quad;
    import org.pdfclown.tools.TextExtractor;

    public class pdfclown2 {
        private static int count;

        public static void main(String[] args) throws IOException {

            highlight("C:\\Users\\uc23\\Desktop\\pdf\\80743064.pdf","C:\\Users\\\Downloads\\6.pdf");
            System.out.println("OK");
        }
        private static void highlight(String inputPath, String outputPath) throws IOException {




   org.pdfclown.files.File file = null;

try {
    file = new org.pdfclown.files.File("C:\\Users\\uc239646\\Desktop\\test.pdf");

List<Keyword> l=new ArrayList<Keyword>();
Keyword k=new Keyword();
Keyword k1=new Keyword();
k1.setKey("Just. ETS");
k1.setValue("NET");
l.add(k1);
Keyword k2=new Keyword();
k2.setKey("Test. ETS");
k2.setValue("PROFIT");
l.add(k2);
k.setKey("ETS");
k.setValue("LOSS");
l.add(k);

 long startTime = System.currentTimeMillis();




    // 2. Iterating through the document pages...
    TextExtractor textExtractor = new TextExtractor(true, true);
    for (final Page page : file.getDocument().getPages()) {
        Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
        for (Keyword e : l) {
            Pattern pattern;
            String serachKey =  e.getKey();
            final String translationKeyword = e.getValue();

                if ((serachKey.contains(")") && serachKey.contains("("))
                        || (serachKey.contains("(") && !serachKey.contains(")"))
                        || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                        || serachKey.contains("*") || serachKey.contains("+")) {
                    pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
                }
                else
                     pattern = Pattern.compile("\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);
        // 2.1. Extract the page text!

    //System.out.println(textStrings.toString().indexOf(entry.getKey()));

        // 2.2. Find the text pattern matches!
                        final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());
        // 2.3. Highlight the text pattern matches!
        //System.out.println(textStrings);
        textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {

            public boolean hasNext() {
                // if(key.getMatchCriteria() == 1){
                if (matcher.find()) {
                    return true;
                }
                /*
                 * } else if(key.getMatchCriteria() == 2) { if
                 * 
                 * 
                 * 
                 * 
                 * 
                 * 
                 * 
                 * 
                 * (matcher.hitEnd()) { count++; return true; } }
                 */
                return false;

            }

            public Interval<Integer> next() {
                return new Interval<Integer>(matcher.start(), matcher.end());
            }

            public void process(Interval<Integer> interval, ITextString match) {
                System.out.println(match);
                // Defining the highlight box of the text pattern
                // match...
                /*List l=new ArrayList();
                if(!l.contains(match)){
                    System.out.println("map.put("+match+","+translationKeyword+")");
                }
            */
                List<Quad> highlightQuads = new ArrayList<Quad>();
                {
                    Rectangle2D textBox = null;
                    for (TextChar textChar : match.getTextChars()) {
                        Rectangle2D textCharBox = textChar.getBox();
                        if (textBox == null) {
                            textBox = (Rectangle2D) textCharBox.clone();
                        } else {
                            if (textCharBox.getY() > textBox.getMaxY()) {
                                highlightQuads.add(Quad.get(textBox));
                                textBox = (Rectangle2D) textCharBox.clone();
                            } else {
                                textBox.add(textCharBox);
                            }
                        }

                    System.out.println(highlightQuads.contains(textBox));

                    textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
                    highlightQuads.add(Quad.get(textBox));
                }
            /*  List<Quad> highlightQuads = new ArrayList<Quad>();
                List<TextChar> textChars = match.getTextChars();
                Rectangle2D firstRect = textChars.get(0).getBox();
                Rectangle2D lastRect = textChars.get(textChars.size()-1).getBox();
                Rectangle2D rect = firstRect.createUnion(lastRect);
                highlightQuads.add(Quad.get(rect));*/
                // subtype can be Highlight, Underline, StrikeOut, Squiggly


                new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);

            }

            }

            public void remove() {
                throw new UnsupportedOperationException();
            }

        });

    }

}

    SerializationModeEnum serializationMode = SerializationModeEnum.Standard;
    file.save(new java.io.File(outputPath), serializationMode);
    System.out.println("file created");
    long endTime = System.currentTimeMillis();
    System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);

} catch (Exception e) {
       e.printStackTrace();
}


        }
    }

Answer 1

正如评论中已经提到的那样（这些评论现已被移至聊天室，即 moved to chat）：

您的问题之所以变成了一个 PDF Clown 问题，只是因为您本末倒置了（把马车放在了马的前面）：

您已经发现自己创建了过多的高亮。

显而易见的解决方案是从一开始就不要创建那些多余的高亮，而如何筛选出多余的匹配是一个与 PDF Clown 无关的问题。

另一方面，您尝试的方案是在事后删除多余的高亮，正是这一点才使您的问题成为 PDF Clown 问题，因为这样您就必须在已存在的高亮中查找重叠。这种方案也是可行的，但它会不必要地浪费资源。

下面给出的做法是在创建高亮之前就排除不需要的匹配。您遍历页面的循环应替换为：

[...]
TextExtractor textExtractor = new TextExtractor(true, true);
for (final Page page : file.getDocument().getPages()) {
    Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);

    // Step 1: collect the matches of ALL keywords on this page instead of
    // highlighting them immediately.
    List<Match> matches = new ArrayList<>();
    for (Keyword e : l) {
        final String searchKey = e.getKey();
        final String translationKeyword = e.getValue();
        final Pattern pattern;
        if (searchKey.contains("(") || searchKey.contains(")") || searchKey.contains("?")
                || searchKey.contains("*") || searchKey.contains("+")) {
            pattern = Pattern.compile(Pattern.quote(searchKey), Pattern.CASE_INSENSITIVE);
        } else {
            // Quote the key so that characters such as '.' in "Just. ETS" are matched literally.
            pattern = Pattern.compile("\\b" + Pattern.quote(searchKey) + "\\b", Pattern.CASE_INSENSITIVE);
        }
        final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());

        textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
            public boolean hasNext() {
                return matcher.find();
            }

            public Interval<Integer> next() {
                // Low bound inclusive, high bound exclusive.
                return new Interval<Integer>(matcher.start(), matcher.end(), true, false);
            }

            public void process(Interval<Integer> interval, ITextString match) {
                matches.add(new Match(interval, match, translationKeyword));
            }

            public void remove() {
                throw new UnsupportedOperationException();
            }
        });
    }

    // Step 2: drop every match that overlaps a preferred one.
    removeOverlaps(matches);

    // Step 3: highlight only the remaining matches.
    for (Match match : matches) {
        List<Quad> highlightQuads = new ArrayList<Quad>();
        {
            Rectangle2D textBox = null;
            for (TextChar textChar : match.match.getTextChars()) {
                Rectangle2D textCharBox = textChar.getBox();
                if (textBox == null) {
                    textBox = (Rectangle2D) textCharBox.clone();
                } else {
                    if (textCharBox.getY() > textBox.getMaxY()) {
                        // The character lies below the current box: close the box and start a new one.
                        highlightQuads.add(Quad.get(textBox));
                        textBox = (Rectangle2D) textCharBox.clone();
                    } else {
                        textBox.add(textCharBox);
                    }
                }
            }
            // Add the last box once, after the loop; adding it on every
            // character created one overlapping quad per character prefix.
            if (textBox != null) {
                highlightQuads.add(Quad.get(textBox));
            }
        }
        new TextMarkup(page, highlightQuads, match.tag, MarkupTypeEnum.Highlight);
    }
}
[...]

（ComplexHighlight test testMarkLikeSeshadriImproved）

使用这些辅助方法/类：

/**
 * Sorts the matches with {@code compareLowLengthTag} and then removes, in
 * place, every match whose interval overlaps the interval of a match that
 * precedes it in the sorted list. Each removal is reported on standard output.
 *
 * @param matches the matches to sort and prune; modified in place
 */
static void removeOverlaps(List<Match> matches) {
    Collections.sort(matches, ComplexHighlight::compareLowLengthTag);

    int kept = 0;
    while (kept < matches.size() - 1) {
        final Interval<Integer> keptInterval = matches.get(kept).interval;

        int candidate = kept + 1;
        while (candidate < matches.size()) {
            final Interval<Integer> candidateInterval = matches.get(candidate).interval;
            final boolean disjoint = keptInterval.getLow() >= candidateInterval.getHigh()
                    || candidateInterval.getLow() >= keptInterval.getHigh();
            if (disjoint) {
                candidate++;
            } else {
                System.out.printf("Match %d removed as it overlaps match %d.\n", candidate, kept);
                // Stay on the same index: the next element has shifted into it.
                matches.remove(candidate);
            }
        }
        kept++;
    }
}

（ComplexHighlight方法removeOverlaps）

/**
 * Orders matches by ascending interval low bound, then by descending interval
 * high bound, then by tag.
 */
static int compareLowLengthTag(Match a, Match b) {
    final int byLow = a.interval.getLow().compareTo(b.interval.getLow());
    if (byLow != 0) {
        return byLow;
    }
    // Same start: the match reaching further comes first.
    final int byHighDescending = -a.interval.getHigh().compareTo(b.interval.getHigh());
    if (byHighDescending != 0) {
        return byHighDescending;
    }
    return a.tag.compareTo(b.tag);
}

（ComplexHighlight方法compareLowLengthTag）

/**
 * Immutable holder for one keyword match: the matched interval, the matched
 * text string and the tag associated with it.
 */
class Match {
    /** Interval of the match. */
    final Interval<Integer> interval;
    /** The matched text string. */
    final ITextString match;
    /** Tag associated with the match. */
    final String tag;

    public Match(final Interval<Integer> interval, final ITextString match, final String tag) {
        this.interval = interval;
        this.match = match;
        this.tag = tag;
    }
}

（Match class）

如您所见，此处的匹配项不会立即被添加为高亮，而是先收集到列表 matches 中。然后对该列表进行处理，使其不再包含重叠的匹配项，最后只把列表中剩余的、互不重叠的元素添加为高亮。

正如评论中所提到的，您必须决定各个匹配项之间的优先顺序。

例如，在搜索词为 "AB" 和 "BCD"、文档文本为 "ABCD" 的情况下，上面使用的比较方法 compareLowLengthTag 总是优先选择 AB 匹配，而下面的比较方法 compareLengthLowTag 优先选择较长的匹配 BCD，只有在长度相同的情况下才优先选择起始位置更靠前的匹配：

/**
 * Orders matches by descending interval length (high minus low), then by
 * ascending interval low bound, then by tag.
 */
static int compareLengthLowTag(Match a, Match b) {
    final int lengthOfA = a.interval.getHigh() - a.interval.getLow();
    final int lengthOfB = b.interval.getHigh() - b.interval.getLow();

    // Arguments swapped so that the longer match sorts first.
    final int byLengthDescending = Integer.compare(lengthOfB, lengthOfA);
    if (byLengthDescending != 0) {
        return byLengthDescending;
    }
    final int byLow = a.interval.getLow().compareTo(b.interval.getLow());
    if (byLow != 0) {
        return byLow;
    }
    return a.tag.compareTo(b.tag);
}

（ComplexHighlight方法compareLengthLowTag）

Pdfclown：如何覆盖pdfclown

1 个答案: