PDFBox的内存泄漏问题

时间:2018-07-31 14:26:35

标签: pdfbox apache-tika

我在应用程序中使用 PDFBox 2.0.9 版本,需要解析从网上下载的大型 PDF 文件。以下是我正在使用的代码:

MimeTypeDetector 类

/**
 * Bundles the objects needed to detect the MIME type of, and then parse, one
 * in-memory document. Accessors and mutators are generated by Lombok.
 */
@Getter
@Setter
class MimeTypeDetector {
    // Raw document bytes; this same stream is later handed to the parser.
    private ByteArrayInputStream byteArrayInputStream;
    // Collaborators that do not depend on the constructor argument are created inline.
    // NOTE(review): a write limit of -1 presumably disables Tika's character cap,
    // so the full extracted text is held in memory -- confirm against Tika docs.
    private BodyContentHandler bodyContentHandler = new BodyContentHandler(-1);
    private Metadata metadata = new Metadata();
    private ParseContext parseContext = new ParseContext();
    private Detector detector = new DefaultDetector();
    // Detection-only view over the document bytes.
    private TikaInputStream tikaInputStream;

    /**
     * @param byteArrayInputStream the document content to inspect
     */
    MimeTypeDetector(ByteArrayInputStream byteArrayInputStream) {
        this.byteArrayInputStream = byteArrayInputStream;
        // The shield is meant to keep a close() of the Tika stream from closing
        // the underlying byte stream, which is reused for parsing afterwards.
        CloseShieldInputStream shielded = new CloseShieldInputStream(byteArrayInputStream);
        this.tikaInputStream = TikaInputStream.get(shielded);
    }
}


/**
 * Downloads the resource at {@code url}, detects its content type and hands it
 * to the matching crawler branch. Unsupported types are logged and the URL is
 * removed from {@code linksVisited}.
 *
 * @param url         the address to fetch
 * @param domainGroup passed through to the PDF crawling step
 */
private void crawlAndSave(String url, DomainGroup domainGroup)  {
    MimeTypeDetector detectorHolder = null;
    try {
        final String decodedUrl = URLDecoder.decode(url, WebCrawlerConstants.UTF_8);
        // The whole response body is buffered in memory before any detection happens.
        final byte[] payload = HTMLFetcher.fetch(WebCrawlerUtil.encodeUrl(url));
        detectorHolder = new MimeTypeDetector(new ByteArrayInputStream(payload));

        final String detectedType = getContentType(detectorHolder);
        if (isPDF(detectedType)) {
            crawlPDFContent(decodedUrl, detectorHolder, domainGroup);
        } else if (isWebPage(detectedType)) {
            // fetching HTML web Page Content
        } else {
            log.warn("Skipping URL::" + url + ".Not a supported crawler format");
            linksVisited.remove(url);
        }
    } catch (IOException e) {
        log.error("crawlAndSave:: Error occurred while decoding URL:" + url + " : " + e.getMessage());
        // some catch operation
    } finally {
        // The holder is null only when fetching or construction failed.
        if (detectorHolder != null) {
            IOUtils.closeQuietly(detectorHolder.getByteArrayInputStream());
        }
    }
}

/**
 * Detects the content type of the document held by {@code mimeTypeDetector}
 * and closes the holder's Tika stream afterwards.
 *
 * @param mimeTypeDetector holder supplying the detector, metadata and Tika stream
 * @return the string form of the value returned by the detector
 * @throws IOException if detection or closing the Tika stream fails
 */
private String getContentType(MimeTypeDetector mimeTypeDetector) throws IOException {
    // try-with-resources closes the Tika stream even when detect() throws;
    // previously a failed detection left the stream open.
    try (TikaInputStream tikaInputStream = mimeTypeDetector.getTikaInputStream()) {
        return mimeTypeDetector.getDetector()
                .detect(tikaInputStream, mimeTypeDetector.getMetadata())
                .toString();
    }
}

private void crawlPDFContent(String url, MimeTypeDetector mimeTypeDetector, DomainGroup domainGroup) {
    try {
        private PDFParser pdfParser = new PDFParser();
        pdfParser.parse(mimeTypeDetector.getByteArrayInputStream(), mimeTypeDetector.getBodyContentHandler(),
                mimeTypeDetector.getMetadata(), mimeTypeDetector.getParseContext());
        // Some Database operation
    } catch (IOException | TikaException | SAXException e) {
        //Some Catch operation
        log.error("crawlPDFContent:: Error in crawling PDF Content" + " : " + e.getMessage());
    }
}

HTML 提取程序(HTMLFetcher 类)

/**
 * Static helper that downloads the raw bytes of a document over HTTP(S).
 *
 * <p>SECURITY NOTE(review): {@link #fetch(URL)} installs a trust-all
 * {@link TrustManager} and an accept-all {@link HostnameVerifier} as the
 * defaults of {@link HttpsURLConnection}, and may replace the default
 * {@link Authenticator}. These are process-wide settings and disable TLS
 * certificate and host name validation; confirm this is intended.
 */
public class HTMLFetcher {

    private HTMLFetcher() {
    }

    /**
     * Fetches the document at the given URL, using {@link URLConnection}.
     *
     * @param url the location of the document to download
     * @return the complete response body
     * @throws IOException if the connection cannot be opened or the body cannot be read
     */
    public static byte[] fetch(final URL url) throws IOException {

        TrustManager[] trustAllCerts = new TrustManager[]{new X509TrustManager() {
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                // The X509TrustManager contract requires a non-null (possibly empty) array.
                return new java.security.cert.X509Certificate[0];
            }

            @Override
            public void checkClientTrusted(X509Certificate[] certs, String authType) {
                // Intentionally empty: every client certificate is accepted.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] certs, String authType) {
                // Intentionally empty: every server certificate is accepted.
            }

        }};

        try {
            SSLContext sc = SSLContext.getInstance("SSL");
            sc.init(null, trustAllCerts, new java.security.SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            // Best effort: on failure the previously configured socket factory stays in place.
            e.printStackTrace();
        }

        // Create all-trusting host name verifier
        HostnameVerifier allHostsValid = (hostname, session) -> true;

        HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);

        setAuthentication(url);
        //Taken from Boilerpipe
        final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // The stream and the connection are released even when reading the body fails;
        // previously an exception from toByteArray() leaked both.
        try (InputStream in = conn.getInputStream()) {
            return IOUtils.toByteArray(in);
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Installs a default {@link Authenticator} when credentials are available
     * for the URL; otherwise leaves the current default untouched.
     *
     * @param url the URL from which credentials are looked up
     */
    private static void setAuthentication(URL url) {
        AuthenticationDTO authenticationDTO = WebCrawlerUtil.getAuthenticationFromUrl(url);
        if (Objects.nonNull(authenticationDTO)) {
            Authenticator.setDefault(new Authenticator() {
                @Override
                protected PasswordAuthentication getPasswordAuthentication() {
                    return new PasswordAuthentication(authenticationDTO.getUserName(),
                            authenticationDTO.getPassword().toCharArray());
                }
            });
        }
    }
}

但是当我检查内存统计信息时,内存使用量一直在增加。我使用 VisualVM 和 YourKit Java Profiler 对此进行了验证。

检查附件图像。

enter image description here

我做错了什么吗?我搜索了类似的问题(例如这个和这个),但其中提到此问题已在最新版本中修复。

1 个答案:

答案 0 :(得分:0)

请在加载文档时使用 MemoryUsageSetting.setupTempFileOnly()。