在我的Java Spring MVC
网络应用程序中,我尝试使用Apache Tika
从网址中提取文字。我已将以下Maven
依赖项添加到我的pom.xml
文件
<!-- Apache Tika -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.6</version>
</dependency>
我使用以下代码从网址获取文字内容:
URL url = new URL("http://www.basicsbehind.com/extract-text-webpage/");
InputStream input = url.openStream();
LinkContentHandler linkHandler = new LinkContentHandler();
ContentHandler textHandler = new BodyContentHandler();
ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler, toHTMLHandler);
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
Parser parser = new AutoDetectParser();
parser.parse(input, teeHandler, metadata, parseContext);
System.out.println("title:\n" + metadata.get("title"));
System.out.println("links:\n" + linkHandler.getLinks());
System.out.println("text:\n" + textHandler.toString());
System.out.println("html:\n" + toHTMLHandler.toString());
代码工作正常,我能够获得所需的数据。但是,如果我改为使用HTTPS网址,则会抛出以下异常:
javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: PKIX path building failed: sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target
at sun.security.ssl.Alerts.getSSLException(Alerts.java:192)
at sun.security.ssl.SSLSocketImpl.fatal(SSLSocketImpl.java:1949)
at sun.security.ssl.Handshaker.fatalSE(Handshaker.java:302)
at sun.security.ssl.Handshaker.fatalSE(Handshaker.java:296)
at sun.security.ssl.ClientHandshaker.serverCertificate(ClientHandshaker.java:1514)
at sun.security.ssl.ClientHandshaker.processMessage(ClientHandshaker.java:216)
at sun.security.ssl.Handshaker.processLoop(Handshaker.java:1026)
at sun.security.ssl.Handshaker.process_record(Handshaker.java:961)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:1062)
at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1375)
at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1403)
at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1387)
at sun.net.www.protocol.https.HttpsClient.afterConnect(HttpsClient.java:559)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:185)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1546)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1474)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getInputStream(HttpsURLConnectionImpl.java:254)
at java.net.URL.openStream(URL.java:1045)
如果我使用以下代码
,则不会对所有HTTPS URL抛出此异常URL url = new URL("https://www.smartwcm.com");
抛出异常。我无法找到解决方案。