1.我使用Httpclient抓取网址并使用redis来计算成功数字,最后,日志显示有29个网址没有成功抓取,3小时前最后一次成功记录。 2.我使用jstack查看线程信息,其中29个线程状态是可运行的,堆栈信息如下 3.我使用线程池执行以下方法来抓取url,方法没有完成。
"爬取线程: 942" prio=10 tid=0x00007fe048237800 nid=0x1f69 runnable [0x00007fdf0fefe000]
java.lang.Thread.State: RUNNABLE
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:152)
at java.net.SocketInputStream.read(SocketInputStream.java:122)
at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137)
at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153)
at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:282)
at org.apache.http.impl.io.ChunkedInputStream.getChunkSize(ChunkedInputStream.java:264)
at org.apache.http.impl.io.ChunkedInputStream.nextChunk(ChunkedInputStream.java:225)
at org.apache.http.impl.io.ChunkedInputStream.read(ChunkedInputStream.java:184)
at org.apache.http.impl.io.ChunkedInputStream.read(ChunkedInputStream.java:213)
at org.apache.http.impl.io.ChunkedInputStream.close(ChunkedInputStream.java:315)
at org.apache.http.impl.execchain.ResponseEntityProxy.streamClosed(ResponseEntityProxy.java:140)
at org.apache.http.conn.EofSensorInputStream.checkClose(EofSensorInputStream.java:228)
at org.apache.http.conn.EofSensorInputStream.close(EofSensorInputStream.java:174)
at org.apache.http.util.EntityUtils.consume(EntityUtils.java:88)
at org.apache.http.util.EntityUtils.consumeQuietly(EntityUtils.java:67)
at com.eversec.crawl.http.HttpManager.getRequest(HttpManager.java:164)
at com.eversec.crawl.StartExec$ExecUrl.run(StartExec.java:204)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
public DomainInfo getRequest(String url) throws Exception {
CloseableHttpClient httpClient = getHttpClient();
HttpGet httpGet;
CloseableHttpResponse response = null;
try {
httpGet = new HttpGet(url);
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36");
response = httpClient.execute(httpGet);
// 判断返回状态是否为200
int httpResponseStateOk = 200;
if (response.getStatusLine().getStatusCode() == httpResponseStateOk) {
Header contentType = response.getEntity().getContentType();
if (contentType != null && contentType.getValue().toLowerCase().startsWith("text/html")) {
HttpEntity entity = response.getEntity();
Header[] contentLengthHeaders = response.getHeaders("Content-Length");
return new DomainInfo(StringUtils.getCharset(entity.getContentType().getValue()), contentBytes);
}
}
} finally {
if (response != null) {
EntityUtils.consumeQuietly(response.getEntity());
}
}
return null;
}
builder.setConnectTimeout(properties.getConnectTimeout());
builder.setConnectionRequestTimeout(properties.getConnectionRequestTimeout());
builder.setSocketTimeout(properties.getSocketTimeout());
RequestConfig config = builder.build();