我正在编写一段使用 NIO/Selector 进行网页抓取的代码。它可以运行:我收到 OP_CONNECT,然后发送 GET 请求,并收到了整个 HTML 页面。但是,在那之后,我不知道如何判断读取已经完成。我确实看到了页面的结尾(</html>),这意味着整个页面已被发送,但 SocketChannel.read 并没有返回 -1 来表示流的结束。非常感谢任何帮助!
以下是整个示例代码:
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.StandardSocketOptions;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;
import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Minimal non-blocking HTTP client built on a single {@link Selector} and one
 * {@link SocketChannel}.
 *
 * <p>The exchange is a small state machine driven by the key's interest set:
 * {@code OP_CONNECT} -> {@code OP_WRITE} (send the GET request) -> {@code OP_READ}
 * (collect the response until the server closes the connection).
 *
 * <p>The request carries {@code Connection: close}. HTTP/1.1 connections are
 * persistent by default, so without that header the server keeps the socket open
 * after the response and {@link SocketChannel#read(ByteBuffer)} never returns -1.
 *
 * <p>The collected bytes are the raw response: status line, headers and body
 * (possibly chunk-encoded). No HTTP parsing is performed here.
 *
 * <p>Instances are not thread-safe; one instance performs one exchange at a time.
 */
public class HttpClientTest {
    private static final Logger logger = LoggerFactory.getLogger(HttpClientTest.class);
    private static final String BASE_URL_STR = "https://www.youtube.com/channel";
    private static final String CHANNEL_ID = "UCDm6kPZFCoT7altG4WNGy-A";

    /** Size used for the socket send/receive buffers and for the read buffer. */
    private static final int SOCKET_BUFFER_SIZE = 128 * 1024;

    /** Upper bound, in milliseconds, for a single blocking {@code select} call. */
    private static final long SELECT_TIMEOUT_MILLIS = 2000L;

    /** Accumulates the raw response bytes across all read events. */
    private final ByteArrayOutputStream baHtmlPage = new ByteArrayOutputStream();

    /** Scratch buffer reused for every {@code read} call. */
    private final ByteBuffer buffer = ByteBuffer.allocate(SOCKET_BUFFER_SIZE);

    /** Unsent remainder of the request; {@code null} when no write is in progress. */
    private ByteBuffer pendingRequest = null;

    /** Decoded response, set once the server has closed the connection. */
    private String htmlPage = null;

    /**
     * Connects, sends the request and reads the response, returning once the
     * server has closed the connection.
     *
     * <p>I/O failures are logged rather than propagated.
     *
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for channel events
     */
    private void startHttpClient() throws InterruptedException {
        try (Selector selector = Selector.open();
             SocketChannel socketChannel = SocketChannel.open()) {

            if (!socketChannel.isOpen() || !selector.isOpen()) {
                System.out.println("The socket channel or selector cannot be opened!");
                return;
            }

            socketChannel.configureBlocking(false);
            socketChannel.setOption(StandardSocketOptions.SO_RCVBUF, SOCKET_BUFFER_SIZE);
            socketChannel.setOption(StandardSocketOptions.SO_SNDBUF, SOCKET_BUFFER_SIZE);
            socketChannel.setOption(StandardSocketOptions.SO_KEEPALIVE, true);

            // A non-blocking connect may complete immediately; in that case
            // OP_CONNECT is never signalled, so go straight to writing.
            boolean connectedImmediately = socketChannel.connect(createSocketAddress(CHANNEL_ID));
            if (connectedImmediately) {
                baHtmlPage.reset();
            }
            socketChannel.register(selector,
                    connectedImmediately ? SelectionKey.OP_WRITE : SelectionKey.OP_CONNECT);

            boolean finished = false;
            while (!finished) {
                // Block until an event arrives instead of polling and sleeping,
                // so events are handled as soon as they occur.
                int readyCount = selector.select(SELECT_TIMEOUT_MILLIS);

                // select() returns early on interrupt without throwing; surface
                // it so the loop cannot spin on an interrupted thread.
                if (Thread.interrupted()) {
                    throw new InterruptedException("interrupted while waiting for channel events");
                }
                if (readyCount == 0) {
                    System.out.println("no events within 2 sec");
                    continue;
                }

                Iterator<SelectionKey> keys = selector.selectedKeys().iterator();
                while (keys.hasNext()) {
                    SelectionKey key = keys.next();
                    // Selected keys are not removed automatically.
                    keys.remove();
                    if (!key.isValid()) {
                        continue;
                    }

                    if (key.isConnectable() && socketChannel.finishConnect()) {
                        System.out.println("Key: OP_CONNECT");
                        baHtmlPage.reset();
                        // Connected --> send the HTTP request next.
                        key.interestOps(SelectionKey.OP_WRITE);
                    } else if (key.isReadable()) {
                        System.out.println("Key: OP_READ");
                        if (readResponse(key)) {
                            logger.info("finished reading, htmlpage:{}", htmlPage);
                            // The key is cancelled and the channel closed: stop.
                            finished = true;
                        }
                    } else if (key.isWritable()) {
                        System.out.println("Key: OP_WRITE");
                        if (writeHttpRequest(key)) {
                            // Request fully sent --> wait for the response.
                            key.interestOps(SelectionKey.OP_READ);
                        }
                    }
                }
            }
        } catch (IOException ex) {
            logger.error("HTTP exchange failed", ex);
        }
    }

    /**
     * Builds the address to connect to.
     *
     * <p>NOTE(review): the target is hard-coded to {@code http://www.google.com}
     * and {@code channelID} is ignored, while the request sent by
     * {@link #writeHttpRequest(SelectionKey)} names {@code www.youtube.com} as
     * its host. Confirm which server is intended.
     *
     * @param channelID currently unused
     * @return the host and port taken from the URL; port 80 when the URL has none
     * @throws MalformedURLException if the URL string cannot be parsed
     */
    private static InetSocketAddress createSocketAddress(String channelID) throws MalformedURLException {
        String urlStr = "http://www.google.com";
        URL url = new URL(urlStr);
        int port = url.getPort();
        if (port == -1) {
            port = 80;
        }
        return new InetSocketAddress(url.getHost(), port);
    }

    /**
     * Drains all bytes currently available on the channel into
     * {@code baHtmlPage}.
     *
     * <p>{@code read} returns 0 when nothing more is available right now and -1
     * at end-of-stream. Neither value may be used as a length, so both are
     * checked before the buffer contents are touched.
     *
     * @param key the selection key whose channel is readable
     * @return {@code true} if the peer closed the connection and the full
     *         response is now in {@code htmlPage}; {@code false} if more data
     *         may still arrive
     * @throws IOException if reading from or closing the channel fails
     */
    private boolean readResponse(SelectionKey key) throws IOException {
        SocketChannel socketChannel = (SocketChannel) key.channel();

        int numRead;
        while (true) {
            buffer.clear();
            numRead = socketChannel.read(buffer);
            if (numRead <= 0) {
                break;
            }
            baHtmlPage.write(buffer.array(), 0, numRead);
            // Debug output only: a multi-byte character split across two reads
            // will print incorrectly here, but the accumulated bytes are intact.
            System.out.println("Server sent:" + new String(buffer.array(), 0, numRead, "UTF-8"));
        }

        if (numRead != -1) {
            return false;
        }

        System.out.println("Connection closed by: " + socketChannel.getRemoteAddress());
        key.cancel();
        socketChannel.close();
        htmlPage = baHtmlPage.toString("UTF-8");
        return true;
    }

    /**
     * Sends the GET request, resuming after a partial write.
     *
     * <p>A non-blocking {@code write} may accept only part of the buffer. The
     * unsent remainder is kept in {@code pendingRequest}, so the next
     * {@code OP_WRITE} event continues where this one stopped.
     *
     * @param key the selection key whose channel is writable
     * @return {@code true} once the entire request has been written
     * @throws IOException if writing to the channel fails
     */
    private boolean writeHttpRequest(SelectionKey key) throws IOException {
        SocketChannel socketChannel = (SocketChannel) key.channel();
        String request =
                "GET /channel/UCDm6kPZFCoT7altG4WNGy-A HTTP/1.1\r\n" +
                "Host: www.youtube.com\r\n" +
                "Cache-Control: no-cache\r\n" +
                // Ask the server to close the connection after the response so
                // that end-of-stream (-1) marks the end of the page.
                "Connection: close\r\n\r\n";

        if (pendingRequest == null) {
            pendingRequest = ByteBuffer.wrap(request.getBytes("UTF-8"));
        }

        socketChannel.write(pendingRequest);
        if (pendingRequest.hasRemaining()) {
            return false;
        }

        pendingRequest = null;
        System.out.printf("Request written:%s\n", request);
        return true;
    }

    /**
     * Runs a single HTTP exchange.
     *
     * @param args ignored
     * @throws InterruptedException if the thread is interrupted during the exchange
     */
    public static void main(String[] args) throws InterruptedException {
        HttpClientTest client = new HttpClientTest();
        client.startHttpClient();
    }
}
答案 0 :(得分:1)
您正在执行 HTTP/1.1 请求,该请求默认启用保持活动(keep-alive)。这意味着,一旦发送完整响应,服务器不必关闭连接,而是会将其保持打开一段时间,以便接收更多请求,从而节省再次建立 TCP 连接的开销。
虽然这有助于在浏览器的正常情况下提高性能,但在您的情况下它并没有帮助。我建议使用 HTTP/1.0 而不是 HTTP/1.1,这样您就不必处理保持活动或其他 HTTP/1.1 功能,如分块编码。除此之外,更建议使用现有的 HTTP 库来处理所有这些问题。