Question

我有一个（相当难看的）方法，用于从网站获取页面以及页面上的所有图像。获取网页完全没有问题，但获取到的图像却是损坏的，与服务器发送的内容完全不一致。我一直用于测试的 URI 是：http://www.themountaingoats.net/contact.html 。这个网页非常简单，并且具备测试所需的一切。

使用 \r 或 \n 作为行结束符会得到不同的结果，而使用 \r\n 时图像甚至无法打开。

/**
 * Fetches a web page and the images referenced by its {@code <img>} tags over one socket,
 * writing the page response to {@code <domain>.txt} and each image to its own file.
 *
 * <p>NOTE(review): every response, including the binary image data, is read through a
 * {@link BufferedReader} and written back with a {@link FileWriter}. Bytes are therefore
 * decoded into characters, split at line terminators, and re-encoded, which does not
 * preserve binary content.
 *
 * @param uri  host and path separated by the first {@code "/"}
 *             (NOTE(review): appears to expect no {@code "http://"} prefix — confirm with callers)
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if the connection or any read/write fails
 */
public static String GET(String uri, int port) throws IOException {

        // Everything before the first "/" is the host; everything after it is the path.
        // NOTE(review): a uri without any "/" makes the [1] access throw ArrayIndexOutOfBoundsException.
        String domain = uri.split("/",2)[0];
        String filename = uri.split("/",2)[1];
        Socket socket = new Socket(domain, port);


        // send the command to the server.
        System.out.println(socket.isConnected());
        DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
        // Character-oriented reader using the platform default charset; it is reused below for the images too.
        BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
        String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
        System.out.println(request);
        outToServer.writeBytes(request);

        //create a file to write in.
        File file = new File(domain+".txt");
        // if file doesnt exists, then create it
        if (!file.exists()) {
            file.createNewFile();
        }
        // Opening the PrintWriter truncates the file, so output of an earlier run is discarded.
        PrintWriter writer = new PrintWriter(file);
        writer.print("");
        writer.close();

        // Remaining-character budget for the response: 100 is a placeholder that is replaced
        // by the Content-Length value as soon as that header line is read.
        int characterCounter=100;
        while(characterCounter >= 0){
            // readLine() strips the line terminator and returns null at end of stream.
            // NOTE(review): a null here makes the startsWith() call below throw NullPointerException.
            String serverSentence = inFromServer.readLine();
            System.out.println(serverSentence);
            if (serverSentence.startsWith("Content-Length:")){
                characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
            }
            // Any line that does not start with one of the listed header names is charged against
            // the budget (its length plus 1, presumably for the stripped terminator).
            // NOTE(review): the status line, unlisted headers and the blank separator line are charged
            // too, and the count is in decoded characters while Content-Length is in bytes.
            if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
                    && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
                    && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
                characterCounter = characterCounter - serverSentence.length()-1;
            }

            //write in the file
            // Every line (headers included) is appended; the file is reopened and closed per line.
            FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
            BufferedWriter bw = new BufferedWriter(fw);
            bw.write(serverSentence+"\r\n");
            bw.close();
        }


        // Parse the saved response (headers and body together) and collect all <img> elements.
        Document doc = Jsoup.parse(file, "UTF-8");
        Elements imgs = doc.getElementsByTag("img");

        System.out.println(imgs);


        for (Element link : imgs) {
            String source = link.attr("src");

            // Turn an absolute URL on this host into a server-relative path.
            source = source.replace("http://"+domain+"", "");

            System.out.println(source);


            //create a file to write in.
            // The local file name is the path with every "/" replaced by ".".
            File image = new File(source.replace("/", "."));
            // if file doesnt exists, then create it
            if (!image.exists()) {
                image.createNewFile();
            }

            // Truncate any previous content of the image file.
            PrintWriter imageWriter = new PrintWriter(image);
            imageWriter.print("");
            imageWriter.close();

            // The image is requested on the same connection that delivered the page.
            String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
            System.out.println(requestImage);
            outToServer.writeBytes(requestImage);

            // flag turns true at the first non-empty line that follows an empty line,
            // i.e. at the first line after the blank header/body separator.
            boolean flag = false;
            String previousServerSentence = "something not empty";
            characterCounter=100;
            while(characterCounter > 0){
                // NOTE(review): the image bytes are decoded to characters and cut at every \r, \n or \r\n;
                // which terminator was actually present is lost at this point.
                String serverSentence = inFromServer.readLine();
                System.out.println(serverSentence);
                if (serverSentence.startsWith("Content-Length:")){
                    characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
                }

                if (!flag){
                    if ( previousServerSentence.matches("") && !serverSentence.matches("")){
                        flag = true;
                    }
                }

                // A line is treated as image data only when the body has started (flag) and the line
                // does not begin with any of the header names listed here.
                // NOTE(review): binary data that happens to start with one of these prefixes would be dropped.
                if ( (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
                        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
                        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") && !serverSentence.startsWith("ETag: ") && !serverSentence.startsWith("Accept-Ranges: ")
                        && !serverSentence.startsWith("Accept-Language: ") && !serverSentence.startsWith("Accept-Datetime: ") && !serverSentence.startsWith("Authorization: ") 
                        && !serverSentence.startsWith("Connection: ") && !serverSentence.startsWith("Content-Language: ") && !serverSentence.startsWith("Content-Length: ") 
                        && !serverSentence.startsWith("Content-Location: ")  && !serverSentence.startsWith("Content-MD5: ")  && !serverSentence.startsWith("Content-Range: ")
                        && !serverSentence.startsWith("Content-Type: ")  && !serverSentence.startsWith("Date: ")  && !serverSentence.startsWith("expect: ")
                        && !serverSentence.startsWith("From: ") && !serverSentence.startsWith("Host: ") && !serverSentence.startsWith("If-Match: ") && !serverSentence.startsWith("If-Modified-Since: ")
                        && !serverSentence.startsWith("Accept: ") && !serverSentence.startsWith("Accept-Charset: ") && !serverSentence.startsWith("Accept-Encoding: ")
                        && !serverSentence.startsWith("Age: ") && !serverSentence.startsWith("Allow: ") && !serverSentence.startsWith("Content-Encoding: ")
                        && !serverSentence.startsWith("If-None-Match: ") && !serverSentence.startsWith("If-Range: ") && !serverSentence.startsWith("If-Unmodified-Since: ")
                        && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Location: ") && !serverSentence.startsWith("Max-Forwards: ")
                        && !serverSentence.startsWith("Pragma: ") && !serverSentence.startsWith("Proxy-Authenticate: ") && !serverSentence.startsWith("Proxy-Authorization: ")
                        && !serverSentence.startsWith("Range: ") && !serverSentence.startsWith("Referer: ") && !serverSentence.startsWith("Retry-After: ")
                        && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("TE: ") && !serverSentence.startsWith("Trailer: ")
                        && !serverSentence.startsWith("Transfer-Encoding: ") && !serverSentence.startsWith("Upgrade: ") && !serverSentence.startsWith("User-Agent: ")
                        && !serverSentence.startsWith("Via: ") && !serverSentence.startsWith("Warning: ") && !serverSentence.startsWith("WWW-Authenticate: "))
                        && flag){
                    characterCounter = characterCounter - serverSentence.length()-1;
                    //write in the file

                    // Each "line" is re-encoded with the platform charset and always terminated with "\r",
                    // regardless of which terminator the server actually sent.
                    FileWriter fw = new FileWriter(image.getAbsoluteFile(),true);
                    BufferedWriter bw = new BufferedWriter(fw);
                    bw.write(serverSentence+"\r");
                    bw.close();


                }

                previousServerSentence = serverSentence;
            }


        }
        // NOTE(review): the socket is never closed in this version.
        return null;
    }

第一个图像是以 \r\n 作为行结束符得到的结果，第二个图像是以 \n 作为行结束符得到的结果，最后一个是原始图像。我完全不明白为什么图像会被破坏成这样。

所以我的问题是：为什么会发生这种情况，我该如何解决？

编辑：

/**
 * Fetches a web page through a line-based reader and then downloads each image referenced
 * by an {@code <img>} tag by reading raw bytes from the same socket.
 *
 * <p>The page response is stored in {@code <domain>.txt}; each image is stored in a file
 * named after its path with {@code "/"} replaced by {@code "."}.
 *
 * @param uri  host and path separated by the first {@code "/"}
 *             (NOTE(review): appears to expect no {@code "http://"} prefix — confirm with callers)
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if the connection or any read/write fails
 */
public static String GET(String uri, int port) throws IOException {

        /*
         * Retrieval of the webpage
         */

        // Everything before the first "/" is the host; everything after it is the path.
        String domain = uri.split("/",2)[0];
        String filename = uri.split("/",2)[1];
        Socket socket = new Socket(domain, port);


        // send the command to the server.
        System.out.println(socket.isConnected());
        DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
        // Character-oriented reader (platform default charset) used for the page response only.
        BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
        String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
        System.out.println(request);
        outToServer.writeBytes(request);

        //create a file to write in.
        File file = new File(domain+".txt");
        // if file doesnt exists, then create it
        if (!file.exists()) {
            file.createNewFile();
        }
        // Opening the PrintWriter truncates the file, so output of an earlier run is discarded.
        PrintWriter writer = new PrintWriter(file);
        writer.print("");
        writer.close();

        // Remaining-character budget: 100 is a placeholder until the Content-Length header is read.
        int characterCounter=100;
        while(characterCounter >= 0){
            // NOTE(review): readLine() returns null at end of stream, which would make startsWith() throw.
            String serverSentence = inFromServer.readLine();
            System.out.println(serverSentence);
            if (serverSentence.startsWith("Content-Length:")){
                characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
            }
            // Lines that do not start with one of the listed header names are charged against the
            // budget (length plus 1, presumably for the stripped line terminator).
            if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
                    && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
                    && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
                characterCounter = characterCounter - serverSentence.length()-1;
            }

            //write in the file
            // Every line (headers included) is appended; the file is reopened and closed per line.
            FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
            BufferedWriter bw = new BufferedWriter(fw);
            bw.write(serverSentence+"\r\n");
            bw.close();
        }

        /*
         * Retrieval of all the embedded images on the webpage that are on the same domain.
         */

        Document doc = Jsoup.parse(file, "UTF-8");
        Elements imgs = doc.getElementsByTag("img");

        System.out.println(imgs);



        for (Element link : imgs) {
            String source = link.attr("src");

            // Turn an absolute URL on this host into a server-relative path.
            source = source.replace("http://"+domain+"", "");

            System.out.println(source);

            //create a file to write in.
            File image = new File(source.replace("/", "."));
            // if file doesnt exists, then create it
            if (!image.exists()) {
                image.createNewFile();
            }

            // Initialize the streams.
            final FileOutputStream fileOutputStream = new FileOutputStream(image);
            // Raw byte stream of the same socket that inFromServer wraps.
            // NOTE(review): bytes the BufferedReader has already pulled into its own buffer are not
            // visible through this stream — confirm nothing is lost between page and image.
            final InputStream inputStream = socket.getInputStream();

            // Header end flag.
            boolean headerEnded = false;

            String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
            System.out.println(requestImage);
            outToServer.writeBytes(requestImage);

            int buffersize = 1000000;
            byte[] bytes = new byte[buffersize];
            int length;

            // Reads until end of stream (-1); Content-Length is not consulted here.
            while ((length = inputStream.read(bytes)) != -1) {
                // If the end of the header had already been reached, write the bytes to the file as normal.
                if (headerEnded){

                    fileOutputStream.write(bytes, 0, length);
                }
                // This locates the end of the header by comparing the current byte as well as the next 3 bytes
                // with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10).
                // If the end of the header is reached, the flag is set to true and the remaining data in the
                // currently buffered byte array is written into the file.
                else {
                    // NOTE(review): the scan and the write below are bounded by buffersize, not by
                    // length (the number of bytes this read() actually delivered), so everything up
                    // to the end of the 1,000,000-byte array is written, not just the received data.
                    // A terminator split across two reads is not detected.
                    for (int i = 0; i < buffersize-3; i++) {
                        if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) {
                            headerEnded = true;
                            fileOutputStream.write(bytes, i+4 , buffersize-i-4);
                            break;
                        }
                    }
                }
            }

            // NOTE(review): per the java.net.Socket documentation, closing the stream returned by
            // getInputStream() closes the socket itself, which affects every later loop iteration.
            inputStream.close();
            fileOutputStream.close();

        }
        socket.close();
        return null;
    }

这是我现在的结果：

我可以获得部分图片，但不是整张图片。使用缓冲区可以让我更进一步，甚至更远一点。

EDIT2：我找到了错误，只是几处细节上的问题。最终可用的代码：

/**
 * Fetches a web page and the same-host images it embeds over one HTTP/1.1 connection.
 *
 * <p>The complete page response (header block followed by the body, byte for byte) is
 * written to {@code <domain>.txt}. That file is parsed with Jsoup, and every {@code <img>}
 * whose {@code src} points at the same host is requested on the same socket; its body is
 * written unmodified to a file named after the request path with {@code "/"} replaced by
 * {@code "."}.
 *
 * <p>All responses are read as bytes from a single buffered stream. Each body is delimited
 * by its {@code Content-Length}; a response without one is read until the server closes the
 * connection, after which no further request can succeed. Chunked transfer encoding is
 * rejected with an {@link IOException}. HTTP status codes are not inspected.
 *
 * @param uri  host and optional path separated by the first {@code "/"}, without a scheme
 *             (for example {@code "www.example.com/index.html"})
 * @param port TCP port of the HTTP server
 * @return always {@code null}
 * @throws IOException if connecting, reading, or writing fails, or a response is malformed
 */
public static String GET(String uri, int port) throws IOException {

    /*
     * Retrieval of the webpage
     */

    String[] uriParts = uri.split("/", 2);
    String domain = uriParts[0];
    // A bare host such as "example.com" has no path part; request the root document then.
    String filename = uriParts.length > 1 ? uriParts[1] : "";

    // try-with-resources closes the socket (and with it both streams) on every exit path.
    try (Socket socket = new Socket(domain, port)) {
        System.out.println(socket.isConnected());
        DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
        // One byte stream serves every response. Wrapping the socket in a Reader as well would
        // let the Reader swallow bytes that belong to the next (binary) response.
        InputStream inFromServer = new java.io.BufferedInputStream(socket.getInputStream());

        // No trailing space after the HTTP version: the request line must end right at CRLF.
        String request = "GET /" + filename + " HTTP/1.1\r\n" + "Host: " + domain + "\r\n\r\n";
        System.out.println(request);
        outToServer.writeBytes(request);
        outToServer.flush();

        // FileOutputStream creates the file if needed and truncates previous content.
        File file = new File(domain + ".txt");
        try (FileOutputStream pageOut = new FileOutputStream(file)) {
            byte[] pageHeader = readHttpHeader(inFromServer);
            System.out.print(new String(pageHeader, java.nio.charset.StandardCharsets.ISO_8859_1));
            pageOut.write(pageHeader);
            copyHttpBody(inFromServer, pageOut, httpContentLength(pageHeader));
        }

        /*
         * Retrieval of all the embedded images on the webpage that are on the same domain.
         */

        Document doc = Jsoup.parse(file, "UTF-8");
        Elements imgs = doc.getElementsByTag("img");

        System.out.println(imgs);

        // Directory of the page, used to resolve relative image paths ("/" for a top-level page).
        String pagePath = "/" + filename;
        String pageDirectory = pagePath.substring(0, pagePath.lastIndexOf('/') + 1);

        for (Element link : imgs) {

            // Getting the link ready for GET query.
            String source = link.attr("src").replace("http://" + domain, "");
            System.out.println(source);

            // Anything that still carries a scheme or host lives elsewhere and cannot be
            // fetched through this connection; inline data URIs need no request at all.
            if (source.isEmpty() || source.startsWith("//") || source.contains("://")
                    || source.startsWith("data:")) {
                continue;
            }
            if (!source.startsWith("/")) {
                source = pageDirectory + source;
            }

            String requestImage = "GET " + source + " HTTP/1.1\r\n" + "Host: " + domain + "\r\n\r\n";
            System.out.println(requestImage);
            outToServer.writeBytes(requestImage);
            outToServer.flush();

            File image = new File(source.replace("/", "."));
            try (FileOutputStream imageOut = new FileOutputStream(image)) {
                byte[] imageHeader = readHttpHeader(inFromServer);
                // Exactly Content-Length bytes belong to this image; whatever follows is the
                // next response and must stay in the stream.
                copyHttpBody(inFromServer, imageOut, httpContentLength(imageHeader));
            }
        }
    }
    return null;
}

/** Reads one HTTP header block, up to and including the terminating {@code "\r\n\r\n"}. */
private static byte[] readHttpHeader(InputStream in) throws IOException {
    java.io.ByteArrayOutputStream header = new java.io.ByteArrayOutputStream();
    // Number of leading bytes of "\r\n\r\n" matched by the most recent input.
    int matched = 0;
    while (matched < 4) {
        int b = in.read();
        if (b == -1) {
            throw new java.io.EOFException("Connection closed before the end of the HTTP header");
        }
        header.write(b);
        int expected = (matched % 2 == 0) ? '\r' : '\n';
        if (b == expected) {
            matched++;
        } else {
            // A stray '\r' may itself be the start of the terminator.
            matched = (b == '\r') ? 1 : 0;
        }
    }
    return header.toByteArray();
}

/** Returns the Content-Length announced in a header block, or -1 when the header is absent. */
private static long httpContentLength(byte[] header) throws IOException {
    String text = new String(header, java.nio.charset.StandardCharsets.ISO_8859_1);
    long contentLength = -1;
    for (String line : text.split("\r\n")) {
        int colon = line.indexOf(':');
        if (colon < 0) {
            continue; // status line or blank line
        }
        String name = line.substring(0, colon).trim();
        String value = line.substring(colon + 1).trim();
        if (name.equalsIgnoreCase("Transfer-Encoding")
                && value.toLowerCase(java.util.Locale.ROOT).contains("chunked")) {
            throw new IOException("Chunked transfer encoding is not supported");
        }
        if (name.equalsIgnoreCase("Content-Length")) {
            try {
                contentLength = Long.parseLong(value);
            } catch (NumberFormatException e) {
                throw new IOException("Malformed Content-Length header: " + value, e);
            }
            if (contentLength < 0) {
                throw new IOException("Malformed Content-Length header: " + value);
            }
        }
    }
    return contentLength;
}

/** Copies a response body: exactly {@code contentLength} bytes, or until end of stream when it is negative. */
private static void copyHttpBody(InputStream in, FileOutputStream out, long contentLength)
        throws IOException {
    byte[] buffer = new byte[8192];
    long remaining = contentLength;
    while (contentLength < 0 || remaining > 0) {
        int wanted = contentLength < 0 ? buffer.length : (int) Math.min(buffer.length, remaining);
        int read = in.read(buffer, 0, wanted);
        if (read == -1) {
            if (contentLength < 0) {
                return; // body was delimited by the server closing the connection
            }
            throw new java.io.EOFException(
                    "Connection closed with " + remaining + " body bytes still missing");
        }
        out.write(buffer, 0, read);
        remaining -= read;
    }
}

Answer 1

尽可能避免使用原始套接字来处理http请求。

如果您可以使用单独的连接来检索图像文件，请参阅4ndrew的答案： https://stackoverflow.com/a/8679160/176873

如果您只能使用原始套接字，请避免使用 java.io.BufferedReader。BufferedReader 不应该用于读取二进制数据：您正在把二进制数据转换为字符串，并把它当作文本文件写入本地磁盘。

请参阅Alexay的解决方法： https://stackoverflow.com/a/34106534/176873

如何通过同一套接字连接发送图像和文本

1 个答案: