WinSock Chunked数据编码

时间:2013-01-01 18:54:48

标签: c++ sockets winsock2 http-chunked

当我连接到某些网站时,它会给我:

Content-Type: text/html; charset=ISO-8859-1

Connection: close

Transfer-Encoding: chunked

Date: Tue, 01 Jan 2013 18:49:53 GMT   


fff8

在文件的末尾,它看起来像:

</script><!-- vBadvanced 1-3-9-4-8-0 -->

</body>
</html

1

>

0

但是,当我请求 stackoverflow.com 时,输出的内容格式完全正常。源代码里可能会多出一些空行,但这没有关系。为什么其他网站的响应里会被插入这些数字?

我该如何解决?另外,我如何将 HTTP 响应头(header)与 HTML 正文本身分开?

我的代码如下:

#define _WIN32_WINNT 0x501

#include <iostream>
#include <winsock2.h>
#include <ws2tcpip.h>
#include <stdio.h>
#include <fstream>
#include <vector>

using namespace std;

// Requests "/" from the given host over plain HTTP (TCP port 80) and reads the
// raw reply (status line, headers and body) into a local string until the peer
// closes the connection.
//
// WebPage: host name to resolve and to send in the "Host:" header.
//
// Returns nothing; on any failure (Winsock start-up, name resolution, connect,
// send) the function cleans up and returns early.
//
// NOTE: the request is HTTP/1.1, so a server is allowed to answer with
// "Transfer-Encoding: chunked". The bytes collected in Response are the raw
// wire data and are NOT de-chunked here; the collected response is not
// returned to the caller.
void Get(std::string WebPage)
{
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2,2), &wsaData) != 0) return;

    std::string Header = "GET / HTTP/1.1\r\n";
    Header += "Host: " + WebPage + "\r\n";
    Header += "Connection: close\r\n";
    Header += "\r\n";

    // Restrict the lookup to IPv4/TCP and let getaddrinfo fill in port 80, so
    // the returned sockaddr can be handed to connect() as-is.
    struct addrinfo hints = {};
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_TCP;

    struct addrinfo *result = NULL;
    if (getaddrinfo(WebPage.c_str(), "80", &hints, &result) != 0 || result == NULL)
    {
        WSACleanup();
        return;
    }

    // Try each resolved address until one accepts the connection.
    SOCKET Socket = INVALID_SOCKET;
    for (struct addrinfo *entry = result; entry != NULL; entry = entry->ai_next)
    {
        Socket = socket(entry->ai_family, entry->ai_socktype, entry->ai_protocol);
        if (Socket == INVALID_SOCKET) continue;

        if (connect(Socket, entry->ai_addr, static_cast<int>(entry->ai_addrlen)) != SOCKET_ERROR) break;

        closesocket(Socket);
        Socket = INVALID_SOCKET;
    }
    freeaddrinfo(result);

    if (Socket == INVALID_SOCKET)
    {
        WSACleanup();
        return;
    }

    // send() may accept fewer bytes than requested, so loop until the whole
    // request has been written or an error occurs.
    const char *pending = Header.c_str();
    int remaining = static_cast<int>(Header.size());
    bool requestSent = true;
    while (remaining > 0)
    {
        const int written = send(Socket, pending, remaining, 0);
        if (written == SOCKET_ERROR)
        {
            requestSent = false;
            break;
        }
        pending += written;
        remaining -= written;
    }

    std::string Response;
    if (requestSent)
    {
        // Nothing more will be sent; signal that to the peer.
        shutdown(Socket, SD_SEND);

        char Buffer[8192];
        for (;;)
        {
            const int bytes = recv(Socket, Buffer, sizeof(Buffer), 0);
            // 0 means the peer closed the connection, SOCKET_ERROR (-1) means
            // failure. Check before touching the buffer: a negative count must
            // never reach append(), and no terminator is needed because
            // append() takes an explicit length.
            if (bytes <= 0) break;
            Response.append(Buffer, static_cast<size_t>(bytes));
        }
    }

    // Single exit path from here on, so the socket and Winsock are always released.
    closesocket(Socket);
    WSACleanup();
}

// Program entry point: performs a single HTTP fetch of the hard-coded host.
int main()
{
    const std::string host = "google.com";
    Get(host);
    return 0;
}

2 个答案:

答案 0 :(得分:3)

请参阅此Wiki页面:http://en.wikipedia.org/wiki/Chunked_transfer_encoding

每个十六进制数(块长度)之后紧跟着指定大小的实际块数据(有效负载),随后是下一个块长度。如果块长度为零,则其后不再有任何数据字节(表示数据结束)。这些元素之间以换行符(CRLF)分隔。我不确定您贴出的内容能否直接正确地拼接起来;看起来您需要处理多个连续的换行符。只需在浏览器中打开该页面并查看其源代码,即可进行对照。

编辑:

刚刚找到这个嗅探工具,它显示了我想知道的所有细节:

http://web-sniffer.net/

答案 1 :(得分:-1)

此函数会对您的 HTTP 数据进行“去分块”(unchunk)处理——它是用 VB6 写的,但您应该能看懂思路(确实是很旧的代码)

' Decodes an HTTP response whose body uses "Transfer-Encoding: chunked".
'
' Indata:  the raw response text (status line + headers + CRLF CRLF + body).
' Returns: the header block (including its terminating blank line) followed by
'          the concatenated chunk payloads. If the response is not chunked, is
'          malformed or truncated, or any runtime error occurs, the input is
'          returned unchanged.
Private Function UnChunk(Indata As String) As String
  Dim headerEnd As Long
  Dim headerText As String
  Dim lowerHeader As String
  Dim sizeStart As Long
  Dim sizeEnd As Long
  Dim sizeText As String
  Dim extPos As Long
  Dim chunksize As Long
  Dim decoded As String

  'can't let this crash: any runtime error falls back to returning the input
  On Error GoTo returnInData

  'the header block ends with an empty line (CRLF CRLF)
  headerEnd = InStr(Indata, vbCrLf & vbCrLf)
  If headerEnd = 0 Then GoTo returnInData   'invalid http

  '+3 keeps both CRLFs of the terminator in the header text
  headerText = Left(Indata, headerEnd + 3)

  'only chunked when BOTH markers are present, so a missing one means
  '"not chunked"; look in the header only so body text cannot match
  lowerHeader = LCase(headerText)
  If InStr(lowerHeader, "transfer-encoding:") = 0 Or InStr(lowerHeader, "chunked") = 0 Then
    GoTo returnInData
  End If

  decoded = headerText
  'first chunk-size line starts right after the blank line
  sizeStart = headerEnd + 4

  Do
    'the size line runs up to the next CRLF
    sizeEnd = InStr(sizeStart, Indata, vbCrLf)
    If sizeEnd = 0 Then GoTo returnInData   'truncated response

    sizeText = Mid(Indata, sizeStart, sizeEnd - sizeStart)
    'drop optional chunk extensions ("<hex>;name=value")
    extPos = InStr(sizeText, ";")
    If extPos > 0 Then sizeText = Left(sizeText, extPos - 1)
    sizeText = Trim(sizeText)

    'must be 1-7 hex digits so the value always fits in a Long
    If Len(sizeText) = 0 Or Len(sizeText) > 7 Then GoTo returnInData
    If sizeText Like "*[!0-9A-Fa-f]*" Then GoTo returnInData

    'the trailing "&" forces a Long literal; without it values such as
    '"fff8" are read as a 16-bit Integer and come out negative
    chunksize = Val("&h" & sizeText & "&")

    'by spec, a zero-length chunk marks the end of the body
    If chunksize = 0 Then Exit Do

    'the payload must be followed by CRLF, otherwise the data is damaged
    If Mid(Indata, sizeEnd + 2 + chunksize, 2) <> vbCrLf Then GoTo returnInData

    decoded = decoded & Mid(Indata, sizeEnd + 2, chunksize)

    'jump past the payload and its CRLF to the next size line; scanning for
    'the next CRLF instead would misread line breaks inside the payload
    sizeStart = sizeEnd + 2 + chunksize + 2
  Loop

  UnChunk = decoded
  Exit Function

  'just in case this fails, return the input data
returnInData:
  UnChunk = Indata
End Function