无法通过Wikipedia API下载数据

时间:2016-01-19 13:47:23

标签: c++ sockets http wikipedia-api

我正在尝试从维基百科下载数据。我发送了一个GET请求,但返回只包含页面状态和一些HTML详细信息。

我做错了什么?

#include <winsock2.h>
#include <WS2tcpip.h>
#include <windows.h>
#include <iostream>

int main(){

WSADATA wsaData;
if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
    cout << "WSAStartup failed.\n";
    system("pause");
    return -1;
}

struct addrinfo hints;
ZeroMemory(&hints, sizeof(hints));
hints.ai_family = AF_INET;          
hints.ai_protocol = IPPROTO_TCP;    
hints.ai_socktype = SOCK_STREAM;    


struct addrinfo* targetAdressInfo = NULL;
DWORD getAddrRes = getaddrinfo("www.wikipedia.org", NULL, &hints, &targetAdressInfo);
if (getAddrRes != 0 || targetAdressInfo == NULL)
{
    cout << "Could not resolve the Host Name" << endl;
    system("pause");
    WSACleanup();
    return -1;
}

SOCKADDR_IN sockAddr;
sockAddr.sin_addr = ((struct sockaddr_in*) targetAdressInfo->ai_addr)->sin_addr;   
sockAddr.sin_family = AF_INET;  
sockAddr.sin_port = htons(80);  

freeaddrinfo(targetAdressInfo);

SOCKET webSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
if (webSocket == INVALID_SOCKET)
{
    cout << "Creation of the Socket Failed" << endl;
    system("pause");
    WSACleanup();
    return -1;
}

if (connect(webSocket, (SOCKADDR*)&sockAddr, sizeof(sockAddr)) != 0)
{
    cout << "Could not connect";
    system("pause");
    closesocket(webSocket);
    WSACleanup();
    return -1;
}

// Sending a HTTP-GET-Request to the Web Server
const char* httpRequest = "GET / http://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=Google HTTP/1.1\r\n\r\n";
int sentBytes = send(webSocket, httpRequest, strlen(httpRequest), 0);
if (sentBytes < strlen(httpRequest) || sentBytes == SOCKET_ERROR)
{
    cout << "Could not send the request to the Server" << endl;
    system("pause");
    closesocket(webSocket);
    WSACleanup();
    return -1;
}

char buffer[1000000];

ZeroMemory(buffer, sizeof(buffer));
int dataLen;
while ((dataLen = recv(webSocket, buffer, sizeof(buffer), 0) > 0))
{
    int i = 0;
    while (buffer[i] >= 32 || buffer[i] == '\n' || buffer[i] == '\r') {
        cout << buffer[i];
        i += 1;
    }
}

closesocket(webSocket);
WSACleanup();

system("pause");
return 0;
}

2 个答案:

答案 0 :(得分:0)

如果我记得维基百科会向您发送HTTP 301 redirection回复。

您需要解析它以获取目标网址,然后您可以从中访问文章内容。

您可以看到执行此操作here的一些示例代码。

答案 1 :(得分:0)

如何使用python脚本?

import urllib2
def get_page(url):
  request = urllib2.Request(url)
  request = urllib2.urlopen(request)
  data = request.read()
  return data
url = "http://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=Google HTTP/1.1\r\n\r\n"
print get_page(url)