无论使用哪个库,外部GET请求永远不会在某些网站上完成,为什么?

时间:2018-01-29 08:13:41

标签: node.js google-app-engine google-cloud-functions axios http-get

我的Google Cloud Functions从数据库获取URL,然后使用GET请求检索相关网页的来源。我使用了Axios,Request和本机HTTP(S)模块。

大多数网站都运行得很好,没有任何问题。然而,对于该URL https://www.healthline.com/health/food-nutrition/cricket-flour-nutrition,Axios和Request只会挂起,直到我的函数超时,但是使用我的本机HTTPS测试,它从URL下载块但从未完成,然后挂起直到函数超时。

These are the logs with the status code, the headers from the response, and the chunks i receive... you can see that it's not finished but still hangs until it timeouts.

我的代码非常简单......

const https = require('https');
const newAgent = new https.Agent({ keepAlive: true });

console.log('-> GET ', document.location.href);

const requestOptions = {
    agent: newAgent,
    hostname: document.location.hostname,
    path: document.location.path
};

var fetchReq = https.request(requestOptions, (res) => {

    let source = '';
    console.log('STATUS: ' + res.statusCode);

    Object.keys(res.headers).forEach(h => {
        console.log('-> ' + h + ' -> ', JSON.stringify(res.headers[h]));
    });

    res.setEncoding('utf8');

    res.on('data', (chunk) => {
        console.log(`-----> CHUNK:`, chunk.substring(0, 40) + ' ... ', chunk.substring(chunk.length - 41, chunk.length - 1));
        source += chunk;
    });

    res.on('close', () => {
        console.log(`-----> CLOSED STREAM`);
    });

    res.on('end', () => {
        console.log(`-----> STREAM ENDED`);
        try {

            console.log(`-> Fetched`, source.length);
            console.log(`-> Saving to GCS`);

            const bucket = storage.bucket(process.env.STORAGE_BUCKET_RAW);
            const fileName = document.organization + '/' + document.key + '.raw';
            const file = bucket.file(fileName);

            const response = {
                uri: 'gs://' + bucket.name + '/' + fileName,
                data: preCleaning(source)
            };

            const options = {
                gzip: true,
                metadata: {
                    metadata: {
                        kind: document.kind,
                        organization: document.organization,
                        username: document.username,
                        key: document.key,
                        url: document.location.href
                    }
                }
            };

            file.save(response.data, options)
                .then(() => {
                    console.log(`-> Saved to GCS`);
                    resolve();
                })
                .catch(err => {
                    reject(new AppError(`Error saving raw document to storage (${uri}).`, 500, err));
                });

        } catch (e) {
            console.log('HTTP message: ', e.message);
        }
    });

    res.on('finish', () => {
        console.log(`-----> FINISHED STREAM`);
    });

    res.on('error', (e) => {
        console.log(`Got error: ${e.message}`);
    })

}).on('socket', (socket) => {
    socket.emit('agentRemove');
}).end();

我尝试过不同的库,相同的代码在本地工作,它只是让人感到困惑,而且我的想法已经用完......

0 个答案:

没有答案