使用可变URL循环访问api get请求

时间:2018-02-17 14:50:12

标签: node.js api needle.js

我正在尝试调用Companies House API,获取11月至2月期间注册的公司。我采取的方法是选择一个起始编号(一家在11月注册的公司)和一个结束编号(一家在2月注册的公司),然后循环获取编号介于起始编号和结束编号之间的公司。像这样:

// Requests every company number in [startIdx, stopIdx) from the Companies House API.
// NOTE(review): the loop never waits for a request to settle before starting the next
// one, so all stopIdx - startIdx (152,109) requests are started back-to-back.
var needle = require("needle");
var startIdx = 11059000;
var stopIdx  = 11211109;
// NOTE(review): idx is assigned without var/let/const, which makes it an implicit global
// (and a ReferenceError in strict mode / ES modules).
for(idx = startIdx; idx < stopIdx; idx++)
{
    // NOTE(review): `key` is not defined in this snippet — presumably the API key; confirm.
    needle('get', "https://api.companieshouse.gov.uk/company/"+idx, { 
       username: key,password:"" 
    })
   // The success handler is empty: the response is discarded.
   .then(function(data) {

   })
  // Failures are only logged; nothing retries or stops the loop.
  .catch(function(err) {
    console.log('Call the locksmith!' + err)
  })
}

但是这不起作用,因为它给出了超时或套接字挂断错误。

API目前处于测试阶段,部分功能尚未实施。

2 个答案:

答案 0 :(得分:0)

const needle = require("needle");

const startIdx = 11059000;
const stopIdx  = 11211109;
// Maximum number of requests started together. Starting every request in the range
// at once (as a single Promise.all over the whole range would) is what triggers the
// timeout / socket hang up errors, so the range is processed in small batches.
const batchSize = 5;

// Starts the requests for company numbers in [batchStart, batchStart + batchSize),
// clipped to stopIdx, and returns a promise for the array of their responses.
function fetchBatch(batchStart) {
    const batchEnd = Math.min(batchStart + batchSize, stopIdx);
    const promises = [];
    for (let idx = batchStart; idx < batchEnd; idx++) {
        promises.push(
            needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
                username: key,password:""
            })
        );
    }
    return Promise.all(promises);
}

// Runs the batches one after another, appending each batch's responses to `results`.
// The returned promise resolves with all responses in index order, or rejects with
// the first error (same fail-fast behaviour as a single Promise.all).
function fetchAll(batchStart = startIdx, results = []) {
    if (batchStart >= stopIdx) {
        return Promise.resolve(results);
    }
    return fetchBatch(batchStart).then(batch => {
        results.push(...batch);
        // Only start the next batch once the current one has fully settled.
        return fetchAll(batchStart + batchSize, results);
    });
}

fetchAll().then(results => {console.log(results);}).catch(err => console.log(err));

一个简单的Promise.all实现会有所帮助。

答案 1 :(得分:0)

由于for循环是同步运行的,而您对needle()的调用是异步的、不会阻塞,因此您最终会尝试一次性发起超过100,000个网络请求。这会压垮您的本地计算机或目标服务器,于是开始出现套接字错误。

对于这么多请求,您需要每次只运行X个,使同时进行中的请求不超过X个。为了最大限度地提高性能,您需要自行确定X的取值,因为它取决于目标服务器以及它如何处理大量并发请求。通常可以安全地从5开始,然后逐步增大以测试更高的值。

如果您正在处理数组,则有许多预先构建的选项可以同时运行X请求。最简单的方法是使用预构建的并发管理操作,如Bluebird。或者你可以自己写。您可以在此处查看两者的示例:Make several requests to an API that can only handle 20 request a minute

但是,既然您处理的不是数组,而只是为每个后续请求递增一个数字,我就找不到现成的预构建选项。所以,我写了一个通用函数,您可以传入一个负责递增索引并发起请求的函数:

/**
 * Calls fn() up to cnt times, keeping at most `limit` calls in flight at once.
 *
 * @param {Function} fn - Called with no arguments once per iteration; expected to
 *     return a promise. A plain return value is treated as an already-resolved
 *     promise and a synchronous throw is treated as a rejection.
 * @param {number} limit - Maximum number of calls in flight at once; must be > 0
 *     whenever cnt > 0.
 * @param {number} cnt - Number of times to call fn.
 * @param {Object} [options] - Optional; {continueOnError: true} records failures
 *     in the results array instead of rejecting.
 * @returns {Promise<Array>} Resolves with the results array, indexed by call order.
 *     With continueOnError set, the array contains error values too (presumed to be
 *     instanceof Error so the caller can discern them from regular values).
 *     Without it, the promise rejects with the first error and no further calls to
 *     fn are started. Rejects with a RangeError if limit is not > 0 while cnt > 0.
 */
function runN(fn, limit, cnt, options = {}) {
    const continueOnError = Boolean(options && options.continueOnError);

    return new Promise((resolve, reject) => {
        // A non-positive (or NaN) limit could never start a call, so the returned
        // promise would stay pending forever; fail loudly instead.
        if (cnt > 0 && !(limit > 0)) {
            reject(new RangeError("runN: limit must be greater than 0"));
            return;
        }

        const results = [];
        let inFlightCntr = 0;
        let cntr = 0;
        let settled = false;    // true once the outer promise has resolved/rejected

        function finish(settle, value) {
            settled = true;
            settle(value);
        }

        // Stores the outcome of one call and tops the pool back up.
        function record(resultIndex, value) {
            --inFlightCntr;
            results[resultIndex] = value;
            run();          // run any more that still need to be run
        }

        function start(resultIndex) {
            let pending;
            try {
                pending = Promise.resolve(fn());
            } catch (err) {
                pending = Promise.reject(err);
            }
            // Two-argument then(): the rejection handler only ever sees failures of
            // fn itself, so the counters are adjusted exactly once per call.
            pending.then(result => {
                if (!settled) {
                    record(resultIndex, result);
                }
            }, err => {
                if (settled) {
                    return;
                }
                if (continueOnError) {
                    // assumes error is instanceof Error so caller can tell the
                    // difference between a genuine result and an error
                    record(resultIndex, err);
                } else {
                    finish(reject, err);
                }
            });
        }

        function run() {
            // Checking `settled` stops new calls from being launched after a rejection.
            while (!settled && inFlightCntr < limit && cntr < cnt) {
                ++inFlightCntr;
                start(cntr++);
            }
            // Done when nothing is left to start and nothing is still in flight.
            if (!settled && inFlightCntr === 0 && !(cntr < cnt)) {
                finish(resolve, results);
            }
        }
        run();
    });
}

然后,您可以这样使用:

const needle = require("needle");

const numConcurrent = 5;
const startIdx = 11059000;
const stopIdx  = 11211109;
// Number of company numbers to request, counting both startIdx and stopIdx.
const totalRequests = stopIdx - startIdx + 1;
let idxCntr = startIdx;

// Issues the GET request for the next company number; every call consumes one index.
const requestNextCompany = () => {
    const idx = idxCntr;
    idxCntr += 1;
    return needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
        username: key,
        password: ""
    });
};

// Keep numConcurrent requests in flight; failed requests are stored in the results
// array rather than aborting the run.
runN(requestNextCompany, numConcurrent, totalRequests, {continueOnError: true})
    .then(results => {
        console.log(results);
    })
    .catch(err => {
        console.log(err);
    });

为了最大限度地减少内存使用,您可以在needle()调用上附加一个.then()处理程序,把响应精简为最终数组中真正需要的内容:

const needle = require("needle");

const numConcurrent = 5;
const startIdx = 11059000;
const stopIdx  = 11211109;
// Number of company numbers to request, counting both startIdx and stopIdx.
const totalRequests = stopIdx - startIdx + 1;
let idxCntr = startIdx;

// construct the smallest possible response here and then return it
// to minimize memory use for your 100,000+ requests
const pickResult = response => response.someProperty;

// Issues the GET request for the next company number and keeps only the reduced result.
const requestNextCompany = () => {
    const idx = idxCntr;
    idxCntr += 1;
    const request = needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
        username: key,
        password: ""
    });
    return request.then(pickResult);
};

// Keep numConcurrent requests in flight; failed requests are stored in the results
// array rather than aborting the run.
runN(requestNextCompany, numConcurrent, totalRequests, {continueOnError: true})
    .then(results => {
        console.log(results);
    })
    .catch(err => {
        console.log(err);
    });